18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci *  linux/mm/swapfile.c
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
68c2ecf20Sopenharmony_ci *  Swap reorganised 29.12.95, Stephen Tweedie
78c2ecf20Sopenharmony_ci */
88c2ecf20Sopenharmony_ci
98c2ecf20Sopenharmony_ci#include <linux/mm.h>
108c2ecf20Sopenharmony_ci#include <linux/sched/mm.h>
118c2ecf20Sopenharmony_ci#include <linux/sched/task.h>
128c2ecf20Sopenharmony_ci#include <linux/hugetlb.h>
138c2ecf20Sopenharmony_ci#include <linux/mman.h>
148c2ecf20Sopenharmony_ci#include <linux/slab.h>
158c2ecf20Sopenharmony_ci#include <linux/kernel_stat.h>
168c2ecf20Sopenharmony_ci#include <linux/swap.h>
178c2ecf20Sopenharmony_ci#include <linux/vmalloc.h>
188c2ecf20Sopenharmony_ci#include <linux/pagemap.h>
198c2ecf20Sopenharmony_ci#include <linux/namei.h>
208c2ecf20Sopenharmony_ci#include <linux/shmem_fs.h>
218c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
228c2ecf20Sopenharmony_ci#include <linux/random.h>
238c2ecf20Sopenharmony_ci#include <linux/writeback.h>
248c2ecf20Sopenharmony_ci#include <linux/proc_fs.h>
258c2ecf20Sopenharmony_ci#include <linux/seq_file.h>
268c2ecf20Sopenharmony_ci#include <linux/init.h>
278c2ecf20Sopenharmony_ci#include <linux/ksm.h>
288c2ecf20Sopenharmony_ci#include <linux/rmap.h>
298c2ecf20Sopenharmony_ci#include <linux/security.h>
308c2ecf20Sopenharmony_ci#include <linux/backing-dev.h>
318c2ecf20Sopenharmony_ci#include <linux/mutex.h>
328c2ecf20Sopenharmony_ci#include <linux/capability.h>
338c2ecf20Sopenharmony_ci#include <linux/syscalls.h>
348c2ecf20Sopenharmony_ci#include <linux/memcontrol.h>
358c2ecf20Sopenharmony_ci#include <linux/poll.h>
368c2ecf20Sopenharmony_ci#include <linux/oom.h>
378c2ecf20Sopenharmony_ci#include <linux/frontswap.h>
388c2ecf20Sopenharmony_ci#include <linux/swapfile.h>
398c2ecf20Sopenharmony_ci#include <linux/export.h>
408c2ecf20Sopenharmony_ci#include <linux/swap_slots.h>
418c2ecf20Sopenharmony_ci#include <linux/sort.h>
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci#include <asm/tlbflush.h>
448c2ecf20Sopenharmony_ci#include <linux/swapops.h>
458c2ecf20Sopenharmony_ci#include <linux/swap_cgroup.h>
468c2ecf20Sopenharmony_ci#include <linux/zswapd.h>
478c2ecf20Sopenharmony_ci
488c2ecf20Sopenharmony_cistatic bool swap_count_continued(struct swap_info_struct *, pgoff_t,
498c2ecf20Sopenharmony_ci				 unsigned char);
508c2ecf20Sopenharmony_cistatic void free_swap_count_continuations(struct swap_info_struct *);
518c2ecf20Sopenharmony_cistatic sector_t map_swap_entry(swp_entry_t, struct block_device**);
528c2ecf20Sopenharmony_ci
/*
 * Global swap spinlock.  The comments in this file state that it protects
 * total_swap_pages and the swap_active_head list, and that it must be taken
 * before any swap_info_struct->lock.
 */
DEFINE_SPINLOCK(swap_lock);
/* Bound checked by swap_type_to_swap_info() before indexing swap_info[]. */
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
/* NOTE(review): starts at -1; its users are outside this part of the file. */
static int least_priority = -1;
658c2ecf20Sopenharmony_ci
/*
 * Message prefixes describing bad or unused swap entries.
 * NOTE(review): presumably consumed by diagnostics further down the file;
 * no user is visible in this part.
 */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
708c2ecf20Sopenharmony_ci
/*
 * All active swap_info_structs, protected by swap_lock and ordered by
 * priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * All available (active, not full) swap_info_structs, protected by
 * swap_avail_lock and ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);
918c2ecf20Sopenharmony_ci
/*
 * Table of swap_info_struct pointers indexed by swap type; read without a
 * lock by swap_type_to_swap_info().
 */
struct swap_info_struct *swap_info[MAX_SWAPFILES];

/* NOTE(review): users of this mutex are outside this part of the file. */
static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

/* NOTE(review): starts at 0; updated and read outside this part of the file. */
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_cistatic struct swap_info_struct *swap_type_to_swap_info(int type)
1038c2ecf20Sopenharmony_ci{
1048c2ecf20Sopenharmony_ci	if (type >= READ_ONCE(nr_swapfiles))
1058c2ecf20Sopenharmony_ci		return NULL;
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci	smp_rmb();	/* Pairs with smp_wmb in alloc_swap_info. */
1088c2ecf20Sopenharmony_ci	return READ_ONCE(swap_info[type]);
1098c2ecf20Sopenharmony_ci}
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_cistatic inline unsigned char swap_count(unsigned char ent)
1128c2ecf20Sopenharmony_ci{
1138c2ecf20Sopenharmony_ci	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
1148c2ecf20Sopenharmony_ci}
1158c2ecf20Sopenharmony_ci
/*
 * Flag bits for the @flags argument of __try_to_reclaim_swap(); they may
 * be combined, and any one satisfied condition triggers the reclaim.
 */
/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4
1258c2ecf20Sopenharmony_ci
1268c2ecf20Sopenharmony_ci/* returns 1 if swap entry is freed */
1278c2ecf20Sopenharmony_cistatic int __try_to_reclaim_swap(struct swap_info_struct *si,
1288c2ecf20Sopenharmony_ci				 unsigned long offset, unsigned long flags)
1298c2ecf20Sopenharmony_ci{
1308c2ecf20Sopenharmony_ci	swp_entry_t entry = swp_entry(si->type, offset);
1318c2ecf20Sopenharmony_ci	struct page *page;
1328c2ecf20Sopenharmony_ci	int ret = 0;
1338c2ecf20Sopenharmony_ci
1348c2ecf20Sopenharmony_ci	page = find_get_page(swap_address_space(entry), offset);
1358c2ecf20Sopenharmony_ci	if (!page)
1368c2ecf20Sopenharmony_ci		return 0;
1378c2ecf20Sopenharmony_ci	/*
1388c2ecf20Sopenharmony_ci	 * When this function is called from scan_swap_map_slots() and it's
1398c2ecf20Sopenharmony_ci	 * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
1408c2ecf20Sopenharmony_ci	 * here. We have to use trylock for avoiding deadlock. This is a special
1418c2ecf20Sopenharmony_ci	 * case and you should use try_to_free_swap() with explicit lock_page()
1428c2ecf20Sopenharmony_ci	 * in usual operations.
1438c2ecf20Sopenharmony_ci	 */
1448c2ecf20Sopenharmony_ci	if (trylock_page(page)) {
1458c2ecf20Sopenharmony_ci		if ((flags & TTRS_ANYWAY) ||
1468c2ecf20Sopenharmony_ci		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
1478c2ecf20Sopenharmony_ci		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
1488c2ecf20Sopenharmony_ci			ret = try_to_free_swap(page);
1498c2ecf20Sopenharmony_ci		unlock_page(page);
1508c2ecf20Sopenharmony_ci	}
1518c2ecf20Sopenharmony_ci	put_page(page);
1528c2ecf20Sopenharmony_ci	return ret;
1538c2ecf20Sopenharmony_ci}
1548c2ecf20Sopenharmony_ci
1558c2ecf20Sopenharmony_cistatic inline struct swap_extent *first_se(struct swap_info_struct *sis)
1568c2ecf20Sopenharmony_ci{
1578c2ecf20Sopenharmony_ci	struct rb_node *rb = rb_first(&sis->swap_extent_root);
1588c2ecf20Sopenharmony_ci	return rb_entry(rb, struct swap_extent, rb_node);
1598c2ecf20Sopenharmony_ci}
1608c2ecf20Sopenharmony_ci
1618c2ecf20Sopenharmony_cistatic inline struct swap_extent *next_se(struct swap_extent *se)
1628c2ecf20Sopenharmony_ci{
1638c2ecf20Sopenharmony_ci	struct rb_node *rb = rb_next(&se->rb_node);
1648c2ecf20Sopenharmony_ci	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
1658c2ecf20Sopenharmony_ci}
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ci/*
1688c2ecf20Sopenharmony_ci * swapon tell device that all the old swap contents can be discarded,
1698c2ecf20Sopenharmony_ci * to allow the swap device to optimize its wear-levelling.
1708c2ecf20Sopenharmony_ci */
1718c2ecf20Sopenharmony_cistatic int discard_swap(struct swap_info_struct *si)
1728c2ecf20Sopenharmony_ci{
1738c2ecf20Sopenharmony_ci	struct swap_extent *se;
1748c2ecf20Sopenharmony_ci	sector_t start_block;
1758c2ecf20Sopenharmony_ci	sector_t nr_blocks;
1768c2ecf20Sopenharmony_ci	int err = 0;
1778c2ecf20Sopenharmony_ci
1788c2ecf20Sopenharmony_ci	/* Do not discard the swap header page! */
1798c2ecf20Sopenharmony_ci	se = first_se(si);
1808c2ecf20Sopenharmony_ci	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
1818c2ecf20Sopenharmony_ci	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
1828c2ecf20Sopenharmony_ci	if (nr_blocks) {
1838c2ecf20Sopenharmony_ci		err = blkdev_issue_discard(si->bdev, start_block,
1848c2ecf20Sopenharmony_ci				nr_blocks, GFP_KERNEL, 0);
1858c2ecf20Sopenharmony_ci		if (err)
1868c2ecf20Sopenharmony_ci			return err;
1878c2ecf20Sopenharmony_ci		cond_resched();
1888c2ecf20Sopenharmony_ci	}
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci	for (se = next_se(se); se; se = next_se(se)) {
1918c2ecf20Sopenharmony_ci		start_block = se->start_block << (PAGE_SHIFT - 9);
1928c2ecf20Sopenharmony_ci		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
1938c2ecf20Sopenharmony_ci
1948c2ecf20Sopenharmony_ci		err = blkdev_issue_discard(si->bdev, start_block,
1958c2ecf20Sopenharmony_ci				nr_blocks, GFP_KERNEL, 0);
1968c2ecf20Sopenharmony_ci		if (err)
1978c2ecf20Sopenharmony_ci			break;
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_ci		cond_resched();
2008c2ecf20Sopenharmony_ci	}
2018c2ecf20Sopenharmony_ci	return err;		/* That will often be -EOPNOTSUPP */
2028c2ecf20Sopenharmony_ci}
2038c2ecf20Sopenharmony_ci
2048c2ecf20Sopenharmony_cistatic struct swap_extent *
2058c2ecf20Sopenharmony_cioffset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
2068c2ecf20Sopenharmony_ci{
2078c2ecf20Sopenharmony_ci	struct swap_extent *se;
2088c2ecf20Sopenharmony_ci	struct rb_node *rb;
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci	rb = sis->swap_extent_root.rb_node;
2118c2ecf20Sopenharmony_ci	while (rb) {
2128c2ecf20Sopenharmony_ci		se = rb_entry(rb, struct swap_extent, rb_node);
2138c2ecf20Sopenharmony_ci		if (offset < se->start_page)
2148c2ecf20Sopenharmony_ci			rb = rb->rb_left;
2158c2ecf20Sopenharmony_ci		else if (offset >= se->start_page + se->nr_pages)
2168c2ecf20Sopenharmony_ci			rb = rb->rb_right;
2178c2ecf20Sopenharmony_ci		else
2188c2ecf20Sopenharmony_ci			return se;
2198c2ecf20Sopenharmony_ci	}
2208c2ecf20Sopenharmony_ci	/* It *must* be present */
2218c2ecf20Sopenharmony_ci	BUG();
2228c2ecf20Sopenharmony_ci}
2238c2ecf20Sopenharmony_ci
2248c2ecf20Sopenharmony_cisector_t swap_page_sector(struct page *page)
2258c2ecf20Sopenharmony_ci{
2268c2ecf20Sopenharmony_ci	struct swap_info_struct *sis = page_swap_info(page);
2278c2ecf20Sopenharmony_ci	struct swap_extent *se;
2288c2ecf20Sopenharmony_ci	sector_t sector;
2298c2ecf20Sopenharmony_ci	pgoff_t offset;
2308c2ecf20Sopenharmony_ci
2318c2ecf20Sopenharmony_ci	offset = __page_file_index(page);
2328c2ecf20Sopenharmony_ci	se = offset_to_swap_extent(sis, offset);
2338c2ecf20Sopenharmony_ci	sector = se->start_block + (offset - se->start_page);
2348c2ecf20Sopenharmony_ci	return sector << (PAGE_SHIFT - 9);
2358c2ecf20Sopenharmony_ci}
2368c2ecf20Sopenharmony_ci
2378c2ecf20Sopenharmony_ci/*
2388c2ecf20Sopenharmony_ci * swap allocation tell device that a cluster of swap can now be discarded,
2398c2ecf20Sopenharmony_ci * to allow the swap device to optimize its wear-levelling.
2408c2ecf20Sopenharmony_ci */
2418c2ecf20Sopenharmony_cistatic void discard_swap_cluster(struct swap_info_struct *si,
2428c2ecf20Sopenharmony_ci				 pgoff_t start_page, pgoff_t nr_pages)
2438c2ecf20Sopenharmony_ci{
2448c2ecf20Sopenharmony_ci	struct swap_extent *se = offset_to_swap_extent(si, start_page);
2458c2ecf20Sopenharmony_ci
2468c2ecf20Sopenharmony_ci	while (nr_pages) {
2478c2ecf20Sopenharmony_ci		pgoff_t offset = start_page - se->start_page;
2488c2ecf20Sopenharmony_ci		sector_t start_block = se->start_block + offset;
2498c2ecf20Sopenharmony_ci		sector_t nr_blocks = se->nr_pages - offset;
2508c2ecf20Sopenharmony_ci
2518c2ecf20Sopenharmony_ci		if (nr_blocks > nr_pages)
2528c2ecf20Sopenharmony_ci			nr_blocks = nr_pages;
2538c2ecf20Sopenharmony_ci		start_page += nr_blocks;
2548c2ecf20Sopenharmony_ci		nr_pages -= nr_blocks;
2558c2ecf20Sopenharmony_ci
2568c2ecf20Sopenharmony_ci		start_block <<= PAGE_SHIFT - 9;
2578c2ecf20Sopenharmony_ci		nr_blocks <<= PAGE_SHIFT - 9;
2588c2ecf20Sopenharmony_ci		if (blkdev_issue_discard(si->bdev, start_block,
2598c2ecf20Sopenharmony_ci					nr_blocks, GFP_NOIO, 0))
2608c2ecf20Sopenharmony_ci			break;
2618c2ecf20Sopenharmony_ci
2628c2ecf20Sopenharmony_ci		se = next_se(se);
2638c2ecf20Sopenharmony_ci	}
2648c2ecf20Sopenharmony_ci}
2658c2ecf20Sopenharmony_ci
/*
 * SWAPFILE_CLUSTER is the number of swap slots grouped into one cluster:
 * swap offsets are divided by it to obtain a cluster index.
 */
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_size() as a constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
/* NOTE(review): users of LATENCY_LIMIT are outside this part of the file. */
#define LATENCY_LIMIT		256
2808c2ecf20Sopenharmony_ci
/* Overwrite (not OR into) the flags field of @info with @flag. */
static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}
2868c2ecf20Sopenharmony_ci
/*
 * Read the data field of @info as a usage count.  The same field is read
 * as a "next" index by cluster_next().
 */
static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}
2918c2ecf20Sopenharmony_ci
/* Store @c as the usage count in the data field of @info. */
static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}
2978c2ecf20Sopenharmony_ci
/* Set both the flags (@f) and the usage count (@c) of @info. */
static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					 unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}
3048c2ecf20Sopenharmony_ci
/*
 * Read the data field of @info as the index of the next cluster.  The same
 * field is read as a usage count by cluster_count().
 */
static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}
3098c2ecf20Sopenharmony_ci
/* Store @n as the next-cluster index in the data field of @info. */
static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}
3158c2ecf20Sopenharmony_ci
/* Set both the flags (@f) and the next-cluster index (@n) of @info. */
static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}
3228c2ecf20Sopenharmony_ci
3238c2ecf20Sopenharmony_cistatic inline bool cluster_is_free(struct swap_cluster_info *info)
3248c2ecf20Sopenharmony_ci{
3258c2ecf20Sopenharmony_ci	return info->flags & CLUSTER_FLAG_FREE;
3268c2ecf20Sopenharmony_ci}
3278c2ecf20Sopenharmony_ci
3288c2ecf20Sopenharmony_cistatic inline bool cluster_is_null(struct swap_cluster_info *info)
3298c2ecf20Sopenharmony_ci{
3308c2ecf20Sopenharmony_ci	return info->flags & CLUSTER_FLAG_NEXT_NULL;
3318c2ecf20Sopenharmony_ci}
3328c2ecf20Sopenharmony_ci
/*
 * Mark @info as a null link: flags become exactly CLUSTER_FLAG_NEXT_NULL
 * and the data field is cleared.
 */
static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}
3388c2ecf20Sopenharmony_ci
3398c2ecf20Sopenharmony_cistatic inline bool cluster_is_huge(struct swap_cluster_info *info)
3408c2ecf20Sopenharmony_ci{
3418c2ecf20Sopenharmony_ci	if (IS_ENABLED(CONFIG_THP_SWAP))
3428c2ecf20Sopenharmony_ci		return info->flags & CLUSTER_FLAG_HUGE;
3438c2ecf20Sopenharmony_ci	return false;
3448c2ecf20Sopenharmony_ci}
3458c2ecf20Sopenharmony_ci
/* Clear CLUSTER_FLAG_HUGE in @info's flags, leaving the other bits alone. */
static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}
3508c2ecf20Sopenharmony_ci
3518c2ecf20Sopenharmony_cistatic inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
3528c2ecf20Sopenharmony_ci						     unsigned long offset)
3538c2ecf20Sopenharmony_ci{
3548c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci	ci = si->cluster_info;
3578c2ecf20Sopenharmony_ci	if (ci) {
3588c2ecf20Sopenharmony_ci		ci += offset / SWAPFILE_CLUSTER;
3598c2ecf20Sopenharmony_ci		spin_lock(&ci->lock);
3608c2ecf20Sopenharmony_ci	}
3618c2ecf20Sopenharmony_ci	return ci;
3628c2ecf20Sopenharmony_ci}
3638c2ecf20Sopenharmony_ci
3648c2ecf20Sopenharmony_cistatic inline void unlock_cluster(struct swap_cluster_info *ci)
3658c2ecf20Sopenharmony_ci{
3668c2ecf20Sopenharmony_ci	if (ci)
3678c2ecf20Sopenharmony_ci		spin_unlock(&ci->lock);
3688c2ecf20Sopenharmony_ci}
3698c2ecf20Sopenharmony_ci
3708c2ecf20Sopenharmony_ci/*
3718c2ecf20Sopenharmony_ci * Determine the locking method in use for this device.  Return
3728c2ecf20Sopenharmony_ci * swap_cluster_info if SSD-style cluster-based locking is in place.
3738c2ecf20Sopenharmony_ci */
3748c2ecf20Sopenharmony_cistatic inline struct swap_cluster_info *lock_cluster_or_swap_info(
3758c2ecf20Sopenharmony_ci		struct swap_info_struct *si, unsigned long offset)
3768c2ecf20Sopenharmony_ci{
3778c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
3788c2ecf20Sopenharmony_ci
3798c2ecf20Sopenharmony_ci	/* Try to use fine-grained SSD-style locking if available: */
3808c2ecf20Sopenharmony_ci	ci = lock_cluster(si, offset);
3818c2ecf20Sopenharmony_ci	/* Otherwise, fall back to traditional, coarse locking: */
3828c2ecf20Sopenharmony_ci	if (!ci)
3838c2ecf20Sopenharmony_ci		spin_lock(&si->lock);
3848c2ecf20Sopenharmony_ci
3858c2ecf20Sopenharmony_ci	return ci;
3868c2ecf20Sopenharmony_ci}
3878c2ecf20Sopenharmony_ci
3888c2ecf20Sopenharmony_cistatic inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
3898c2ecf20Sopenharmony_ci					       struct swap_cluster_info *ci)
3908c2ecf20Sopenharmony_ci{
3918c2ecf20Sopenharmony_ci	if (ci)
3928c2ecf20Sopenharmony_ci		unlock_cluster(ci);
3938c2ecf20Sopenharmony_ci	else
3948c2ecf20Sopenharmony_ci		spin_unlock(&si->lock);
3958c2ecf20Sopenharmony_ci}
3968c2ecf20Sopenharmony_ci
/* A cluster list is empty when its head link is null. */
static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}
4018c2ecf20Sopenharmony_ci
/*
 * Return the cluster index stored in the list head.  The head is not
 * checked for the null state here.
 */
static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}
4068c2ecf20Sopenharmony_ci
/* Reset @list to the empty state by nulling both its head and tail links. */
static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}
4128c2ecf20Sopenharmony_ci
4138c2ecf20Sopenharmony_cistatic void cluster_list_add_tail(struct swap_cluster_list *list,
4148c2ecf20Sopenharmony_ci				  struct swap_cluster_info *ci,
4158c2ecf20Sopenharmony_ci				  unsigned int idx)
4168c2ecf20Sopenharmony_ci{
4178c2ecf20Sopenharmony_ci	if (cluster_list_empty(list)) {
4188c2ecf20Sopenharmony_ci		cluster_set_next_flag(&list->head, idx, 0);
4198c2ecf20Sopenharmony_ci		cluster_set_next_flag(&list->tail, idx, 0);
4208c2ecf20Sopenharmony_ci	} else {
4218c2ecf20Sopenharmony_ci		struct swap_cluster_info *ci_tail;
4228c2ecf20Sopenharmony_ci		unsigned int tail = cluster_next(&list->tail);
4238c2ecf20Sopenharmony_ci
4248c2ecf20Sopenharmony_ci		/*
4258c2ecf20Sopenharmony_ci		 * Nested cluster lock, but both cluster locks are
4268c2ecf20Sopenharmony_ci		 * only acquired when we held swap_info_struct->lock
4278c2ecf20Sopenharmony_ci		 */
4288c2ecf20Sopenharmony_ci		ci_tail = ci + tail;
4298c2ecf20Sopenharmony_ci		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
4308c2ecf20Sopenharmony_ci		cluster_set_next(ci_tail, idx);
4318c2ecf20Sopenharmony_ci		spin_unlock(&ci_tail->lock);
4328c2ecf20Sopenharmony_ci		cluster_set_next_flag(&list->tail, idx, 0);
4338c2ecf20Sopenharmony_ci	}
4348c2ecf20Sopenharmony_ci}
4358c2ecf20Sopenharmony_ci
4368c2ecf20Sopenharmony_cistatic unsigned int cluster_list_del_first(struct swap_cluster_list *list,
4378c2ecf20Sopenharmony_ci					   struct swap_cluster_info *ci)
4388c2ecf20Sopenharmony_ci{
4398c2ecf20Sopenharmony_ci	unsigned int idx;
4408c2ecf20Sopenharmony_ci
4418c2ecf20Sopenharmony_ci	idx = cluster_next(&list->head);
4428c2ecf20Sopenharmony_ci	if (cluster_next(&list->tail) == idx) {
4438c2ecf20Sopenharmony_ci		cluster_set_null(&list->head);
4448c2ecf20Sopenharmony_ci		cluster_set_null(&list->tail);
4458c2ecf20Sopenharmony_ci	} else
4468c2ecf20Sopenharmony_ci		cluster_set_next_flag(&list->head,
4478c2ecf20Sopenharmony_ci				      cluster_next(&ci[idx]), 0);
4488c2ecf20Sopenharmony_ci
4498c2ecf20Sopenharmony_ci	return idx;
4508c2ecf20Sopenharmony_ci}
4518c2ecf20Sopenharmony_ci
/*
 * Add cluster @idx to @si's discard list and schedule the discard work
 * for it.
 */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied).
	 * They are cleared again after the discard, in
	 * swap_do_scheduled_discard().
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}
4698c2ecf20Sopenharmony_ci
4708c2ecf20Sopenharmony_cistatic void __free_cluster(struct swap_info_struct *si, unsigned long idx)
4718c2ecf20Sopenharmony_ci{
4728c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci = si->cluster_info;
4738c2ecf20Sopenharmony_ci
4748c2ecf20Sopenharmony_ci	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
4758c2ecf20Sopenharmony_ci	cluster_list_add_tail(&si->free_clusters, ci, idx);
4768c2ecf20Sopenharmony_ci}
4778c2ecf20Sopenharmony_ci
4788c2ecf20Sopenharmony_ci/*
4798c2ecf20Sopenharmony_ci * Doing discard actually. After a cluster discard is finished, the cluster
4808c2ecf20Sopenharmony_ci * will be added to free cluster list. caller should hold si->lock.
4818c2ecf20Sopenharmony_ci*/
4828c2ecf20Sopenharmony_cistatic void swap_do_scheduled_discard(struct swap_info_struct *si)
4838c2ecf20Sopenharmony_ci{
4848c2ecf20Sopenharmony_ci	struct swap_cluster_info *info, *ci;
4858c2ecf20Sopenharmony_ci	unsigned int idx;
4868c2ecf20Sopenharmony_ci
4878c2ecf20Sopenharmony_ci	info = si->cluster_info;
4888c2ecf20Sopenharmony_ci
4898c2ecf20Sopenharmony_ci	while (!cluster_list_empty(&si->discard_clusters)) {
4908c2ecf20Sopenharmony_ci		idx = cluster_list_del_first(&si->discard_clusters, info);
4918c2ecf20Sopenharmony_ci		spin_unlock(&si->lock);
4928c2ecf20Sopenharmony_ci
4938c2ecf20Sopenharmony_ci		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
4948c2ecf20Sopenharmony_ci				SWAPFILE_CLUSTER);
4958c2ecf20Sopenharmony_ci
4968c2ecf20Sopenharmony_ci		spin_lock(&si->lock);
4978c2ecf20Sopenharmony_ci		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
4988c2ecf20Sopenharmony_ci		__free_cluster(si, idx);
4998c2ecf20Sopenharmony_ci		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
5008c2ecf20Sopenharmony_ci				0, SWAPFILE_CLUSTER);
5018c2ecf20Sopenharmony_ci		unlock_cluster(ci);
5028c2ecf20Sopenharmony_ci	}
5038c2ecf20Sopenharmony_ci}
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_cistatic void swap_discard_work(struct work_struct *work)
5068c2ecf20Sopenharmony_ci{
5078c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
5088c2ecf20Sopenharmony_ci
5098c2ecf20Sopenharmony_ci	si = container_of(work, struct swap_info_struct, discard_work);
5108c2ecf20Sopenharmony_ci
5118c2ecf20Sopenharmony_ci	spin_lock(&si->lock);
5128c2ecf20Sopenharmony_ci	swap_do_scheduled_discard(si);
5138c2ecf20Sopenharmony_ci	spin_unlock(&si->lock);
5148c2ecf20Sopenharmony_ci}
5158c2ecf20Sopenharmony_ci
5168c2ecf20Sopenharmony_cistatic void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
5178c2ecf20Sopenharmony_ci{
5188c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci = si->cluster_info;
5198c2ecf20Sopenharmony_ci
5208c2ecf20Sopenharmony_ci	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
5218c2ecf20Sopenharmony_ci	cluster_list_del_first(&si->free_clusters, ci);
5228c2ecf20Sopenharmony_ci	cluster_set_count_flag(ci + idx, 0, 0);
5238c2ecf20Sopenharmony_ci}
5248c2ecf20Sopenharmony_ci
5258c2ecf20Sopenharmony_cistatic void free_cluster(struct swap_info_struct *si, unsigned long idx)
5268c2ecf20Sopenharmony_ci{
5278c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci = si->cluster_info + idx;
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_ci	VM_BUG_ON(cluster_count(ci) != 0);
5308c2ecf20Sopenharmony_ci	/*
5318c2ecf20Sopenharmony_ci	 * If the swap is discardable, prepare discard the cluster
5328c2ecf20Sopenharmony_ci	 * instead of free it immediately. The cluster will be freed
5338c2ecf20Sopenharmony_ci	 * after discard.
5348c2ecf20Sopenharmony_ci	 */
5358c2ecf20Sopenharmony_ci	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
5368c2ecf20Sopenharmony_ci	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
5378c2ecf20Sopenharmony_ci		swap_cluster_schedule_discard(si, idx);
5388c2ecf20Sopenharmony_ci		return;
5398c2ecf20Sopenharmony_ci	}
5408c2ecf20Sopenharmony_ci
5418c2ecf20Sopenharmony_ci	__free_cluster(si, idx);
5428c2ecf20Sopenharmony_ci}
5438c2ecf20Sopenharmony_ci
5448c2ecf20Sopenharmony_ci/*
5458c2ecf20Sopenharmony_ci * The cluster corresponding to page_nr will be used. The cluster will be
5468c2ecf20Sopenharmony_ci * removed from free cluster list and its usage counter will be increased.
5478c2ecf20Sopenharmony_ci */
5488c2ecf20Sopenharmony_cistatic void inc_cluster_info_page(struct swap_info_struct *p,
5498c2ecf20Sopenharmony_ci	struct swap_cluster_info *cluster_info, unsigned long page_nr)
5508c2ecf20Sopenharmony_ci{
5518c2ecf20Sopenharmony_ci	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
5528c2ecf20Sopenharmony_ci
5538c2ecf20Sopenharmony_ci	if (!cluster_info)
5548c2ecf20Sopenharmony_ci		return;
5558c2ecf20Sopenharmony_ci	if (cluster_is_free(&cluster_info[idx]))
5568c2ecf20Sopenharmony_ci		alloc_cluster(p, idx);
5578c2ecf20Sopenharmony_ci
5588c2ecf20Sopenharmony_ci	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
5598c2ecf20Sopenharmony_ci	cluster_set_count(&cluster_info[idx],
5608c2ecf20Sopenharmony_ci		cluster_count(&cluster_info[idx]) + 1);
5618c2ecf20Sopenharmony_ci}
5628c2ecf20Sopenharmony_ci
5638c2ecf20Sopenharmony_ci/*
5648c2ecf20Sopenharmony_ci * The cluster corresponding to page_nr decreases one usage. If the usage
5658c2ecf20Sopenharmony_ci * counter becomes 0, which means no page in the cluster is in using, we can
5668c2ecf20Sopenharmony_ci * optionally discard the cluster and add it to free cluster list.
5678c2ecf20Sopenharmony_ci */
5688c2ecf20Sopenharmony_cistatic void dec_cluster_info_page(struct swap_info_struct *p,
5698c2ecf20Sopenharmony_ci	struct swap_cluster_info *cluster_info, unsigned long page_nr)
5708c2ecf20Sopenharmony_ci{
5718c2ecf20Sopenharmony_ci	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
5728c2ecf20Sopenharmony_ci
5738c2ecf20Sopenharmony_ci	if (!cluster_info)
5748c2ecf20Sopenharmony_ci		return;
5758c2ecf20Sopenharmony_ci
5768c2ecf20Sopenharmony_ci	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
5778c2ecf20Sopenharmony_ci	cluster_set_count(&cluster_info[idx],
5788c2ecf20Sopenharmony_ci		cluster_count(&cluster_info[idx]) - 1);
5798c2ecf20Sopenharmony_ci
5808c2ecf20Sopenharmony_ci	if (cluster_count(&cluster_info[idx]) == 0)
5818c2ecf20Sopenharmony_ci		free_cluster(p, idx);
5828c2ecf20Sopenharmony_ci}
5838c2ecf20Sopenharmony_ci
5848c2ecf20Sopenharmony_ci/*
5858c2ecf20Sopenharmony_ci * It's possible scan_swap_map() uses a free cluster in the middle of free
5868c2ecf20Sopenharmony_ci * cluster list. Avoiding such abuse to avoid list corruption.
5878c2ecf20Sopenharmony_ci */
5888c2ecf20Sopenharmony_cistatic bool
5898c2ecf20Sopenharmony_ciscan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
5908c2ecf20Sopenharmony_ci	unsigned long offset)
5918c2ecf20Sopenharmony_ci{
5928c2ecf20Sopenharmony_ci	struct percpu_cluster *percpu_cluster;
5938c2ecf20Sopenharmony_ci	bool conflict;
5948c2ecf20Sopenharmony_ci
5958c2ecf20Sopenharmony_ci	offset /= SWAPFILE_CLUSTER;
5968c2ecf20Sopenharmony_ci	conflict = !cluster_list_empty(&si->free_clusters) &&
5978c2ecf20Sopenharmony_ci		offset != cluster_list_first(&si->free_clusters) &&
5988c2ecf20Sopenharmony_ci		cluster_is_free(&si->cluster_info[offset]);
5998c2ecf20Sopenharmony_ci
6008c2ecf20Sopenharmony_ci	if (!conflict)
6018c2ecf20Sopenharmony_ci		return false;
6028c2ecf20Sopenharmony_ci
6038c2ecf20Sopenharmony_ci	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
6048c2ecf20Sopenharmony_ci	cluster_set_null(&percpu_cluster->index);
6058c2ecf20Sopenharmony_ci	return true;
6068c2ecf20Sopenharmony_ci}
6078c2ecf20Sopenharmony_ci
6088c2ecf20Sopenharmony_ci/*
6098c2ecf20Sopenharmony_ci * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
6108c2ecf20Sopenharmony_ci * might involve allocating a new cluster for current CPU too.
6118c2ecf20Sopenharmony_ci */
6128c2ecf20Sopenharmony_cistatic bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
6138c2ecf20Sopenharmony_ci	unsigned long *offset, unsigned long *scan_base)
6148c2ecf20Sopenharmony_ci{
6158c2ecf20Sopenharmony_ci	struct percpu_cluster *cluster;
6168c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
6178c2ecf20Sopenharmony_ci	unsigned long tmp, max;
6188c2ecf20Sopenharmony_ci
6198c2ecf20Sopenharmony_cinew_cluster:
6208c2ecf20Sopenharmony_ci	cluster = this_cpu_ptr(si->percpu_cluster);
6218c2ecf20Sopenharmony_ci	if (cluster_is_null(&cluster->index)) {
6228c2ecf20Sopenharmony_ci		if (!cluster_list_empty(&si->free_clusters)) {
6238c2ecf20Sopenharmony_ci			cluster->index = si->free_clusters.head;
6248c2ecf20Sopenharmony_ci			cluster->next = cluster_next(&cluster->index) *
6258c2ecf20Sopenharmony_ci					SWAPFILE_CLUSTER;
6268c2ecf20Sopenharmony_ci		} else if (!cluster_list_empty(&si->discard_clusters)) {
6278c2ecf20Sopenharmony_ci			/*
6288c2ecf20Sopenharmony_ci			 * we don't have free cluster but have some clusters in
6298c2ecf20Sopenharmony_ci			 * discarding, do discard now and reclaim them, then
6308c2ecf20Sopenharmony_ci			 * reread cluster_next_cpu since we dropped si->lock
6318c2ecf20Sopenharmony_ci			 */
6328c2ecf20Sopenharmony_ci			swap_do_scheduled_discard(si);
6338c2ecf20Sopenharmony_ci			*scan_base = this_cpu_read(*si->cluster_next_cpu);
6348c2ecf20Sopenharmony_ci			*offset = *scan_base;
6358c2ecf20Sopenharmony_ci			goto new_cluster;
6368c2ecf20Sopenharmony_ci		} else
6378c2ecf20Sopenharmony_ci			return false;
6388c2ecf20Sopenharmony_ci	}
6398c2ecf20Sopenharmony_ci
6408c2ecf20Sopenharmony_ci	/*
6418c2ecf20Sopenharmony_ci	 * Other CPUs can use our cluster if they can't find a free cluster,
6428c2ecf20Sopenharmony_ci	 * check if there is still free entry in the cluster
6438c2ecf20Sopenharmony_ci	 */
6448c2ecf20Sopenharmony_ci	tmp = cluster->next;
6458c2ecf20Sopenharmony_ci	max = min_t(unsigned long, si->max,
6468c2ecf20Sopenharmony_ci		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
6478c2ecf20Sopenharmony_ci	if (tmp < max) {
6488c2ecf20Sopenharmony_ci		ci = lock_cluster(si, tmp);
6498c2ecf20Sopenharmony_ci		while (tmp < max) {
6508c2ecf20Sopenharmony_ci			if (!si->swap_map[tmp])
6518c2ecf20Sopenharmony_ci				break;
6528c2ecf20Sopenharmony_ci			tmp++;
6538c2ecf20Sopenharmony_ci		}
6548c2ecf20Sopenharmony_ci		unlock_cluster(ci);
6558c2ecf20Sopenharmony_ci	}
6568c2ecf20Sopenharmony_ci	if (tmp >= max) {
6578c2ecf20Sopenharmony_ci		cluster_set_null(&cluster->index);
6588c2ecf20Sopenharmony_ci		goto new_cluster;
6598c2ecf20Sopenharmony_ci	}
6608c2ecf20Sopenharmony_ci	cluster->next = tmp + 1;
6618c2ecf20Sopenharmony_ci	*offset = tmp;
6628c2ecf20Sopenharmony_ci	*scan_base = tmp;
6638c2ecf20Sopenharmony_ci	return true;
6648c2ecf20Sopenharmony_ci}
6658c2ecf20Sopenharmony_ci
6668c2ecf20Sopenharmony_cistatic void __del_from_avail_list(struct swap_info_struct *p)
6678c2ecf20Sopenharmony_ci{
6688c2ecf20Sopenharmony_ci	int nid;
6698c2ecf20Sopenharmony_ci
6708c2ecf20Sopenharmony_ci	assert_spin_locked(&p->lock);
6718c2ecf20Sopenharmony_ci	for_each_node(nid)
6728c2ecf20Sopenharmony_ci		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
6738c2ecf20Sopenharmony_ci}
6748c2ecf20Sopenharmony_ci
6758c2ecf20Sopenharmony_cistatic void del_from_avail_list(struct swap_info_struct *p)
6768c2ecf20Sopenharmony_ci{
6778c2ecf20Sopenharmony_ci	spin_lock(&swap_avail_lock);
6788c2ecf20Sopenharmony_ci	__del_from_avail_list(p);
6798c2ecf20Sopenharmony_ci	spin_unlock(&swap_avail_lock);
6808c2ecf20Sopenharmony_ci}
6818c2ecf20Sopenharmony_ci
6828c2ecf20Sopenharmony_cistatic void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
6838c2ecf20Sopenharmony_ci			     unsigned int nr_entries)
6848c2ecf20Sopenharmony_ci{
6858c2ecf20Sopenharmony_ci	unsigned int end = offset + nr_entries - 1;
6868c2ecf20Sopenharmony_ci
6878c2ecf20Sopenharmony_ci	if (offset == si->lowest_bit)
6888c2ecf20Sopenharmony_ci		si->lowest_bit += nr_entries;
6898c2ecf20Sopenharmony_ci	if (end == si->highest_bit)
6908c2ecf20Sopenharmony_ci		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
6918c2ecf20Sopenharmony_ci	si->inuse_pages += nr_entries;
6928c2ecf20Sopenharmony_ci	if (si->inuse_pages == si->pages) {
6938c2ecf20Sopenharmony_ci		si->lowest_bit = si->max;
6948c2ecf20Sopenharmony_ci		si->highest_bit = 0;
6958c2ecf20Sopenharmony_ci		del_from_avail_list(si);
6968c2ecf20Sopenharmony_ci	}
6978c2ecf20Sopenharmony_ci}
6988c2ecf20Sopenharmony_ci
6998c2ecf20Sopenharmony_cistatic void add_to_avail_list(struct swap_info_struct *p)
7008c2ecf20Sopenharmony_ci{
7018c2ecf20Sopenharmony_ci	int nid;
7028c2ecf20Sopenharmony_ci
7038c2ecf20Sopenharmony_ci	spin_lock(&swap_avail_lock);
7048c2ecf20Sopenharmony_ci	for_each_node(nid) {
7058c2ecf20Sopenharmony_ci		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
7068c2ecf20Sopenharmony_ci		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
7078c2ecf20Sopenharmony_ci	}
7088c2ecf20Sopenharmony_ci	spin_unlock(&swap_avail_lock);
7098c2ecf20Sopenharmony_ci}
7108c2ecf20Sopenharmony_ci
7118c2ecf20Sopenharmony_cistatic void swap_range_free(struct swap_info_struct *si, unsigned long offset,
7128c2ecf20Sopenharmony_ci			    unsigned int nr_entries)
7138c2ecf20Sopenharmony_ci{
7148c2ecf20Sopenharmony_ci	unsigned long begin = offset;
7158c2ecf20Sopenharmony_ci	unsigned long end = offset + nr_entries - 1;
7168c2ecf20Sopenharmony_ci	void (*swap_slot_free_notify)(struct block_device *, unsigned long);
7178c2ecf20Sopenharmony_ci
7188c2ecf20Sopenharmony_ci	if (offset < si->lowest_bit)
7198c2ecf20Sopenharmony_ci		si->lowest_bit = offset;
7208c2ecf20Sopenharmony_ci	if (end > si->highest_bit) {
7218c2ecf20Sopenharmony_ci		bool was_full = !si->highest_bit;
7228c2ecf20Sopenharmony_ci
7238c2ecf20Sopenharmony_ci		WRITE_ONCE(si->highest_bit, end);
7248c2ecf20Sopenharmony_ci		if (was_full && (si->flags & SWP_WRITEOK))
7258c2ecf20Sopenharmony_ci			add_to_avail_list(si);
7268c2ecf20Sopenharmony_ci	}
7278c2ecf20Sopenharmony_ci	atomic_long_add(nr_entries, &nr_swap_pages);
7288c2ecf20Sopenharmony_ci	si->inuse_pages -= nr_entries;
7298c2ecf20Sopenharmony_ci	if (si->flags & SWP_BLKDEV)
7308c2ecf20Sopenharmony_ci		swap_slot_free_notify =
7318c2ecf20Sopenharmony_ci			si->bdev->bd_disk->fops->swap_slot_free_notify;
7328c2ecf20Sopenharmony_ci	else
7338c2ecf20Sopenharmony_ci		swap_slot_free_notify = NULL;
7348c2ecf20Sopenharmony_ci	while (offset <= end) {
7358c2ecf20Sopenharmony_ci		arch_swap_invalidate_page(si->type, offset);
7368c2ecf20Sopenharmony_ci		frontswap_invalidate_page(si->type, offset);
7378c2ecf20Sopenharmony_ci		if (swap_slot_free_notify)
7388c2ecf20Sopenharmony_ci			swap_slot_free_notify(si->bdev, offset);
7398c2ecf20Sopenharmony_ci		offset++;
7408c2ecf20Sopenharmony_ci	}
7418c2ecf20Sopenharmony_ci	clear_shadow_from_swap_cache(si->type, begin, end);
7428c2ecf20Sopenharmony_ci}
7438c2ecf20Sopenharmony_ci
7448c2ecf20Sopenharmony_cistatic void set_cluster_next(struct swap_info_struct *si, unsigned long next)
7458c2ecf20Sopenharmony_ci{
7468c2ecf20Sopenharmony_ci	unsigned long prev;
7478c2ecf20Sopenharmony_ci
7488c2ecf20Sopenharmony_ci	if (!(si->flags & SWP_SOLIDSTATE)) {
7498c2ecf20Sopenharmony_ci		si->cluster_next = next;
7508c2ecf20Sopenharmony_ci		return;
7518c2ecf20Sopenharmony_ci	}
7528c2ecf20Sopenharmony_ci
7538c2ecf20Sopenharmony_ci	prev = this_cpu_read(*si->cluster_next_cpu);
7548c2ecf20Sopenharmony_ci	/*
7558c2ecf20Sopenharmony_ci	 * Cross the swap address space size aligned trunk, choose
7568c2ecf20Sopenharmony_ci	 * another trunk randomly to avoid lock contention on swap
7578c2ecf20Sopenharmony_ci	 * address space if possible.
7588c2ecf20Sopenharmony_ci	 */
7598c2ecf20Sopenharmony_ci	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
7608c2ecf20Sopenharmony_ci	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
7618c2ecf20Sopenharmony_ci		/* No free swap slots available */
7628c2ecf20Sopenharmony_ci		if (si->highest_bit <= si->lowest_bit)
7638c2ecf20Sopenharmony_ci			return;
7648c2ecf20Sopenharmony_ci		next = si->lowest_bit +
7658c2ecf20Sopenharmony_ci			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
7668c2ecf20Sopenharmony_ci		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
7678c2ecf20Sopenharmony_ci		next = max_t(unsigned int, next, si->lowest_bit);
7688c2ecf20Sopenharmony_ci	}
7698c2ecf20Sopenharmony_ci	this_cpu_write(*si->cluster_next_cpu, next);
7708c2ecf20Sopenharmony_ci}
7718c2ecf20Sopenharmony_ci
/*
 * Allocate up to @nr swap slots from @si.  Each slot found is marked with
 * @usage in si->swap_map and its entry is stored in @slots.
 *
 * Returns the number of entries stored in @slots (0 if none was found).
 *
 * Entered with si->lock held and returns with it held, but the lock is
 * dropped and retaken on several paths below (the cluster search, the
 * reclaim attempt, cond_resched() and the "scan" loops).  SWP_SCANNING is
 * added to si->flags for the duration of the call.
 */
7728c2ecf20Sopenharmony_cistatic int scan_swap_map_slots(struct swap_info_struct *si,
7738c2ecf20Sopenharmony_ci			       unsigned char usage, int nr,
7748c2ecf20Sopenharmony_ci			       swp_entry_t slots[])
7758c2ecf20Sopenharmony_ci{
7768c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
7778c2ecf20Sopenharmony_ci	unsigned long offset;
7788c2ecf20Sopenharmony_ci	unsigned long scan_base;
7798c2ecf20Sopenharmony_ci	unsigned long last_in_cluster = 0;
7808c2ecf20Sopenharmony_ci	int latency_ration = LATENCY_LIMIT;
7818c2ecf20Sopenharmony_ci	int n_ret = 0;
7828c2ecf20Sopenharmony_ci	bool scanned_many = false;
7838c2ecf20Sopenharmony_ci
7848c2ecf20Sopenharmony_ci	/*
7858c2ecf20Sopenharmony_ci	 * We try to cluster swap pages by allocating them sequentially
7868c2ecf20Sopenharmony_ci	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
7878c2ecf20Sopenharmony_ci	 * way, however, we resort to first-free allocation, starting
7888c2ecf20Sopenharmony_ci	 * a new cluster.  This prevents us from scattering swap pages
7898c2ecf20Sopenharmony_ci	 * all over the entire swap partition, so that we reduce
7908c2ecf20Sopenharmony_ci	 * overall disk seek times between swap pages.  -- sct
7918c2ecf20Sopenharmony_ci	 * But we do now try to find an empty cluster.  -Andrea
7928c2ecf20Sopenharmony_ci	 * And we let swap pages go all over an SSD partition.  Hugh
7938c2ecf20Sopenharmony_ci	 */
7948c2ecf20Sopenharmony_ci
7958c2ecf20Sopenharmony_ci	si->flags += SWP_SCANNING;
7968c2ecf20Sopenharmony_ci	/*
7978c2ecf20Sopenharmony_ci	 * Use percpu scan base for SSD to reduce lock contention on
7988c2ecf20Sopenharmony_ci	 * cluster and swap cache.  For HDD, sequential access is more
7998c2ecf20Sopenharmony_ci	 * important.
8008c2ecf20Sopenharmony_ci	 */
8018c2ecf20Sopenharmony_ci	if (si->flags & SWP_SOLIDSTATE)
8028c2ecf20Sopenharmony_ci		scan_base = this_cpu_read(*si->cluster_next_cpu);
8038c2ecf20Sopenharmony_ci	else
8048c2ecf20Sopenharmony_ci		scan_base = si->cluster_next;
8058c2ecf20Sopenharmony_ci	offset = scan_base;
8068c2ecf20Sopenharmony_ci
8078c2ecf20Sopenharmony_ci	/* SSD algorithm */
8088c2ecf20Sopenharmony_ci	if (si->cluster_info) {
8098c2ecf20Sopenharmony_ci		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
8108c2ecf20Sopenharmony_ci			goto scan;
8118c2ecf20Sopenharmony_ci	} else if (unlikely(!si->cluster_nr--)) {
		/*
		 * si->cluster_nr was already 0: look for a fresh run of
		 * SWAPFILE_CLUSTER free slots, unless fewer than that many
		 * pages are unused in total.
		 */
8128c2ecf20Sopenharmony_ci		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
8138c2ecf20Sopenharmony_ci			si->cluster_nr = SWAPFILE_CLUSTER - 1;
8148c2ecf20Sopenharmony_ci			goto checks;
8158c2ecf20Sopenharmony_ci		}
8168c2ecf20Sopenharmony_ci
8178c2ecf20Sopenharmony_ci		spin_unlock(&si->lock);
8188c2ecf20Sopenharmony_ci
8198c2ecf20Sopenharmony_ci		/*
8208c2ecf20Sopenharmony_ci		 * If seek is expensive, start searching for new cluster from
8218c2ecf20Sopenharmony_ci		 * start of partition, to minimize the span of allocated swap.
8228c2ecf20Sopenharmony_ci		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
8238c2ecf20Sopenharmony_ci		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
8248c2ecf20Sopenharmony_ci		 */
8258c2ecf20Sopenharmony_ci		scan_base = offset = si->lowest_bit;
8268c2ecf20Sopenharmony_ci		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
8278c2ecf20Sopenharmony_ci
8288c2ecf20Sopenharmony_ci		/* Locate the first empty (unaligned) cluster */
8298c2ecf20Sopenharmony_ci		for (; last_in_cluster <= si->highest_bit; offset++) {
8308c2ecf20Sopenharmony_ci			if (si->swap_map[offset])
8318c2ecf20Sopenharmony_ci				last_in_cluster = offset + SWAPFILE_CLUSTER;
8328c2ecf20Sopenharmony_ci			else if (offset == last_in_cluster) {
8338c2ecf20Sopenharmony_ci				spin_lock(&si->lock);
8348c2ecf20Sopenharmony_ci				offset -= SWAPFILE_CLUSTER - 1;
8358c2ecf20Sopenharmony_ci				si->cluster_next = offset;
8368c2ecf20Sopenharmony_ci				si->cluster_nr = SWAPFILE_CLUSTER - 1;
8378c2ecf20Sopenharmony_ci				goto checks;
8388c2ecf20Sopenharmony_ci			}
8398c2ecf20Sopenharmony_ci			if (unlikely(--latency_ration < 0)) {
8408c2ecf20Sopenharmony_ci				cond_resched();
8418c2ecf20Sopenharmony_ci				latency_ration = LATENCY_LIMIT;
8428c2ecf20Sopenharmony_ci			}
8438c2ecf20Sopenharmony_ci		}
8448c2ecf20Sopenharmony_ci
8458c2ecf20Sopenharmony_ci		offset = scan_base;
8468c2ecf20Sopenharmony_ci		spin_lock(&si->lock);
8478c2ecf20Sopenharmony_ci		si->cluster_nr = SWAPFILE_CLUSTER - 1;
8488c2ecf20Sopenharmony_ci	}
8498c2ecf20Sopenharmony_ci
/*
 * checks: reached with si->lock held.  Validate the candidate @offset and,
 * if its swap_map byte is zero, claim it.
 */
8508c2ecf20Sopenharmony_cichecks:
8518c2ecf20Sopenharmony_ci	if (si->cluster_info) {
8528c2ecf20Sopenharmony_ci		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
8538c2ecf20Sopenharmony_ci		/* take a break if we already got some slots */
8548c2ecf20Sopenharmony_ci			if (n_ret)
8558c2ecf20Sopenharmony_ci				goto done;
8568c2ecf20Sopenharmony_ci			if (!scan_swap_map_try_ssd_cluster(si, &offset,
8578c2ecf20Sopenharmony_ci							&scan_base))
8588c2ecf20Sopenharmony_ci				goto scan;
8598c2ecf20Sopenharmony_ci		}
8608c2ecf20Sopenharmony_ci	}
8618c2ecf20Sopenharmony_ci	if (!(si->flags & SWP_WRITEOK))
8628c2ecf20Sopenharmony_ci		goto no_page;
8638c2ecf20Sopenharmony_ci	if (!si->highest_bit)
8648c2ecf20Sopenharmony_ci		goto no_page;
8658c2ecf20Sopenharmony_ci	if (offset > si->highest_bit)
8668c2ecf20Sopenharmony_ci		scan_base = offset = si->lowest_bit;
8678c2ecf20Sopenharmony_ci
8688c2ecf20Sopenharmony_ci	ci = lock_cluster(si, offset);
8698c2ecf20Sopenharmony_ci	/* reuse swap entry of cache-only swap if not busy. */
8708c2ecf20Sopenharmony_ci	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
8718c2ecf20Sopenharmony_ci		int swap_was_freed;
8728c2ecf20Sopenharmony_ci		unlock_cluster(ci);
8738c2ecf20Sopenharmony_ci		spin_unlock(&si->lock);
8748c2ecf20Sopenharmony_ci		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
8758c2ecf20Sopenharmony_ci		spin_lock(&si->lock);
8768c2ecf20Sopenharmony_ci		/* entry was freed successfully, try to use this again */
8778c2ecf20Sopenharmony_ci		if (swap_was_freed)
8788c2ecf20Sopenharmony_ci			goto checks;
8798c2ecf20Sopenharmony_ci		goto scan; /* check next one */
8808c2ecf20Sopenharmony_ci	}
8818c2ecf20Sopenharmony_ci
8828c2ecf20Sopenharmony_ci	if (si->swap_map[offset]) {
8838c2ecf20Sopenharmony_ci		unlock_cluster(ci);
8848c2ecf20Sopenharmony_ci		if (!n_ret)
8858c2ecf20Sopenharmony_ci			goto scan;
8868c2ecf20Sopenharmony_ci		else
8878c2ecf20Sopenharmony_ci			goto done;
8888c2ecf20Sopenharmony_ci	}
8898c2ecf20Sopenharmony_ci	WRITE_ONCE(si->swap_map[offset], usage);
8908c2ecf20Sopenharmony_ci	inc_cluster_info_page(si, si->cluster_info, offset);
8918c2ecf20Sopenharmony_ci	unlock_cluster(ci);
8928c2ecf20Sopenharmony_ci
8938c2ecf20Sopenharmony_ci	swap_range_alloc(si, offset, 1);
8948c2ecf20Sopenharmony_ci	slots[n_ret++] = swp_entry(si->type, offset);
8958c2ecf20Sopenharmony_ci
8968c2ecf20Sopenharmony_ci	/* got enough slots or reach max slots? */
8978c2ecf20Sopenharmony_ci	if ((n_ret == nr) || (offset >= si->highest_bit))
8988c2ecf20Sopenharmony_ci		goto done;
8998c2ecf20Sopenharmony_ci
9008c2ecf20Sopenharmony_ci	/* search for next available slot */
9018c2ecf20Sopenharmony_ci
9028c2ecf20Sopenharmony_ci	/* time to take a break? */
9038c2ecf20Sopenharmony_ci	if (unlikely(--latency_ration < 0)) {
9048c2ecf20Sopenharmony_ci		if (n_ret)
9058c2ecf20Sopenharmony_ci			goto done;
9068c2ecf20Sopenharmony_ci		spin_unlock(&si->lock);
9078c2ecf20Sopenharmony_ci		cond_resched();
9088c2ecf20Sopenharmony_ci		spin_lock(&si->lock);
9098c2ecf20Sopenharmony_ci		latency_ration = LATENCY_LIMIT;
9108c2ecf20Sopenharmony_ci	}
9118c2ecf20Sopenharmony_ci
9128c2ecf20Sopenharmony_ci	/* try to get more slots in cluster */
9138c2ecf20Sopenharmony_ci	if (si->cluster_info) {
9148c2ecf20Sopenharmony_ci		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
9158c2ecf20Sopenharmony_ci			goto checks;
9168c2ecf20Sopenharmony_ci	} else if (si->cluster_nr && !si->swap_map[++offset]) {
9178c2ecf20Sopenharmony_ci		/* non-ssd case, still more slots in cluster? */
9188c2ecf20Sopenharmony_ci		--si->cluster_nr;
9198c2ecf20Sopenharmony_ci		goto checks;
9208c2ecf20Sopenharmony_ci	}
9218c2ecf20Sopenharmony_ci
9228c2ecf20Sopenharmony_ci	/*
9238c2ecf20Sopenharmony_ci	 * Even if there's no free clusters available (fragmented),
9248c2ecf20Sopenharmony_ci	 * try to scan a little more quickly with lock held unless we
9258c2ecf20Sopenharmony_ci	 * have scanned too many slots already.
9268c2ecf20Sopenharmony_ci	 */
9278c2ecf20Sopenharmony_ci	if (!scanned_many) {
9288c2ecf20Sopenharmony_ci		unsigned long scan_limit;
9298c2ecf20Sopenharmony_ci
9308c2ecf20Sopenharmony_ci		if (offset < scan_base)
9318c2ecf20Sopenharmony_ci			scan_limit = scan_base;
9328c2ecf20Sopenharmony_ci		else
9338c2ecf20Sopenharmony_ci			scan_limit = si->highest_bit;
9348c2ecf20Sopenharmony_ci		for (; offset <= scan_limit && --latency_ration > 0;
9358c2ecf20Sopenharmony_ci		     offset++) {
9368c2ecf20Sopenharmony_ci			if (!si->swap_map[offset])
9378c2ecf20Sopenharmony_ci				goto checks;
9388c2ecf20Sopenharmony_ci		}
9398c2ecf20Sopenharmony_ci	}
9408c2ecf20Sopenharmony_ci
/*
 * done: record offset + 1 as the next scan hint, clear SWP_SCANNING and
 * return what was collected.
 */
9418c2ecf20Sopenharmony_cidone:
9428c2ecf20Sopenharmony_ci	set_cluster_next(si, offset + 1);
9438c2ecf20Sopenharmony_ci	si->flags -= SWP_SCANNING;
9448c2ecf20Sopenharmony_ci	return n_ret;
9458c2ecf20Sopenharmony_ci
/*
 * scan: lockless linear search for a candidate slot, first from offset + 1
 * up to highest_bit, then from lowest_bit up to scan_base.  A candidate is
 * re-validated under si->lock at "checks".  scanned_many is set once the
 * latency budget has been exhausted here.
 */
9468c2ecf20Sopenharmony_ciscan:
9478c2ecf20Sopenharmony_ci	spin_unlock(&si->lock);
9488c2ecf20Sopenharmony_ci	while (++offset <= READ_ONCE(si->highest_bit)) {
9498c2ecf20Sopenharmony_ci		if (data_race(!si->swap_map[offset])) {
9508c2ecf20Sopenharmony_ci			spin_lock(&si->lock);
9518c2ecf20Sopenharmony_ci			goto checks;
9528c2ecf20Sopenharmony_ci		}
9538c2ecf20Sopenharmony_ci		if (vm_swap_full() &&
9548c2ecf20Sopenharmony_ci		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
9558c2ecf20Sopenharmony_ci			spin_lock(&si->lock);
9568c2ecf20Sopenharmony_ci			goto checks;
9578c2ecf20Sopenharmony_ci		}
9588c2ecf20Sopenharmony_ci		if (unlikely(--latency_ration < 0)) {
9598c2ecf20Sopenharmony_ci			cond_resched();
9608c2ecf20Sopenharmony_ci			latency_ration = LATENCY_LIMIT;
9618c2ecf20Sopenharmony_ci			scanned_many = true;
9628c2ecf20Sopenharmony_ci		}
9638c2ecf20Sopenharmony_ci	}
9648c2ecf20Sopenharmony_ci	offset = si->lowest_bit;
9658c2ecf20Sopenharmony_ci	while (offset < scan_base) {
9668c2ecf20Sopenharmony_ci		if (data_race(!si->swap_map[offset])) {
9678c2ecf20Sopenharmony_ci			spin_lock(&si->lock);
9688c2ecf20Sopenharmony_ci			goto checks;
9698c2ecf20Sopenharmony_ci		}
9708c2ecf20Sopenharmony_ci		if (vm_swap_full() &&
9718c2ecf20Sopenharmony_ci		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
9728c2ecf20Sopenharmony_ci			spin_lock(&si->lock);
9738c2ecf20Sopenharmony_ci			goto checks;
9748c2ecf20Sopenharmony_ci		}
9758c2ecf20Sopenharmony_ci		if (unlikely(--latency_ration < 0)) {
9768c2ecf20Sopenharmony_ci			cond_resched();
9778c2ecf20Sopenharmony_ci			latency_ration = LATENCY_LIMIT;
9788c2ecf20Sopenharmony_ci			scanned_many = true;
9798c2ecf20Sopenharmony_ci		}
9808c2ecf20Sopenharmony_ci		offset++;
9818c2ecf20Sopenharmony_ci	}
9828c2ecf20Sopenharmony_ci	spin_lock(&si->lock);
9838c2ecf20Sopenharmony_ci
/* no_page: give up without updating the next-scan hint. */
9848c2ecf20Sopenharmony_cino_page:
9858c2ecf20Sopenharmony_ci	si->flags -= SWP_SCANNING;
9868c2ecf20Sopenharmony_ci	return n_ret;
9878c2ecf20Sopenharmony_ci}
9888c2ecf20Sopenharmony_ci
9898c2ecf20Sopenharmony_cistatic int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
9908c2ecf20Sopenharmony_ci{
9918c2ecf20Sopenharmony_ci	unsigned long idx;
9928c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
9938c2ecf20Sopenharmony_ci	unsigned long offset, i;
9948c2ecf20Sopenharmony_ci	unsigned char *map;
9958c2ecf20Sopenharmony_ci
9968c2ecf20Sopenharmony_ci	/*
9978c2ecf20Sopenharmony_ci	 * Should not even be attempting cluster allocations when huge
9988c2ecf20Sopenharmony_ci	 * page swap is disabled.  Warn and fail the allocation.
9998c2ecf20Sopenharmony_ci	 */
10008c2ecf20Sopenharmony_ci	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
10018c2ecf20Sopenharmony_ci		VM_WARN_ON_ONCE(1);
10028c2ecf20Sopenharmony_ci		return 0;
10038c2ecf20Sopenharmony_ci	}
10048c2ecf20Sopenharmony_ci
10058c2ecf20Sopenharmony_ci	if (cluster_list_empty(&si->free_clusters))
10068c2ecf20Sopenharmony_ci		return 0;
10078c2ecf20Sopenharmony_ci
10088c2ecf20Sopenharmony_ci	idx = cluster_list_first(&si->free_clusters);
10098c2ecf20Sopenharmony_ci	offset = idx * SWAPFILE_CLUSTER;
10108c2ecf20Sopenharmony_ci	ci = lock_cluster(si, offset);
10118c2ecf20Sopenharmony_ci	alloc_cluster(si, idx);
10128c2ecf20Sopenharmony_ci	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
10138c2ecf20Sopenharmony_ci
10148c2ecf20Sopenharmony_ci	map = si->swap_map + offset;
10158c2ecf20Sopenharmony_ci	for (i = 0; i < SWAPFILE_CLUSTER; i++)
10168c2ecf20Sopenharmony_ci		map[i] = SWAP_HAS_CACHE;
10178c2ecf20Sopenharmony_ci	unlock_cluster(ci);
10188c2ecf20Sopenharmony_ci	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
10198c2ecf20Sopenharmony_ci	*slot = swp_entry(si->type, offset);
10208c2ecf20Sopenharmony_ci
10218c2ecf20Sopenharmony_ci	return 1;
10228c2ecf20Sopenharmony_ci}
10238c2ecf20Sopenharmony_ci
10248c2ecf20Sopenharmony_cistatic void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
10258c2ecf20Sopenharmony_ci{
10268c2ecf20Sopenharmony_ci	unsigned long offset = idx * SWAPFILE_CLUSTER;
10278c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
10288c2ecf20Sopenharmony_ci
10298c2ecf20Sopenharmony_ci	ci = lock_cluster(si, offset);
10308c2ecf20Sopenharmony_ci	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
10318c2ecf20Sopenharmony_ci	cluster_set_count_flag(ci, 0, 0);
10328c2ecf20Sopenharmony_ci	free_cluster(si, idx);
10338c2ecf20Sopenharmony_ci	unlock_cluster(ci);
10348c2ecf20Sopenharmony_ci	swap_range_free(si, offset, SWAPFILE_CLUSTER);
10358c2ecf20Sopenharmony_ci}
10368c2ecf20Sopenharmony_ci
10378c2ecf20Sopenharmony_cistatic unsigned long scan_swap_map(struct swap_info_struct *si,
10388c2ecf20Sopenharmony_ci				   unsigned char usage)
10398c2ecf20Sopenharmony_ci{
10408c2ecf20Sopenharmony_ci	swp_entry_t entry;
10418c2ecf20Sopenharmony_ci	int n_ret;
10428c2ecf20Sopenharmony_ci
10438c2ecf20Sopenharmony_ci	n_ret = scan_swap_map_slots(si, usage, 1, &entry);
10448c2ecf20Sopenharmony_ci
10458c2ecf20Sopenharmony_ci	if (n_ret)
10468c2ecf20Sopenharmony_ci		return swp_offset(entry);
10478c2ecf20Sopenharmony_ci	else
10488c2ecf20Sopenharmony_ci		return 0;
10498c2ecf20Sopenharmony_ci
10508c2ecf20Sopenharmony_ci}
10518c2ecf20Sopenharmony_ci
/*
 * Allocate up to @n_goal swap entries and store them in @swp_entries.
 * Returns the number of entries allocated (0 when no swap is available).
 *
 * @entry_size is converted by swap_entry_size() into "size", which is used
 * below as the number of pages each entry accounts for; size ==
 * SWAPFILE_CLUSTER selects whole-cluster allocation, for which only a
 * single entry per call is supported.
 *
 * The goal is capped at SWAP_BATCH and at nr_swap_pages / size.
 * nr_swap_pages is charged for the whole capped goal up front and the
 * unused part is refunded at check_out.
 *
 * Devices are walked in the order of the current NUMA node's
 * swap_avail_heads[] plist.  swap_avail_lock protects that list; it is
 * dropped while a device's si->lock is held for the allocation itself.
 */
10528c2ecf20Sopenharmony_ciint get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
10538c2ecf20Sopenharmony_ci{
10548c2ecf20Sopenharmony_ci	unsigned long size = swap_entry_size(entry_size);
10558c2ecf20Sopenharmony_ci	struct swap_info_struct *si, *next;
10568c2ecf20Sopenharmony_ci	long avail_pgs;
10578c2ecf20Sopenharmony_ci	int n_ret = 0;
10588c2ecf20Sopenharmony_ci	int node;
10598c2ecf20Sopenharmony_ci
10608c2ecf20Sopenharmony_ci	/* Only single cluster request supported */
10618c2ecf20Sopenharmony_ci	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
10628c2ecf20Sopenharmony_ci
10638c2ecf20Sopenharmony_ci	spin_lock(&swap_avail_lock);
10648c2ecf20Sopenharmony_ci
10658c2ecf20Sopenharmony_ci	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
10668c2ecf20Sopenharmony_ci	if (avail_pgs <= 0) {
10678c2ecf20Sopenharmony_ci		spin_unlock(&swap_avail_lock);
10688c2ecf20Sopenharmony_ci		goto noswap;
10698c2ecf20Sopenharmony_ci	}
10708c2ecf20Sopenharmony_ci
10718c2ecf20Sopenharmony_ci	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
10728c2ecf20Sopenharmony_ci
	/* Charge the whole goal now; check_out refunds what is not used. */
10738c2ecf20Sopenharmony_ci	atomic_long_sub(n_goal * size, &nr_swap_pages);
10748c2ecf20Sopenharmony_ci
10758c2ecf20Sopenharmony_cistart_over:
10768c2ecf20Sopenharmony_ci	node = numa_node_id();
10778c2ecf20Sopenharmony_ci	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
10788c2ecf20Sopenharmony_ci		/* requeue si to after same-priority siblings */
10798c2ecf20Sopenharmony_ci		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
10808c2ecf20Sopenharmony_ci		spin_unlock(&swap_avail_lock);
10818c2ecf20Sopenharmony_ci		spin_lock(&si->lock);
10828c2ecf20Sopenharmony_ci		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			/*
			 * Device is full or not writable.  swap_avail_lock is
			 * taken here (inside si->lock) and stays held through
			 * nextsi.
			 */
10838c2ecf20Sopenharmony_ci			spin_lock(&swap_avail_lock);
10848c2ecf20Sopenharmony_ci			if (plist_node_empty(&si->avail_lists[node])) {
10858c2ecf20Sopenharmony_ci				spin_unlock(&si->lock);
10868c2ecf20Sopenharmony_ci				goto nextsi;
10878c2ecf20Sopenharmony_ci			}
10888c2ecf20Sopenharmony_ci			WARN(!si->highest_bit,
10898c2ecf20Sopenharmony_ci			     "swap_info %d in list but !highest_bit\n",
10908c2ecf20Sopenharmony_ci			     si->type);
10918c2ecf20Sopenharmony_ci			WARN(!(si->flags & SWP_WRITEOK),
10928c2ecf20Sopenharmony_ci			     "swap_info %d in list but !SWP_WRITEOK\n",
10938c2ecf20Sopenharmony_ci			     si->type);
10948c2ecf20Sopenharmony_ci			__del_from_avail_list(si);
10958c2ecf20Sopenharmony_ci			spin_unlock(&si->lock);
10968c2ecf20Sopenharmony_ci			goto nextsi;
10978c2ecf20Sopenharmony_ci		}
		/*
		 * Whole-cluster requests are only attempted on SWP_BLKDEV
		 * devices; on any other device n_ret keeps its value of 0.
		 */
10988c2ecf20Sopenharmony_ci		if (size == SWAPFILE_CLUSTER) {
10998c2ecf20Sopenharmony_ci			if (si->flags & SWP_BLKDEV)
11008c2ecf20Sopenharmony_ci				n_ret = swap_alloc_cluster(si, swp_entries);
11018c2ecf20Sopenharmony_ci		} else
11028c2ecf20Sopenharmony_ci			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
11038c2ecf20Sopenharmony_ci						    n_goal, swp_entries);
11048c2ecf20Sopenharmony_ci		spin_unlock(&si->lock);
		/*
		 * Stop at the first device that yields anything; a
		 * whole-cluster request stops after the first device tried,
		 * whether or not it succeeded.
		 */
11058c2ecf20Sopenharmony_ci		if (n_ret || size == SWAPFILE_CLUSTER)
11068c2ecf20Sopenharmony_ci			goto check_out;
11078c2ecf20Sopenharmony_ci		pr_debug("scan_swap_map of si %d failed to find offset\n",
11088c2ecf20Sopenharmony_ci			si->type);
11098c2ecf20Sopenharmony_ci		cond_resched();
11108c2ecf20Sopenharmony_ci
11118c2ecf20Sopenharmony_ci		spin_lock(&swap_avail_lock);
11128c2ecf20Sopenharmony_cinextsi:
11138c2ecf20Sopenharmony_ci		/*
11148c2ecf20Sopenharmony_ci		 * if we got here, it's likely that si was almost full before,
11158c2ecf20Sopenharmony_ci		 * and since scan_swap_map() can drop the si->lock, multiple
11168c2ecf20Sopenharmony_ci		 * callers probably all tried to get a page from the same si
11178c2ecf20Sopenharmony_ci		 * and it filled up before we could get one; or, the si filled
11188c2ecf20Sopenharmony_ci		 * up between us dropping swap_avail_lock and taking si->lock.
11198c2ecf20Sopenharmony_ci		 * Since we dropped the swap_avail_lock, the swap_avail_head
11208c2ecf20Sopenharmony_ci		 * list may have been modified; so if next is still in the
11218c2ecf20Sopenharmony_ci		 * swap_avail_head list then try it, otherwise start over
11228c2ecf20Sopenharmony_ci		 * if we have not gotten any slots.
11238c2ecf20Sopenharmony_ci		 */
11248c2ecf20Sopenharmony_ci		if (plist_node_empty(&next->avail_lists[node]))
11258c2ecf20Sopenharmony_ci			goto start_over;
11268c2ecf20Sopenharmony_ci	}
11278c2ecf20Sopenharmony_ci
11288c2ecf20Sopenharmony_ci	spin_unlock(&swap_avail_lock);
11298c2ecf20Sopenharmony_ci
11308c2ecf20Sopenharmony_cicheck_out:
	/* Refund the part of the up-front charge that was not allocated. */
11318c2ecf20Sopenharmony_ci	if (n_ret < n_goal)
11328c2ecf20Sopenharmony_ci		atomic_long_add((long)(n_goal - n_ret) * size,
11338c2ecf20Sopenharmony_ci				&nr_swap_pages);
11348c2ecf20Sopenharmony_cinoswap:
11358c2ecf20Sopenharmony_ci	return n_ret;
11368c2ecf20Sopenharmony_ci}
11378c2ecf20Sopenharmony_ci
11388c2ecf20Sopenharmony_ci/* The only caller of this function is now suspend routine */
11398c2ecf20Sopenharmony_ciswp_entry_t get_swap_page_of_type(int type)
11408c2ecf20Sopenharmony_ci{
11418c2ecf20Sopenharmony_ci	struct swap_info_struct *si = swap_type_to_swap_info(type);
11428c2ecf20Sopenharmony_ci	pgoff_t offset;
11438c2ecf20Sopenharmony_ci
11448c2ecf20Sopenharmony_ci	if (!si)
11458c2ecf20Sopenharmony_ci		goto fail;
11468c2ecf20Sopenharmony_ci
11478c2ecf20Sopenharmony_ci	spin_lock(&si->lock);
11488c2ecf20Sopenharmony_ci	if (si->flags & SWP_WRITEOK) {
11498c2ecf20Sopenharmony_ci		/* This is called for allocating swap entry, not cache */
11508c2ecf20Sopenharmony_ci		offset = scan_swap_map(si, 1);
11518c2ecf20Sopenharmony_ci		if (offset) {
11528c2ecf20Sopenharmony_ci			atomic_long_dec(&nr_swap_pages);
11538c2ecf20Sopenharmony_ci			spin_unlock(&si->lock);
11548c2ecf20Sopenharmony_ci			return swp_entry(type, offset);
11558c2ecf20Sopenharmony_ci		}
11568c2ecf20Sopenharmony_ci	}
11578c2ecf20Sopenharmony_ci	spin_unlock(&si->lock);
11588c2ecf20Sopenharmony_cifail:
11598c2ecf20Sopenharmony_ci	return (swp_entry_t) {0};
11608c2ecf20Sopenharmony_ci}
11618c2ecf20Sopenharmony_ci
11628c2ecf20Sopenharmony_cistatic struct swap_info_struct *__swap_info_get(swp_entry_t entry)
11638c2ecf20Sopenharmony_ci{
11648c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
11658c2ecf20Sopenharmony_ci	unsigned long offset;
11668c2ecf20Sopenharmony_ci
11678c2ecf20Sopenharmony_ci	if (!entry.val)
11688c2ecf20Sopenharmony_ci		goto out;
11698c2ecf20Sopenharmony_ci	p = swp_swap_info(entry);
11708c2ecf20Sopenharmony_ci	if (!p)
11718c2ecf20Sopenharmony_ci		goto bad_nofile;
11728c2ecf20Sopenharmony_ci	if (data_race(!(p->flags & SWP_USED)))
11738c2ecf20Sopenharmony_ci		goto bad_device;
11748c2ecf20Sopenharmony_ci	offset = swp_offset(entry);
11758c2ecf20Sopenharmony_ci	if (offset >= p->max)
11768c2ecf20Sopenharmony_ci		goto bad_offset;
11778c2ecf20Sopenharmony_ci	return p;
11788c2ecf20Sopenharmony_ci
11798c2ecf20Sopenharmony_cibad_offset:
11808c2ecf20Sopenharmony_ci	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
11818c2ecf20Sopenharmony_ci	goto out;
11828c2ecf20Sopenharmony_cibad_device:
11838c2ecf20Sopenharmony_ci	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
11848c2ecf20Sopenharmony_ci	goto out;
11858c2ecf20Sopenharmony_cibad_nofile:
11868c2ecf20Sopenharmony_ci	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
11878c2ecf20Sopenharmony_ciout:
11888c2ecf20Sopenharmony_ci	return NULL;
11898c2ecf20Sopenharmony_ci}
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_cistatic struct swap_info_struct *_swap_info_get(swp_entry_t entry)
11928c2ecf20Sopenharmony_ci{
11938c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
11948c2ecf20Sopenharmony_ci
11958c2ecf20Sopenharmony_ci	p = __swap_info_get(entry);
11968c2ecf20Sopenharmony_ci	if (!p)
11978c2ecf20Sopenharmony_ci		goto out;
11988c2ecf20Sopenharmony_ci	if (data_race(!p->swap_map[swp_offset(entry)]))
11998c2ecf20Sopenharmony_ci		goto bad_free;
12008c2ecf20Sopenharmony_ci	return p;
12018c2ecf20Sopenharmony_ci
12028c2ecf20Sopenharmony_cibad_free:
12038c2ecf20Sopenharmony_ci	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
12048c2ecf20Sopenharmony_ciout:
12058c2ecf20Sopenharmony_ci	return NULL;
12068c2ecf20Sopenharmony_ci}
12078c2ecf20Sopenharmony_ci
12088c2ecf20Sopenharmony_cistatic struct swap_info_struct *swap_info_get(swp_entry_t entry)
12098c2ecf20Sopenharmony_ci{
12108c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
12118c2ecf20Sopenharmony_ci
12128c2ecf20Sopenharmony_ci	p = _swap_info_get(entry);
12138c2ecf20Sopenharmony_ci	if (p)
12148c2ecf20Sopenharmony_ci		spin_lock(&p->lock);
12158c2ecf20Sopenharmony_ci	return p;
12168c2ecf20Sopenharmony_ci}
12178c2ecf20Sopenharmony_ci
12188c2ecf20Sopenharmony_cistatic struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
12198c2ecf20Sopenharmony_ci					struct swap_info_struct *q)
12208c2ecf20Sopenharmony_ci{
12218c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
12228c2ecf20Sopenharmony_ci
12238c2ecf20Sopenharmony_ci	p = _swap_info_get(entry);
12248c2ecf20Sopenharmony_ci
12258c2ecf20Sopenharmony_ci	if (p != q) {
12268c2ecf20Sopenharmony_ci		if (q != NULL)
12278c2ecf20Sopenharmony_ci			spin_unlock(&q->lock);
12288c2ecf20Sopenharmony_ci		if (p != NULL)
12298c2ecf20Sopenharmony_ci			spin_lock(&p->lock);
12308c2ecf20Sopenharmony_ci	}
12318c2ecf20Sopenharmony_ci	return p;
12328c2ecf20Sopenharmony_ci}
12338c2ecf20Sopenharmony_ci
12348c2ecf20Sopenharmony_cistatic unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
12358c2ecf20Sopenharmony_ci					      unsigned long offset,
12368c2ecf20Sopenharmony_ci					      unsigned char usage)
12378c2ecf20Sopenharmony_ci{
12388c2ecf20Sopenharmony_ci	unsigned char count;
12398c2ecf20Sopenharmony_ci	unsigned char has_cache;
12408c2ecf20Sopenharmony_ci
12418c2ecf20Sopenharmony_ci	count = p->swap_map[offset];
12428c2ecf20Sopenharmony_ci
12438c2ecf20Sopenharmony_ci	has_cache = count & SWAP_HAS_CACHE;
12448c2ecf20Sopenharmony_ci	count &= ~SWAP_HAS_CACHE;
12458c2ecf20Sopenharmony_ci
12468c2ecf20Sopenharmony_ci	if (usage == SWAP_HAS_CACHE) {
12478c2ecf20Sopenharmony_ci		VM_BUG_ON(!has_cache);
12488c2ecf20Sopenharmony_ci		has_cache = 0;
12498c2ecf20Sopenharmony_ci	} else if (count == SWAP_MAP_SHMEM) {
12508c2ecf20Sopenharmony_ci		/*
12518c2ecf20Sopenharmony_ci		 * Or we could insist on shmem.c using a special
12528c2ecf20Sopenharmony_ci		 * swap_shmem_free() and free_shmem_swap_and_cache()...
12538c2ecf20Sopenharmony_ci		 */
12548c2ecf20Sopenharmony_ci		count = 0;
12558c2ecf20Sopenharmony_ci	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
12568c2ecf20Sopenharmony_ci		if (count == COUNT_CONTINUED) {
12578c2ecf20Sopenharmony_ci			if (swap_count_continued(p, offset, count))
12588c2ecf20Sopenharmony_ci				count = SWAP_MAP_MAX | COUNT_CONTINUED;
12598c2ecf20Sopenharmony_ci			else
12608c2ecf20Sopenharmony_ci				count = SWAP_MAP_MAX;
12618c2ecf20Sopenharmony_ci		} else
12628c2ecf20Sopenharmony_ci			count--;
12638c2ecf20Sopenharmony_ci	}
12648c2ecf20Sopenharmony_ci
12658c2ecf20Sopenharmony_ci	usage = count | has_cache;
12668c2ecf20Sopenharmony_ci	if (usage)
12678c2ecf20Sopenharmony_ci		WRITE_ONCE(p->swap_map[offset], usage);
12688c2ecf20Sopenharmony_ci	else
12698c2ecf20Sopenharmony_ci		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
12708c2ecf20Sopenharmony_ci
12718c2ecf20Sopenharmony_ci	return usage;
12728c2ecf20Sopenharmony_ci}
12738c2ecf20Sopenharmony_ci
12748c2ecf20Sopenharmony_ci/*
12758c2ecf20Sopenharmony_ci * Note that when only holding the PTL, swapoff might succeed immediately
12768c2ecf20Sopenharmony_ci * after freeing a swap entry. Therefore, immediately after
12778c2ecf20Sopenharmony_ci * __swap_entry_free(), the swap info might become stale and should not
12788c2ecf20Sopenharmony_ci * be touched without a prior get_swap_device().
12798c2ecf20Sopenharmony_ci *
12808c2ecf20Sopenharmony_ci * Check whether swap entry is valid in the swap device.  If so,
12818c2ecf20Sopenharmony_ci * return pointer to swap_info_struct, and keep the swap entry valid
12828c2ecf20Sopenharmony_ci * via preventing the swap device from being swapoff, until
12838c2ecf20Sopenharmony_ci * put_swap_device() is called.  Otherwise return NULL.
12848c2ecf20Sopenharmony_ci *
12858c2ecf20Sopenharmony_ci * The entirety of the RCU read critical section must come before the
12868c2ecf20Sopenharmony_ci * return from or after the call to synchronize_rcu() in
12878c2ecf20Sopenharmony_ci * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
12888c2ecf20Sopenharmony_ci * true, the si->map, si->cluster_info, etc. must be valid in the
12898c2ecf20Sopenharmony_ci * critical section.
12908c2ecf20Sopenharmony_ci *
12918c2ecf20Sopenharmony_ci * Notice that swapoff or swapoff+swapon can still happen before the
12928c2ecf20Sopenharmony_ci * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
12938c2ecf20Sopenharmony_ci * in put_swap_device() if there isn't any other way to prevent
12948c2ecf20Sopenharmony_ci * swapoff, such as page lock, page table lock, etc.  The caller must
12958c2ecf20Sopenharmony_ci * be prepared for that.  For example, the following situation is
12968c2ecf20Sopenharmony_ci * possible.
12978c2ecf20Sopenharmony_ci *
12988c2ecf20Sopenharmony_ci *   CPU1				CPU2
12998c2ecf20Sopenharmony_ci *   do_swap_page()
13008c2ecf20Sopenharmony_ci *     ...				swapoff+swapon
13018c2ecf20Sopenharmony_ci *     __read_swap_cache_async()
13028c2ecf20Sopenharmony_ci *       swapcache_prepare()
13038c2ecf20Sopenharmony_ci *         __swap_duplicate()
13048c2ecf20Sopenharmony_ci *           // check swap_map
13058c2ecf20Sopenharmony_ci *     // verify PTE not changed
13068c2ecf20Sopenharmony_ci *
 * In __swap_duplicate(), the swap_map needs to be checked before it is
 * changed, partly because the specified swap entry may belong to another
 * swap device that has since been swapped off.  And in do_swap_page(),
 * after the page is read from the swap device, the PTE is verified to be
 * unchanged with the page table locked, to check whether the swap device
 * has been swapoff or swapoff+swapon.
13138c2ecf20Sopenharmony_ci */
13148c2ecf20Sopenharmony_cistruct swap_info_struct *get_swap_device(swp_entry_t entry)
13158c2ecf20Sopenharmony_ci{
13168c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
13178c2ecf20Sopenharmony_ci	unsigned long offset;
13188c2ecf20Sopenharmony_ci
13198c2ecf20Sopenharmony_ci	if (!entry.val)
13208c2ecf20Sopenharmony_ci		goto out;
13218c2ecf20Sopenharmony_ci	si = swp_swap_info(entry);
13228c2ecf20Sopenharmony_ci	if (!si)
13238c2ecf20Sopenharmony_ci		goto bad_nofile;
13248c2ecf20Sopenharmony_ci
13258c2ecf20Sopenharmony_ci	rcu_read_lock();
13268c2ecf20Sopenharmony_ci	if (data_race(!(si->flags & SWP_VALID)))
13278c2ecf20Sopenharmony_ci		goto unlock_out;
13288c2ecf20Sopenharmony_ci	offset = swp_offset(entry);
13298c2ecf20Sopenharmony_ci	if (offset >= si->max)
13308c2ecf20Sopenharmony_ci		goto unlock_out;
13318c2ecf20Sopenharmony_ci
13328c2ecf20Sopenharmony_ci	return si;
13338c2ecf20Sopenharmony_cibad_nofile:
13348c2ecf20Sopenharmony_ci	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
13358c2ecf20Sopenharmony_ciout:
13368c2ecf20Sopenharmony_ci	return NULL;
13378c2ecf20Sopenharmony_ciunlock_out:
13388c2ecf20Sopenharmony_ci	rcu_read_unlock();
13398c2ecf20Sopenharmony_ci	return NULL;
13408c2ecf20Sopenharmony_ci}
13418c2ecf20Sopenharmony_ci
13428c2ecf20Sopenharmony_cistatic unsigned char __swap_entry_free(struct swap_info_struct *p,
13438c2ecf20Sopenharmony_ci				       swp_entry_t entry)
13448c2ecf20Sopenharmony_ci{
13458c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
13468c2ecf20Sopenharmony_ci	unsigned long offset = swp_offset(entry);
13478c2ecf20Sopenharmony_ci	unsigned char usage;
13488c2ecf20Sopenharmony_ci
13498c2ecf20Sopenharmony_ci	ci = lock_cluster_or_swap_info(p, offset);
13508c2ecf20Sopenharmony_ci	usage = __swap_entry_free_locked(p, offset, 1);
13518c2ecf20Sopenharmony_ci	unlock_cluster_or_swap_info(p, ci);
13528c2ecf20Sopenharmony_ci	if (!usage)
13538c2ecf20Sopenharmony_ci		free_swap_slot(entry);
13548c2ecf20Sopenharmony_ci
13558c2ecf20Sopenharmony_ci	return usage;
13568c2ecf20Sopenharmony_ci}
13578c2ecf20Sopenharmony_ci
13588c2ecf20Sopenharmony_cistatic void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
13598c2ecf20Sopenharmony_ci{
13608c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
13618c2ecf20Sopenharmony_ci	unsigned long offset = swp_offset(entry);
13628c2ecf20Sopenharmony_ci	unsigned char count;
13638c2ecf20Sopenharmony_ci
13648c2ecf20Sopenharmony_ci	ci = lock_cluster(p, offset);
13658c2ecf20Sopenharmony_ci	count = p->swap_map[offset];
13668c2ecf20Sopenharmony_ci	VM_BUG_ON(count != SWAP_HAS_CACHE);
13678c2ecf20Sopenharmony_ci	p->swap_map[offset] = 0;
13688c2ecf20Sopenharmony_ci	dec_cluster_info_page(p, p->cluster_info, offset);
13698c2ecf20Sopenharmony_ci	unlock_cluster(ci);
13708c2ecf20Sopenharmony_ci
13718c2ecf20Sopenharmony_ci	mem_cgroup_uncharge_swap(entry, 1);
13728c2ecf20Sopenharmony_ci	swap_range_free(p, offset, 1);
13738c2ecf20Sopenharmony_ci}
13748c2ecf20Sopenharmony_ci
13758c2ecf20Sopenharmony_ci/*
13768c2ecf20Sopenharmony_ci * Caller has made sure that the swap device corresponding to entry
13778c2ecf20Sopenharmony_ci * is still around or has not been recycled.
13788c2ecf20Sopenharmony_ci */
13798c2ecf20Sopenharmony_civoid swap_free(swp_entry_t entry)
13808c2ecf20Sopenharmony_ci{
13818c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
13828c2ecf20Sopenharmony_ci
13838c2ecf20Sopenharmony_ci	p = _swap_info_get(entry);
13848c2ecf20Sopenharmony_ci	if (p)
13858c2ecf20Sopenharmony_ci		__swap_entry_free(p, entry);
13868c2ecf20Sopenharmony_ci}
13878c2ecf20Sopenharmony_ci
13888c2ecf20Sopenharmony_ci/*
13898c2ecf20Sopenharmony_ci * Called after dropping swapcache to decrease refcnt to swap entries.
13908c2ecf20Sopenharmony_ci */
13918c2ecf20Sopenharmony_civoid put_swap_page(struct page *page, swp_entry_t entry)
13928c2ecf20Sopenharmony_ci{
13938c2ecf20Sopenharmony_ci	unsigned long offset = swp_offset(entry);
13948c2ecf20Sopenharmony_ci	unsigned long idx = offset / SWAPFILE_CLUSTER;
13958c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
13968c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
13978c2ecf20Sopenharmony_ci	unsigned char *map;
13988c2ecf20Sopenharmony_ci	unsigned int i, free_entries = 0;
13998c2ecf20Sopenharmony_ci	unsigned char val;
14008c2ecf20Sopenharmony_ci	int size = swap_entry_size(thp_nr_pages(page));
14018c2ecf20Sopenharmony_ci
14028c2ecf20Sopenharmony_ci	si = _swap_info_get(entry);
14038c2ecf20Sopenharmony_ci	if (!si)
14048c2ecf20Sopenharmony_ci		return;
14058c2ecf20Sopenharmony_ci
14068c2ecf20Sopenharmony_ci	ci = lock_cluster_or_swap_info(si, offset);
14078c2ecf20Sopenharmony_ci	if (size == SWAPFILE_CLUSTER) {
14088c2ecf20Sopenharmony_ci		VM_BUG_ON(!cluster_is_huge(ci));
14098c2ecf20Sopenharmony_ci		map = si->swap_map + offset;
14108c2ecf20Sopenharmony_ci		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
14118c2ecf20Sopenharmony_ci			val = map[i];
14128c2ecf20Sopenharmony_ci			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
14138c2ecf20Sopenharmony_ci			if (val == SWAP_HAS_CACHE)
14148c2ecf20Sopenharmony_ci				free_entries++;
14158c2ecf20Sopenharmony_ci		}
14168c2ecf20Sopenharmony_ci		cluster_clear_huge(ci);
14178c2ecf20Sopenharmony_ci		if (free_entries == SWAPFILE_CLUSTER) {
14188c2ecf20Sopenharmony_ci			unlock_cluster_or_swap_info(si, ci);
14198c2ecf20Sopenharmony_ci			spin_lock(&si->lock);
14208c2ecf20Sopenharmony_ci			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
14218c2ecf20Sopenharmony_ci			swap_free_cluster(si, idx);
14228c2ecf20Sopenharmony_ci			spin_unlock(&si->lock);
14238c2ecf20Sopenharmony_ci			return;
14248c2ecf20Sopenharmony_ci		}
14258c2ecf20Sopenharmony_ci	}
14268c2ecf20Sopenharmony_ci	for (i = 0; i < size; i++, entry.val++) {
14278c2ecf20Sopenharmony_ci		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
14288c2ecf20Sopenharmony_ci			unlock_cluster_or_swap_info(si, ci);
14298c2ecf20Sopenharmony_ci			free_swap_slot(entry);
14308c2ecf20Sopenharmony_ci			if (i == size - 1)
14318c2ecf20Sopenharmony_ci				return;
14328c2ecf20Sopenharmony_ci			lock_cluster_or_swap_info(si, offset);
14338c2ecf20Sopenharmony_ci		}
14348c2ecf20Sopenharmony_ci	}
14358c2ecf20Sopenharmony_ci	unlock_cluster_or_swap_info(si, ci);
14368c2ecf20Sopenharmony_ci}
14378c2ecf20Sopenharmony_ci
14388c2ecf20Sopenharmony_ci#ifdef CONFIG_THP_SWAP
14398c2ecf20Sopenharmony_ciint split_swap_cluster(swp_entry_t entry)
14408c2ecf20Sopenharmony_ci{
14418c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
14428c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
14438c2ecf20Sopenharmony_ci	unsigned long offset = swp_offset(entry);
14448c2ecf20Sopenharmony_ci
14458c2ecf20Sopenharmony_ci	si = _swap_info_get(entry);
14468c2ecf20Sopenharmony_ci	if (!si)
14478c2ecf20Sopenharmony_ci		return -EBUSY;
14488c2ecf20Sopenharmony_ci	ci = lock_cluster(si, offset);
14498c2ecf20Sopenharmony_ci	cluster_clear_huge(ci);
14508c2ecf20Sopenharmony_ci	unlock_cluster(ci);
14518c2ecf20Sopenharmony_ci	return 0;
14528c2ecf20Sopenharmony_ci}
14538c2ecf20Sopenharmony_ci#endif
14548c2ecf20Sopenharmony_ci
14558c2ecf20Sopenharmony_cistatic int swp_entry_cmp(const void *ent1, const void *ent2)
14568c2ecf20Sopenharmony_ci{
14578c2ecf20Sopenharmony_ci	const swp_entry_t *e1 = ent1, *e2 = ent2;
14588c2ecf20Sopenharmony_ci
14598c2ecf20Sopenharmony_ci	return (int)swp_type(*e1) - (int)swp_type(*e2);
14608c2ecf20Sopenharmony_ci}
14618c2ecf20Sopenharmony_ci
14628c2ecf20Sopenharmony_civoid swapcache_free_entries(swp_entry_t *entries, int n)
14638c2ecf20Sopenharmony_ci{
14648c2ecf20Sopenharmony_ci	struct swap_info_struct *p, *prev;
14658c2ecf20Sopenharmony_ci	int i;
14668c2ecf20Sopenharmony_ci
14678c2ecf20Sopenharmony_ci	if (n <= 0)
14688c2ecf20Sopenharmony_ci		return;
14698c2ecf20Sopenharmony_ci
14708c2ecf20Sopenharmony_ci	prev = NULL;
14718c2ecf20Sopenharmony_ci	p = NULL;
14728c2ecf20Sopenharmony_ci
14738c2ecf20Sopenharmony_ci	/*
14748c2ecf20Sopenharmony_ci	 * Sort swap entries by swap device, so each lock is only taken once.
14758c2ecf20Sopenharmony_ci	 * nr_swapfiles isn't absolutely correct, but the overhead of sort() is
14768c2ecf20Sopenharmony_ci	 * so low that it isn't necessary to optimize further.
14778c2ecf20Sopenharmony_ci	 */
14788c2ecf20Sopenharmony_ci	if (nr_swapfiles > 1)
14798c2ecf20Sopenharmony_ci		sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
14808c2ecf20Sopenharmony_ci	for (i = 0; i < n; ++i) {
14818c2ecf20Sopenharmony_ci		p = swap_info_get_cont(entries[i], prev);
14828c2ecf20Sopenharmony_ci		if (p)
14838c2ecf20Sopenharmony_ci			swap_entry_free(p, entries[i]);
14848c2ecf20Sopenharmony_ci		prev = p;
14858c2ecf20Sopenharmony_ci	}
14868c2ecf20Sopenharmony_ci	if (p)
14878c2ecf20Sopenharmony_ci		spin_unlock(&p->lock);
14888c2ecf20Sopenharmony_ci}
14898c2ecf20Sopenharmony_ci
14908c2ecf20Sopenharmony_ci/*
14918c2ecf20Sopenharmony_ci * How many references to page are currently swapped out?
14928c2ecf20Sopenharmony_ci * This does not give an exact answer when swap count is continued,
14938c2ecf20Sopenharmony_ci * but does include the high COUNT_CONTINUED flag to allow for that.
14948c2ecf20Sopenharmony_ci */
14958c2ecf20Sopenharmony_ciint page_swapcount(struct page *page)
14968c2ecf20Sopenharmony_ci{
14978c2ecf20Sopenharmony_ci	int count = 0;
14988c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
14998c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
15008c2ecf20Sopenharmony_ci	swp_entry_t entry;
15018c2ecf20Sopenharmony_ci	unsigned long offset;
15028c2ecf20Sopenharmony_ci
15038c2ecf20Sopenharmony_ci	entry.val = page_private(page);
15048c2ecf20Sopenharmony_ci	p = _swap_info_get(entry);
15058c2ecf20Sopenharmony_ci	if (p) {
15068c2ecf20Sopenharmony_ci		offset = swp_offset(entry);
15078c2ecf20Sopenharmony_ci		ci = lock_cluster_or_swap_info(p, offset);
15088c2ecf20Sopenharmony_ci		count = swap_count(p->swap_map[offset]);
15098c2ecf20Sopenharmony_ci		unlock_cluster_or_swap_info(p, ci);
15108c2ecf20Sopenharmony_ci	}
15118c2ecf20Sopenharmony_ci	return count;
15128c2ecf20Sopenharmony_ci}
15138c2ecf20Sopenharmony_ci
15148c2ecf20Sopenharmony_ciint __swap_count(swp_entry_t entry)
15158c2ecf20Sopenharmony_ci{
15168c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
15178c2ecf20Sopenharmony_ci	pgoff_t offset = swp_offset(entry);
15188c2ecf20Sopenharmony_ci	int count = 0;
15198c2ecf20Sopenharmony_ci
15208c2ecf20Sopenharmony_ci	si = get_swap_device(entry);
15218c2ecf20Sopenharmony_ci	if (si) {
15228c2ecf20Sopenharmony_ci		count = swap_count(si->swap_map[offset]);
15238c2ecf20Sopenharmony_ci		put_swap_device(si);
15248c2ecf20Sopenharmony_ci	}
15258c2ecf20Sopenharmony_ci	return count;
15268c2ecf20Sopenharmony_ci}
15278c2ecf20Sopenharmony_ci
15288c2ecf20Sopenharmony_cistatic int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
15298c2ecf20Sopenharmony_ci{
15308c2ecf20Sopenharmony_ci	int count = 0;
15318c2ecf20Sopenharmony_ci	pgoff_t offset = swp_offset(entry);
15328c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
15338c2ecf20Sopenharmony_ci
15348c2ecf20Sopenharmony_ci	ci = lock_cluster_or_swap_info(si, offset);
15358c2ecf20Sopenharmony_ci	count = swap_count(si->swap_map[offset]);
15368c2ecf20Sopenharmony_ci	unlock_cluster_or_swap_info(si, ci);
15378c2ecf20Sopenharmony_ci	return count;
15388c2ecf20Sopenharmony_ci}
15398c2ecf20Sopenharmony_ci
15408c2ecf20Sopenharmony_ci/*
15418c2ecf20Sopenharmony_ci * How many references to @entry are currently swapped out?
15428c2ecf20Sopenharmony_ci * This does not give an exact answer when swap count is continued,
15438c2ecf20Sopenharmony_ci * but does include the high COUNT_CONTINUED flag to allow for that.
15448c2ecf20Sopenharmony_ci */
15458c2ecf20Sopenharmony_ciint __swp_swapcount(swp_entry_t entry)
15468c2ecf20Sopenharmony_ci{
15478c2ecf20Sopenharmony_ci	int count = 0;
15488c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
15498c2ecf20Sopenharmony_ci
15508c2ecf20Sopenharmony_ci	si = get_swap_device(entry);
15518c2ecf20Sopenharmony_ci	if (si) {
15528c2ecf20Sopenharmony_ci		count = swap_swapcount(si, entry);
15538c2ecf20Sopenharmony_ci		put_swap_device(si);
15548c2ecf20Sopenharmony_ci	}
15558c2ecf20Sopenharmony_ci	return count;
15568c2ecf20Sopenharmony_ci}
15578c2ecf20Sopenharmony_ci
15588c2ecf20Sopenharmony_ci/*
15598c2ecf20Sopenharmony_ci * How many references to @entry are currently swapped out?
15608c2ecf20Sopenharmony_ci * This considers COUNT_CONTINUED so it returns exact answer.
15618c2ecf20Sopenharmony_ci */
15628c2ecf20Sopenharmony_ciint swp_swapcount(swp_entry_t entry)
15638c2ecf20Sopenharmony_ci{
15648c2ecf20Sopenharmony_ci	int count, tmp_count, n;
15658c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
15668c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
15678c2ecf20Sopenharmony_ci	struct page *page;
15688c2ecf20Sopenharmony_ci	pgoff_t offset;
15698c2ecf20Sopenharmony_ci	unsigned char *map;
15708c2ecf20Sopenharmony_ci
15718c2ecf20Sopenharmony_ci	p = _swap_info_get(entry);
15728c2ecf20Sopenharmony_ci	if (!p)
15738c2ecf20Sopenharmony_ci		return 0;
15748c2ecf20Sopenharmony_ci
15758c2ecf20Sopenharmony_ci	offset = swp_offset(entry);
15768c2ecf20Sopenharmony_ci
15778c2ecf20Sopenharmony_ci	ci = lock_cluster_or_swap_info(p, offset);
15788c2ecf20Sopenharmony_ci
15798c2ecf20Sopenharmony_ci	count = swap_count(p->swap_map[offset]);
15808c2ecf20Sopenharmony_ci	if (!(count & COUNT_CONTINUED))
15818c2ecf20Sopenharmony_ci		goto out;
15828c2ecf20Sopenharmony_ci
15838c2ecf20Sopenharmony_ci	count &= ~COUNT_CONTINUED;
15848c2ecf20Sopenharmony_ci	n = SWAP_MAP_MAX + 1;
15858c2ecf20Sopenharmony_ci
15868c2ecf20Sopenharmony_ci	page = vmalloc_to_page(p->swap_map + offset);
15878c2ecf20Sopenharmony_ci	offset &= ~PAGE_MASK;
15888c2ecf20Sopenharmony_ci	VM_BUG_ON(page_private(page) != SWP_CONTINUED);
15898c2ecf20Sopenharmony_ci
15908c2ecf20Sopenharmony_ci	do {
15918c2ecf20Sopenharmony_ci		page = list_next_entry(page, lru);
15928c2ecf20Sopenharmony_ci		map = kmap_atomic(page);
15938c2ecf20Sopenharmony_ci		tmp_count = map[offset];
15948c2ecf20Sopenharmony_ci		kunmap_atomic(map);
15958c2ecf20Sopenharmony_ci
15968c2ecf20Sopenharmony_ci		count += (tmp_count & ~COUNT_CONTINUED) * n;
15978c2ecf20Sopenharmony_ci		n *= (SWAP_CONT_MAX + 1);
15988c2ecf20Sopenharmony_ci	} while (tmp_count & COUNT_CONTINUED);
15998c2ecf20Sopenharmony_ciout:
16008c2ecf20Sopenharmony_ci	unlock_cluster_or_swap_info(p, ci);
16018c2ecf20Sopenharmony_ci	return count;
16028c2ecf20Sopenharmony_ci}
16038c2ecf20Sopenharmony_ci
16048c2ecf20Sopenharmony_cistatic bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
16058c2ecf20Sopenharmony_ci					 swp_entry_t entry)
16068c2ecf20Sopenharmony_ci{
16078c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
16088c2ecf20Sopenharmony_ci	unsigned char *map = si->swap_map;
16098c2ecf20Sopenharmony_ci	unsigned long roffset = swp_offset(entry);
16108c2ecf20Sopenharmony_ci	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
16118c2ecf20Sopenharmony_ci	int i;
16128c2ecf20Sopenharmony_ci	bool ret = false;
16138c2ecf20Sopenharmony_ci
16148c2ecf20Sopenharmony_ci	ci = lock_cluster_or_swap_info(si, offset);
16158c2ecf20Sopenharmony_ci	if (!ci || !cluster_is_huge(ci)) {
16168c2ecf20Sopenharmony_ci		if (swap_count(map[roffset]))
16178c2ecf20Sopenharmony_ci			ret = true;
16188c2ecf20Sopenharmony_ci		goto unlock_out;
16198c2ecf20Sopenharmony_ci	}
16208c2ecf20Sopenharmony_ci	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
16218c2ecf20Sopenharmony_ci		if (swap_count(map[offset + i])) {
16228c2ecf20Sopenharmony_ci			ret = true;
16238c2ecf20Sopenharmony_ci			break;
16248c2ecf20Sopenharmony_ci		}
16258c2ecf20Sopenharmony_ci	}
16268c2ecf20Sopenharmony_ciunlock_out:
16278c2ecf20Sopenharmony_ci	unlock_cluster_or_swap_info(si, ci);
16288c2ecf20Sopenharmony_ci	return ret;
16298c2ecf20Sopenharmony_ci}
16308c2ecf20Sopenharmony_ci
16318c2ecf20Sopenharmony_cistatic bool page_swapped(struct page *page)
16328c2ecf20Sopenharmony_ci{
16338c2ecf20Sopenharmony_ci	swp_entry_t entry;
16348c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
16358c2ecf20Sopenharmony_ci
16368c2ecf20Sopenharmony_ci	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
16378c2ecf20Sopenharmony_ci		return page_swapcount(page) != 0;
16388c2ecf20Sopenharmony_ci
16398c2ecf20Sopenharmony_ci	page = compound_head(page);
16408c2ecf20Sopenharmony_ci	entry.val = page_private(page);
16418c2ecf20Sopenharmony_ci	si = _swap_info_get(entry);
16428c2ecf20Sopenharmony_ci	if (si)
16438c2ecf20Sopenharmony_ci		return swap_page_trans_huge_swapped(si, entry);
16448c2ecf20Sopenharmony_ci	return false;
16458c2ecf20Sopenharmony_ci}
16468c2ecf20Sopenharmony_ci
16478c2ecf20Sopenharmony_cistatic int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
16488c2ecf20Sopenharmony_ci					 int *total_swapcount)
16498c2ecf20Sopenharmony_ci{
16508c2ecf20Sopenharmony_ci	int i, map_swapcount, _total_mapcount, _total_swapcount;
16518c2ecf20Sopenharmony_ci	unsigned long offset = 0;
16528c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
16538c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci = NULL;
16548c2ecf20Sopenharmony_ci	unsigned char *map = NULL;
16558c2ecf20Sopenharmony_ci	int mapcount, swapcount = 0;
16568c2ecf20Sopenharmony_ci
16578c2ecf20Sopenharmony_ci	/* hugetlbfs shouldn't call it */
16588c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(PageHuge(page), page);
16598c2ecf20Sopenharmony_ci
16608c2ecf20Sopenharmony_ci	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
16618c2ecf20Sopenharmony_ci		mapcount = page_trans_huge_mapcount(page, total_mapcount);
16628c2ecf20Sopenharmony_ci		if (PageSwapCache(page))
16638c2ecf20Sopenharmony_ci			swapcount = page_swapcount(page);
16648c2ecf20Sopenharmony_ci		if (total_swapcount)
16658c2ecf20Sopenharmony_ci			*total_swapcount = swapcount;
16668c2ecf20Sopenharmony_ci		return mapcount + swapcount;
16678c2ecf20Sopenharmony_ci	}
16688c2ecf20Sopenharmony_ci
16698c2ecf20Sopenharmony_ci	page = compound_head(page);
16708c2ecf20Sopenharmony_ci
16718c2ecf20Sopenharmony_ci	_total_mapcount = _total_swapcount = map_swapcount = 0;
16728c2ecf20Sopenharmony_ci	if (PageSwapCache(page)) {
16738c2ecf20Sopenharmony_ci		swp_entry_t entry;
16748c2ecf20Sopenharmony_ci
16758c2ecf20Sopenharmony_ci		entry.val = page_private(page);
16768c2ecf20Sopenharmony_ci		si = _swap_info_get(entry);
16778c2ecf20Sopenharmony_ci		if (si) {
16788c2ecf20Sopenharmony_ci			map = si->swap_map;
16798c2ecf20Sopenharmony_ci			offset = swp_offset(entry);
16808c2ecf20Sopenharmony_ci		}
16818c2ecf20Sopenharmony_ci	}
16828c2ecf20Sopenharmony_ci	if (map)
16838c2ecf20Sopenharmony_ci		ci = lock_cluster(si, offset);
16848c2ecf20Sopenharmony_ci	for (i = 0; i < HPAGE_PMD_NR; i++) {
16858c2ecf20Sopenharmony_ci		mapcount = atomic_read(&page[i]._mapcount) + 1;
16868c2ecf20Sopenharmony_ci		_total_mapcount += mapcount;
16878c2ecf20Sopenharmony_ci		if (map) {
16888c2ecf20Sopenharmony_ci			swapcount = swap_count(map[offset + i]);
16898c2ecf20Sopenharmony_ci			_total_swapcount += swapcount;
16908c2ecf20Sopenharmony_ci		}
16918c2ecf20Sopenharmony_ci		map_swapcount = max(map_swapcount, mapcount + swapcount);
16928c2ecf20Sopenharmony_ci	}
16938c2ecf20Sopenharmony_ci	unlock_cluster(ci);
16948c2ecf20Sopenharmony_ci	if (PageDoubleMap(page)) {
16958c2ecf20Sopenharmony_ci		map_swapcount -= 1;
16968c2ecf20Sopenharmony_ci		_total_mapcount -= HPAGE_PMD_NR;
16978c2ecf20Sopenharmony_ci	}
16988c2ecf20Sopenharmony_ci	mapcount = compound_mapcount(page);
16998c2ecf20Sopenharmony_ci	map_swapcount += mapcount;
17008c2ecf20Sopenharmony_ci	_total_mapcount += mapcount;
17018c2ecf20Sopenharmony_ci	if (total_mapcount)
17028c2ecf20Sopenharmony_ci		*total_mapcount = _total_mapcount;
17038c2ecf20Sopenharmony_ci	if (total_swapcount)
17048c2ecf20Sopenharmony_ci		*total_swapcount = _total_swapcount;
17058c2ecf20Sopenharmony_ci
17068c2ecf20Sopenharmony_ci	return map_swapcount;
17078c2ecf20Sopenharmony_ci}
17088c2ecf20Sopenharmony_ci
17098c2ecf20Sopenharmony_ci/*
17108c2ecf20Sopenharmony_ci * We can write to an anon page without COW if there are no other references
17118c2ecf20Sopenharmony_ci * to it.  And as a side-effect, free up its swap: because the old content
17128c2ecf20Sopenharmony_ci * on disk will never be read, and seeking back there to write new content
17138c2ecf20Sopenharmony_ci * later would only waste time away from clustering.
17148c2ecf20Sopenharmony_ci *
17158c2ecf20Sopenharmony_ci * NOTE: total_map_swapcount should not be relied upon by the caller if
17168c2ecf20Sopenharmony_ci * reuse_swap_page() returns false, but it may be always overwritten
17178c2ecf20Sopenharmony_ci * (see the other implementation for CONFIG_SWAP=n).
17188c2ecf20Sopenharmony_ci */
17198c2ecf20Sopenharmony_cibool reuse_swap_page(struct page *page, int *total_map_swapcount)
17208c2ecf20Sopenharmony_ci{
17218c2ecf20Sopenharmony_ci	int count, total_mapcount, total_swapcount;
17228c2ecf20Sopenharmony_ci
17238c2ecf20Sopenharmony_ci	VM_BUG_ON_PAGE(!PageLocked(page), page);
17248c2ecf20Sopenharmony_ci	if (unlikely(PageKsm(page)))
17258c2ecf20Sopenharmony_ci		return false;
17268c2ecf20Sopenharmony_ci	count = page_trans_huge_map_swapcount(page, &total_mapcount,
17278c2ecf20Sopenharmony_ci					      &total_swapcount);
17288c2ecf20Sopenharmony_ci	if (total_map_swapcount)
17298c2ecf20Sopenharmony_ci		*total_map_swapcount = total_mapcount + total_swapcount;
17308c2ecf20Sopenharmony_ci	if (count == 1 && PageSwapCache(page) &&
17318c2ecf20Sopenharmony_ci	    (likely(!PageTransCompound(page)) ||
17328c2ecf20Sopenharmony_ci	     /* The remaining swap count will be freed soon */
17338c2ecf20Sopenharmony_ci	     total_swapcount == page_swapcount(page))) {
17348c2ecf20Sopenharmony_ci		if (!PageWriteback(page)) {
17358c2ecf20Sopenharmony_ci			page = compound_head(page);
17368c2ecf20Sopenharmony_ci			delete_from_swap_cache(page);
17378c2ecf20Sopenharmony_ci			SetPageDirty(page);
17388c2ecf20Sopenharmony_ci		} else {
17398c2ecf20Sopenharmony_ci			swp_entry_t entry;
17408c2ecf20Sopenharmony_ci			struct swap_info_struct *p;
17418c2ecf20Sopenharmony_ci
17428c2ecf20Sopenharmony_ci			entry.val = page_private(page);
17438c2ecf20Sopenharmony_ci			p = swap_info_get(entry);
17448c2ecf20Sopenharmony_ci			if (p->flags & SWP_STABLE_WRITES) {
17458c2ecf20Sopenharmony_ci				spin_unlock(&p->lock);
17468c2ecf20Sopenharmony_ci				return false;
17478c2ecf20Sopenharmony_ci			}
17488c2ecf20Sopenharmony_ci			spin_unlock(&p->lock);
17498c2ecf20Sopenharmony_ci		}
17508c2ecf20Sopenharmony_ci	}
17518c2ecf20Sopenharmony_ci
17528c2ecf20Sopenharmony_ci	return count <= 1;
17538c2ecf20Sopenharmony_ci}
17548c2ecf20Sopenharmony_ci
/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 *
 * @page must be locked.  Returns 1 when the page was removed from the swap
 * cache (and marked dirty), 0 when it was left alone.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);

	/* Not in swap cache, still being written, or still referenced. */
	if (!PageSwapCache(page) || PageWriteback(page) || page_swapped(page))
		return 0;

	/*
	 * Once hibernation has begun to create its image of memory,
	 * there's a danger that one of the calls to try_to_free_swap()
	 * - most probably a call from __try_to_reclaim_swap() while
	 * hibernation is allocating its own swap pages for the image,
	 * but conceivably even a call from memory reclaim - will free
	 * the swap from a page which has already been recorded in the
	 * image as a clean swapcache page, and then reuse its swap for
	 * another page of the image.  On waking from hibernation, the
	 * original page might be freed under memory pressure, then
	 * later read back in from swap, now with the wrong data.
	 *
	 * Hibernation suspends storage while it is writing the image
	 * to disk so check that here.
	 */
	if (pm_suspended_storage())
		return 0;

	page = compound_head(page);
	delete_from_swap_cache(page);
	SetPageDirty(page);

	return 1;
}
17938c2ecf20Sopenharmony_ci
17948c2ecf20Sopenharmony_ci/*
17958c2ecf20Sopenharmony_ci * Free the swap entry like above, but also try to
17968c2ecf20Sopenharmony_ci * free the page cache entry if it is the last user.
17978c2ecf20Sopenharmony_ci */
17988c2ecf20Sopenharmony_ciint free_swap_and_cache(swp_entry_t entry)
17998c2ecf20Sopenharmony_ci{
18008c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
18018c2ecf20Sopenharmony_ci	unsigned char count;
18028c2ecf20Sopenharmony_ci
18038c2ecf20Sopenharmony_ci	if (non_swap_entry(entry))
18048c2ecf20Sopenharmony_ci		return 1;
18058c2ecf20Sopenharmony_ci
18068c2ecf20Sopenharmony_ci	p = get_swap_device(entry);
18078c2ecf20Sopenharmony_ci	if (p) {
18088c2ecf20Sopenharmony_ci		if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
18098c2ecf20Sopenharmony_ci			put_swap_device(p);
18108c2ecf20Sopenharmony_ci			return 0;
18118c2ecf20Sopenharmony_ci		}
18128c2ecf20Sopenharmony_ci
18138c2ecf20Sopenharmony_ci		count = __swap_entry_free(p, entry);
18148c2ecf20Sopenharmony_ci		if (count == SWAP_HAS_CACHE &&
18158c2ecf20Sopenharmony_ci		    !swap_page_trans_huge_swapped(p, entry))
18168c2ecf20Sopenharmony_ci			__try_to_reclaim_swap(p, swp_offset(entry),
18178c2ecf20Sopenharmony_ci					      TTRS_UNMAPPED | TTRS_FULL);
18188c2ecf20Sopenharmony_ci		put_swap_device(p);
18198c2ecf20Sopenharmony_ci	}
18208c2ecf20Sopenharmony_ci	return p != NULL;
18218c2ecf20Sopenharmony_ci}
18228c2ecf20Sopenharmony_ci
18238c2ecf20Sopenharmony_ci#ifdef CONFIG_HIBERNATION
18248c2ecf20Sopenharmony_ci/*
18258c2ecf20Sopenharmony_ci * Find the swap type that corresponds to given device (if any).
18268c2ecf20Sopenharmony_ci *
18278c2ecf20Sopenharmony_ci * @offset - number of the PAGE_SIZE-sized block of the device, starting
18288c2ecf20Sopenharmony_ci * from 0, in which the swap header is expected to be located.
18298c2ecf20Sopenharmony_ci *
18308c2ecf20Sopenharmony_ci * This is needed for the suspend to disk (aka swsusp).
18318c2ecf20Sopenharmony_ci */
18328c2ecf20Sopenharmony_ciint swap_type_of(dev_t device, sector_t offset)
18338c2ecf20Sopenharmony_ci{
18348c2ecf20Sopenharmony_ci	int type;
18358c2ecf20Sopenharmony_ci
18368c2ecf20Sopenharmony_ci	if (!device)
18378c2ecf20Sopenharmony_ci		return -1;
18388c2ecf20Sopenharmony_ci
18398c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
18408c2ecf20Sopenharmony_ci	for (type = 0; type < nr_swapfiles; type++) {
18418c2ecf20Sopenharmony_ci		struct swap_info_struct *sis = swap_info[type];
18428c2ecf20Sopenharmony_ci
18438c2ecf20Sopenharmony_ci		if (!(sis->flags & SWP_WRITEOK))
18448c2ecf20Sopenharmony_ci			continue;
18458c2ecf20Sopenharmony_ci
18468c2ecf20Sopenharmony_ci		if (device == sis->bdev->bd_dev) {
18478c2ecf20Sopenharmony_ci			struct swap_extent *se = first_se(sis);
18488c2ecf20Sopenharmony_ci
18498c2ecf20Sopenharmony_ci			if (se->start_block == offset) {
18508c2ecf20Sopenharmony_ci				spin_unlock(&swap_lock);
18518c2ecf20Sopenharmony_ci				return type;
18528c2ecf20Sopenharmony_ci			}
18538c2ecf20Sopenharmony_ci		}
18548c2ecf20Sopenharmony_ci	}
18558c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
18568c2ecf20Sopenharmony_ci	return -ENODEV;
18578c2ecf20Sopenharmony_ci}
18588c2ecf20Sopenharmony_ci
18598c2ecf20Sopenharmony_ciint find_first_swap(dev_t *device)
18608c2ecf20Sopenharmony_ci{
18618c2ecf20Sopenharmony_ci	int type;
18628c2ecf20Sopenharmony_ci
18638c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
18648c2ecf20Sopenharmony_ci	for (type = 0; type < nr_swapfiles; type++) {
18658c2ecf20Sopenharmony_ci		struct swap_info_struct *sis = swap_info[type];
18668c2ecf20Sopenharmony_ci
18678c2ecf20Sopenharmony_ci		if (!(sis->flags & SWP_WRITEOK))
18688c2ecf20Sopenharmony_ci			continue;
18698c2ecf20Sopenharmony_ci		*device = sis->bdev->bd_dev;
18708c2ecf20Sopenharmony_ci		spin_unlock(&swap_lock);
18718c2ecf20Sopenharmony_ci		return type;
18728c2ecf20Sopenharmony_ci	}
18738c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
18748c2ecf20Sopenharmony_ci	return -ENODEV;
18758c2ecf20Sopenharmony_ci}
18768c2ecf20Sopenharmony_ci
18778c2ecf20Sopenharmony_ci/*
18788c2ecf20Sopenharmony_ci * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
18798c2ecf20Sopenharmony_ci * corresponding to given index in swap_info (swap type).
18808c2ecf20Sopenharmony_ci */
18818c2ecf20Sopenharmony_cisector_t swapdev_block(int type, pgoff_t offset)
18828c2ecf20Sopenharmony_ci{
18838c2ecf20Sopenharmony_ci	struct block_device *bdev;
18848c2ecf20Sopenharmony_ci	struct swap_info_struct *si = swap_type_to_swap_info(type);
18858c2ecf20Sopenharmony_ci
18868c2ecf20Sopenharmony_ci	if (!si || !(si->flags & SWP_WRITEOK))
18878c2ecf20Sopenharmony_ci		return 0;
18888c2ecf20Sopenharmony_ci	return map_swap_entry(swp_entry(type, offset), &bdev);
18898c2ecf20Sopenharmony_ci}
18908c2ecf20Sopenharmony_ci
18918c2ecf20Sopenharmony_ci/*
18928c2ecf20Sopenharmony_ci * Return either the total number of swap pages of given type, or the number
18938c2ecf20Sopenharmony_ci * of free pages of that type (depending on @free)
18948c2ecf20Sopenharmony_ci *
18958c2ecf20Sopenharmony_ci * This is needed for software suspend
18968c2ecf20Sopenharmony_ci */
18978c2ecf20Sopenharmony_ciunsigned int count_swap_pages(int type, int free)
18988c2ecf20Sopenharmony_ci{
18998c2ecf20Sopenharmony_ci	unsigned int n = 0;
19008c2ecf20Sopenharmony_ci
19018c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
19028c2ecf20Sopenharmony_ci	if ((unsigned int)type < nr_swapfiles) {
19038c2ecf20Sopenharmony_ci		struct swap_info_struct *sis = swap_info[type];
19048c2ecf20Sopenharmony_ci
19058c2ecf20Sopenharmony_ci		spin_lock(&sis->lock);
19068c2ecf20Sopenharmony_ci		if (sis->flags & SWP_WRITEOK) {
19078c2ecf20Sopenharmony_ci			n = sis->pages;
19088c2ecf20Sopenharmony_ci			if (free)
19098c2ecf20Sopenharmony_ci				n -= sis->inuse_pages;
19108c2ecf20Sopenharmony_ci		}
19118c2ecf20Sopenharmony_ci		spin_unlock(&sis->lock);
19128c2ecf20Sopenharmony_ci	}
19138c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
19148c2ecf20Sopenharmony_ci	return n;
19158c2ecf20Sopenharmony_ci}
19168c2ecf20Sopenharmony_ci#endif /* CONFIG_HIBERNATION */
19178c2ecf20Sopenharmony_ci
19188c2ecf20Sopenharmony_cistatic inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
19198c2ecf20Sopenharmony_ci{
19208c2ecf20Sopenharmony_ci	return pte_same(pte_swp_clear_flags(pte), swp_pte);
19218c2ecf20Sopenharmony_ci}
19228c2ecf20Sopenharmony_ci
19238c2ecf20Sopenharmony_ci/*
19248c2ecf20Sopenharmony_ci * No need to decide whether this PTE shares the swap entry with others,
19258c2ecf20Sopenharmony_ci * just let do_wp_page work it out if a write is requested later - to
19268c2ecf20Sopenharmony_ci * force COW, vm_page_prot omits write permission from any private vma.
19278c2ecf20Sopenharmony_ci */
19288c2ecf20Sopenharmony_cistatic int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
19298c2ecf20Sopenharmony_ci		unsigned long addr, swp_entry_t entry, struct page *page)
19308c2ecf20Sopenharmony_ci{
19318c2ecf20Sopenharmony_ci	struct page *swapcache;
19328c2ecf20Sopenharmony_ci	spinlock_t *ptl;
19338c2ecf20Sopenharmony_ci	pte_t *pte;
19348c2ecf20Sopenharmony_ci	int ret = 1;
19358c2ecf20Sopenharmony_ci
19368c2ecf20Sopenharmony_ci	swapcache = page;
19378c2ecf20Sopenharmony_ci	page = ksm_might_need_to_copy(page, vma, addr);
19388c2ecf20Sopenharmony_ci	if (unlikely(!page))
19398c2ecf20Sopenharmony_ci		return -ENOMEM;
19408c2ecf20Sopenharmony_ci
19418c2ecf20Sopenharmony_ci	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
19428c2ecf20Sopenharmony_ci	if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
19438c2ecf20Sopenharmony_ci		ret = 0;
19448c2ecf20Sopenharmony_ci		goto out;
19458c2ecf20Sopenharmony_ci	}
19468c2ecf20Sopenharmony_ci
19478c2ecf20Sopenharmony_ci	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
19488c2ecf20Sopenharmony_ci	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
19498c2ecf20Sopenharmony_ci	get_page(page);
19508c2ecf20Sopenharmony_ci	set_pte_at(vma->vm_mm, addr, pte,
19518c2ecf20Sopenharmony_ci		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
19528c2ecf20Sopenharmony_ci	if (page == swapcache) {
19538c2ecf20Sopenharmony_ci		page_add_anon_rmap(page, vma, addr, false);
19548c2ecf20Sopenharmony_ci	} else { /* ksm created a completely new copy */
19558c2ecf20Sopenharmony_ci		page_add_new_anon_rmap(page, vma, addr, false);
19568c2ecf20Sopenharmony_ci		lru_cache_add_inactive_or_unevictable(page, vma);
19578c2ecf20Sopenharmony_ci	}
19588c2ecf20Sopenharmony_ci	swap_free(entry);
19598c2ecf20Sopenharmony_ciout:
19608c2ecf20Sopenharmony_ci	pte_unmap_unlock(pte, ptl);
19618c2ecf20Sopenharmony_ci	if (page != swapcache) {
19628c2ecf20Sopenharmony_ci		unlock_page(page);
19638c2ecf20Sopenharmony_ci		put_page(page);
19648c2ecf20Sopenharmony_ci	}
19658c2ecf20Sopenharmony_ci	return ret;
19668c2ecf20Sopenharmony_ci}
19678c2ecf20Sopenharmony_ci
19688c2ecf20Sopenharmony_cistatic int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
19698c2ecf20Sopenharmony_ci			unsigned long addr, unsigned long end,
19708c2ecf20Sopenharmony_ci			unsigned int type, bool frontswap,
19718c2ecf20Sopenharmony_ci			unsigned long *fs_pages_to_unuse)
19728c2ecf20Sopenharmony_ci{
19738c2ecf20Sopenharmony_ci	struct page *page;
19748c2ecf20Sopenharmony_ci	swp_entry_t entry;
19758c2ecf20Sopenharmony_ci	pte_t *pte;
19768c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
19778c2ecf20Sopenharmony_ci	unsigned long offset;
19788c2ecf20Sopenharmony_ci	int ret = 0;
19798c2ecf20Sopenharmony_ci	volatile unsigned char *swap_map;
19808c2ecf20Sopenharmony_ci
19818c2ecf20Sopenharmony_ci	si = swap_info[type];
19828c2ecf20Sopenharmony_ci	pte = pte_offset_map(pmd, addr);
19838c2ecf20Sopenharmony_ci	do {
19848c2ecf20Sopenharmony_ci		struct vm_fault vmf;
19858c2ecf20Sopenharmony_ci
19868c2ecf20Sopenharmony_ci		if (!is_swap_pte(*pte))
19878c2ecf20Sopenharmony_ci			continue;
19888c2ecf20Sopenharmony_ci
19898c2ecf20Sopenharmony_ci		entry = pte_to_swp_entry(*pte);
19908c2ecf20Sopenharmony_ci		if (swp_type(entry) != type)
19918c2ecf20Sopenharmony_ci			continue;
19928c2ecf20Sopenharmony_ci
19938c2ecf20Sopenharmony_ci		offset = swp_offset(entry);
19948c2ecf20Sopenharmony_ci		if (frontswap && !frontswap_test(si, offset))
19958c2ecf20Sopenharmony_ci			continue;
19968c2ecf20Sopenharmony_ci
19978c2ecf20Sopenharmony_ci		pte_unmap(pte);
19988c2ecf20Sopenharmony_ci		swap_map = &si->swap_map[offset];
19998c2ecf20Sopenharmony_ci		page = lookup_swap_cache(entry, vma, addr);
20008c2ecf20Sopenharmony_ci		if (!page) {
20018c2ecf20Sopenharmony_ci			vmf.vma = vma;
20028c2ecf20Sopenharmony_ci			vmf.address = addr;
20038c2ecf20Sopenharmony_ci			vmf.pmd = pmd;
20048c2ecf20Sopenharmony_ci			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
20058c2ecf20Sopenharmony_ci						&vmf);
20068c2ecf20Sopenharmony_ci		}
20078c2ecf20Sopenharmony_ci		if (!page) {
20088c2ecf20Sopenharmony_ci			if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
20098c2ecf20Sopenharmony_ci				goto try_next;
20108c2ecf20Sopenharmony_ci			return -ENOMEM;
20118c2ecf20Sopenharmony_ci		}
20128c2ecf20Sopenharmony_ci
20138c2ecf20Sopenharmony_ci		lock_page(page);
20148c2ecf20Sopenharmony_ci		wait_on_page_writeback(page);
20158c2ecf20Sopenharmony_ci		ret = unuse_pte(vma, pmd, addr, entry, page);
20168c2ecf20Sopenharmony_ci		if (ret < 0) {
20178c2ecf20Sopenharmony_ci			unlock_page(page);
20188c2ecf20Sopenharmony_ci			put_page(page);
20198c2ecf20Sopenharmony_ci			goto out;
20208c2ecf20Sopenharmony_ci		}
20218c2ecf20Sopenharmony_ci
20228c2ecf20Sopenharmony_ci		try_to_free_swap(page);
20238c2ecf20Sopenharmony_ci		unlock_page(page);
20248c2ecf20Sopenharmony_ci		put_page(page);
20258c2ecf20Sopenharmony_ci
20268c2ecf20Sopenharmony_ci		if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
20278c2ecf20Sopenharmony_ci			ret = FRONTSWAP_PAGES_UNUSED;
20288c2ecf20Sopenharmony_ci			goto out;
20298c2ecf20Sopenharmony_ci		}
20308c2ecf20Sopenharmony_citry_next:
20318c2ecf20Sopenharmony_ci		pte = pte_offset_map(pmd, addr);
20328c2ecf20Sopenharmony_ci	} while (pte++, addr += PAGE_SIZE, addr != end);
20338c2ecf20Sopenharmony_ci	pte_unmap(pte - 1);
20348c2ecf20Sopenharmony_ci
20358c2ecf20Sopenharmony_ci	ret = 0;
20368c2ecf20Sopenharmony_ciout:
20378c2ecf20Sopenharmony_ci	return ret;
20388c2ecf20Sopenharmony_ci}
20398c2ecf20Sopenharmony_ci
20408c2ecf20Sopenharmony_cistatic inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
20418c2ecf20Sopenharmony_ci				unsigned long addr, unsigned long end,
20428c2ecf20Sopenharmony_ci				unsigned int type, bool frontswap,
20438c2ecf20Sopenharmony_ci				unsigned long *fs_pages_to_unuse)
20448c2ecf20Sopenharmony_ci{
20458c2ecf20Sopenharmony_ci	pmd_t *pmd;
20468c2ecf20Sopenharmony_ci	unsigned long next;
20478c2ecf20Sopenharmony_ci	int ret;
20488c2ecf20Sopenharmony_ci
20498c2ecf20Sopenharmony_ci	pmd = pmd_offset(pud, addr);
20508c2ecf20Sopenharmony_ci	do {
20518c2ecf20Sopenharmony_ci		cond_resched();
20528c2ecf20Sopenharmony_ci		next = pmd_addr_end(addr, end);
20538c2ecf20Sopenharmony_ci		if (pmd_none_or_trans_huge_or_clear_bad(pmd))
20548c2ecf20Sopenharmony_ci			continue;
20558c2ecf20Sopenharmony_ci		ret = unuse_pte_range(vma, pmd, addr, next, type,
20568c2ecf20Sopenharmony_ci				      frontswap, fs_pages_to_unuse);
20578c2ecf20Sopenharmony_ci		if (ret)
20588c2ecf20Sopenharmony_ci			return ret;
20598c2ecf20Sopenharmony_ci	} while (pmd++, addr = next, addr != end);
20608c2ecf20Sopenharmony_ci	return 0;
20618c2ecf20Sopenharmony_ci}
20628c2ecf20Sopenharmony_ci
20638c2ecf20Sopenharmony_cistatic inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
20648c2ecf20Sopenharmony_ci				unsigned long addr, unsigned long end,
20658c2ecf20Sopenharmony_ci				unsigned int type, bool frontswap,
20668c2ecf20Sopenharmony_ci				unsigned long *fs_pages_to_unuse)
20678c2ecf20Sopenharmony_ci{
20688c2ecf20Sopenharmony_ci	pud_t *pud;
20698c2ecf20Sopenharmony_ci	unsigned long next;
20708c2ecf20Sopenharmony_ci	int ret;
20718c2ecf20Sopenharmony_ci
20728c2ecf20Sopenharmony_ci	pud = pud_offset(p4d, addr);
20738c2ecf20Sopenharmony_ci	do {
20748c2ecf20Sopenharmony_ci		next = pud_addr_end(addr, end);
20758c2ecf20Sopenharmony_ci		if (pud_none_or_clear_bad(pud))
20768c2ecf20Sopenharmony_ci			continue;
20778c2ecf20Sopenharmony_ci		ret = unuse_pmd_range(vma, pud, addr, next, type,
20788c2ecf20Sopenharmony_ci				      frontswap, fs_pages_to_unuse);
20798c2ecf20Sopenharmony_ci		if (ret)
20808c2ecf20Sopenharmony_ci			return ret;
20818c2ecf20Sopenharmony_ci	} while (pud++, addr = next, addr != end);
20828c2ecf20Sopenharmony_ci	return 0;
20838c2ecf20Sopenharmony_ci}
20848c2ecf20Sopenharmony_ci
20858c2ecf20Sopenharmony_cistatic inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
20868c2ecf20Sopenharmony_ci				unsigned long addr, unsigned long end,
20878c2ecf20Sopenharmony_ci				unsigned int type, bool frontswap,
20888c2ecf20Sopenharmony_ci				unsigned long *fs_pages_to_unuse)
20898c2ecf20Sopenharmony_ci{
20908c2ecf20Sopenharmony_ci	p4d_t *p4d;
20918c2ecf20Sopenharmony_ci	unsigned long next;
20928c2ecf20Sopenharmony_ci	int ret;
20938c2ecf20Sopenharmony_ci
20948c2ecf20Sopenharmony_ci	p4d = p4d_offset(pgd, addr);
20958c2ecf20Sopenharmony_ci	do {
20968c2ecf20Sopenharmony_ci		next = p4d_addr_end(addr, end);
20978c2ecf20Sopenharmony_ci		if (p4d_none_or_clear_bad(p4d))
20988c2ecf20Sopenharmony_ci			continue;
20998c2ecf20Sopenharmony_ci		ret = unuse_pud_range(vma, p4d, addr, next, type,
21008c2ecf20Sopenharmony_ci				      frontswap, fs_pages_to_unuse);
21018c2ecf20Sopenharmony_ci		if (ret)
21028c2ecf20Sopenharmony_ci			return ret;
21038c2ecf20Sopenharmony_ci	} while (p4d++, addr = next, addr != end);
21048c2ecf20Sopenharmony_ci	return 0;
21058c2ecf20Sopenharmony_ci}
21068c2ecf20Sopenharmony_ci
21078c2ecf20Sopenharmony_cistatic int unuse_vma(struct vm_area_struct *vma, unsigned int type,
21088c2ecf20Sopenharmony_ci		     bool frontswap, unsigned long *fs_pages_to_unuse)
21098c2ecf20Sopenharmony_ci{
21108c2ecf20Sopenharmony_ci	pgd_t *pgd;
21118c2ecf20Sopenharmony_ci	unsigned long addr, end, next;
21128c2ecf20Sopenharmony_ci	int ret;
21138c2ecf20Sopenharmony_ci
21148c2ecf20Sopenharmony_ci	addr = vma->vm_start;
21158c2ecf20Sopenharmony_ci	end = vma->vm_end;
21168c2ecf20Sopenharmony_ci
21178c2ecf20Sopenharmony_ci	pgd = pgd_offset(vma->vm_mm, addr);
21188c2ecf20Sopenharmony_ci	do {
21198c2ecf20Sopenharmony_ci		next = pgd_addr_end(addr, end);
21208c2ecf20Sopenharmony_ci		if (pgd_none_or_clear_bad(pgd))
21218c2ecf20Sopenharmony_ci			continue;
21228c2ecf20Sopenharmony_ci		ret = unuse_p4d_range(vma, pgd, addr, next, type,
21238c2ecf20Sopenharmony_ci				      frontswap, fs_pages_to_unuse);
21248c2ecf20Sopenharmony_ci		if (ret)
21258c2ecf20Sopenharmony_ci			return ret;
21268c2ecf20Sopenharmony_ci	} while (pgd++, addr = next, addr != end);
21278c2ecf20Sopenharmony_ci	return 0;
21288c2ecf20Sopenharmony_ci}
21298c2ecf20Sopenharmony_ci
21308c2ecf20Sopenharmony_cistatic int unuse_mm(struct mm_struct *mm, unsigned int type,
21318c2ecf20Sopenharmony_ci		    bool frontswap, unsigned long *fs_pages_to_unuse)
21328c2ecf20Sopenharmony_ci{
21338c2ecf20Sopenharmony_ci	struct vm_area_struct *vma;
21348c2ecf20Sopenharmony_ci	int ret = 0;
21358c2ecf20Sopenharmony_ci
21368c2ecf20Sopenharmony_ci	mmap_read_lock(mm);
21378c2ecf20Sopenharmony_ci	for (vma = mm->mmap; vma; vma = vma->vm_next) {
21388c2ecf20Sopenharmony_ci		if (vma->anon_vma) {
21398c2ecf20Sopenharmony_ci			ret = unuse_vma(vma, type, frontswap,
21408c2ecf20Sopenharmony_ci					fs_pages_to_unuse);
21418c2ecf20Sopenharmony_ci			if (ret)
21428c2ecf20Sopenharmony_ci				break;
21438c2ecf20Sopenharmony_ci		}
21448c2ecf20Sopenharmony_ci		cond_resched();
21458c2ecf20Sopenharmony_ci	}
21468c2ecf20Sopenharmony_ci	mmap_read_unlock(mm);
21478c2ecf20Sopenharmony_ci	return ret;
21488c2ecf20Sopenharmony_ci}
21498c2ecf20Sopenharmony_ci
21508c2ecf20Sopenharmony_ci/*
21518c2ecf20Sopenharmony_ci * Scan swap_map (or frontswap_map if frontswap parameter is true)
21528c2ecf20Sopenharmony_ci * from current position to next entry still in use. Return 0
21538c2ecf20Sopenharmony_ci * if there are no inuse entries after prev till end of the map.
21548c2ecf20Sopenharmony_ci */
21558c2ecf20Sopenharmony_cistatic unsigned int find_next_to_unuse(struct swap_info_struct *si,
21568c2ecf20Sopenharmony_ci					unsigned int prev, bool frontswap)
21578c2ecf20Sopenharmony_ci{
21588c2ecf20Sopenharmony_ci	unsigned int i;
21598c2ecf20Sopenharmony_ci	unsigned char count;
21608c2ecf20Sopenharmony_ci
21618c2ecf20Sopenharmony_ci	/*
21628c2ecf20Sopenharmony_ci	 * No need for swap_lock here: we're just looking
21638c2ecf20Sopenharmony_ci	 * for whether an entry is in use, not modifying it; false
21648c2ecf20Sopenharmony_ci	 * hits are okay, and sys_swapoff() has already prevented new
21658c2ecf20Sopenharmony_ci	 * allocations from this area (while holding swap_lock).
21668c2ecf20Sopenharmony_ci	 */
21678c2ecf20Sopenharmony_ci	for (i = prev + 1; i < si->max; i++) {
21688c2ecf20Sopenharmony_ci		count = READ_ONCE(si->swap_map[i]);
21698c2ecf20Sopenharmony_ci		if (count && swap_count(count) != SWAP_MAP_BAD)
21708c2ecf20Sopenharmony_ci			if (!frontswap || frontswap_test(si, i))
21718c2ecf20Sopenharmony_ci				break;
21728c2ecf20Sopenharmony_ci		if ((i % LATENCY_LIMIT) == 0)
21738c2ecf20Sopenharmony_ci			cond_resched();
21748c2ecf20Sopenharmony_ci	}
21758c2ecf20Sopenharmony_ci
21768c2ecf20Sopenharmony_ci	if (i == si->max)
21778c2ecf20Sopenharmony_ci		i = 0;
21788c2ecf20Sopenharmony_ci
21798c2ecf20Sopenharmony_ci	return i;
21808c2ecf20Sopenharmony_ci}
21818c2ecf20Sopenharmony_ci
21828c2ecf20Sopenharmony_ci/*
21838c2ecf20Sopenharmony_ci * If the boolean frontswap is true, only unuse pages_to_unuse pages;
21848c2ecf20Sopenharmony_ci * pages_to_unuse==0 means all pages; ignored if frontswap is false
21858c2ecf20Sopenharmony_ci */
21868c2ecf20Sopenharmony_ciint try_to_unuse(unsigned int type, bool frontswap,
21878c2ecf20Sopenharmony_ci		 unsigned long pages_to_unuse)
21888c2ecf20Sopenharmony_ci{
21898c2ecf20Sopenharmony_ci	struct mm_struct *prev_mm;
21908c2ecf20Sopenharmony_ci	struct mm_struct *mm;
21918c2ecf20Sopenharmony_ci	struct list_head *p;
21928c2ecf20Sopenharmony_ci	int retval = 0;
21938c2ecf20Sopenharmony_ci	struct swap_info_struct *si = swap_info[type];
21948c2ecf20Sopenharmony_ci	struct page *page;
21958c2ecf20Sopenharmony_ci	swp_entry_t entry;
21968c2ecf20Sopenharmony_ci	unsigned int i;
21978c2ecf20Sopenharmony_ci
21988c2ecf20Sopenharmony_ci	if (!READ_ONCE(si->inuse_pages))
21998c2ecf20Sopenharmony_ci		return 0;
22008c2ecf20Sopenharmony_ci
22018c2ecf20Sopenharmony_ci	if (!frontswap)
22028c2ecf20Sopenharmony_ci		pages_to_unuse = 0;
22038c2ecf20Sopenharmony_ci
22048c2ecf20Sopenharmony_ciretry:
22058c2ecf20Sopenharmony_ci	retval = shmem_unuse(type, frontswap, &pages_to_unuse);
22068c2ecf20Sopenharmony_ci	if (retval)
22078c2ecf20Sopenharmony_ci		goto out;
22088c2ecf20Sopenharmony_ci
22098c2ecf20Sopenharmony_ci	prev_mm = &init_mm;
22108c2ecf20Sopenharmony_ci	mmget(prev_mm);
22118c2ecf20Sopenharmony_ci
22128c2ecf20Sopenharmony_ci	spin_lock(&mmlist_lock);
22138c2ecf20Sopenharmony_ci	p = &init_mm.mmlist;
22148c2ecf20Sopenharmony_ci	while (READ_ONCE(si->inuse_pages) &&
22158c2ecf20Sopenharmony_ci	       !signal_pending(current) &&
22168c2ecf20Sopenharmony_ci	       (p = p->next) != &init_mm.mmlist) {
22178c2ecf20Sopenharmony_ci
22188c2ecf20Sopenharmony_ci		mm = list_entry(p, struct mm_struct, mmlist);
22198c2ecf20Sopenharmony_ci		if (!mmget_not_zero(mm))
22208c2ecf20Sopenharmony_ci			continue;
22218c2ecf20Sopenharmony_ci		spin_unlock(&mmlist_lock);
22228c2ecf20Sopenharmony_ci		mmput(prev_mm);
22238c2ecf20Sopenharmony_ci		prev_mm = mm;
22248c2ecf20Sopenharmony_ci		retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
22258c2ecf20Sopenharmony_ci
22268c2ecf20Sopenharmony_ci		if (retval) {
22278c2ecf20Sopenharmony_ci			mmput(prev_mm);
22288c2ecf20Sopenharmony_ci			goto out;
22298c2ecf20Sopenharmony_ci		}
22308c2ecf20Sopenharmony_ci
22318c2ecf20Sopenharmony_ci		/*
22328c2ecf20Sopenharmony_ci		 * Make sure that we aren't completely killing
22338c2ecf20Sopenharmony_ci		 * interactive performance.
22348c2ecf20Sopenharmony_ci		 */
22358c2ecf20Sopenharmony_ci		cond_resched();
22368c2ecf20Sopenharmony_ci		spin_lock(&mmlist_lock);
22378c2ecf20Sopenharmony_ci	}
22388c2ecf20Sopenharmony_ci	spin_unlock(&mmlist_lock);
22398c2ecf20Sopenharmony_ci
22408c2ecf20Sopenharmony_ci	mmput(prev_mm);
22418c2ecf20Sopenharmony_ci
22428c2ecf20Sopenharmony_ci	i = 0;
22438c2ecf20Sopenharmony_ci	while (READ_ONCE(si->inuse_pages) &&
22448c2ecf20Sopenharmony_ci	       !signal_pending(current) &&
22458c2ecf20Sopenharmony_ci	       (i = find_next_to_unuse(si, i, frontswap)) != 0) {
22468c2ecf20Sopenharmony_ci
22478c2ecf20Sopenharmony_ci		entry = swp_entry(type, i);
22488c2ecf20Sopenharmony_ci		page = find_get_page(swap_address_space(entry), i);
22498c2ecf20Sopenharmony_ci		if (!page)
22508c2ecf20Sopenharmony_ci			continue;
22518c2ecf20Sopenharmony_ci
22528c2ecf20Sopenharmony_ci		/*
22538c2ecf20Sopenharmony_ci		 * It is conceivable that a racing task removed this page from
22548c2ecf20Sopenharmony_ci		 * swap cache just before we acquired the page lock. The page
22558c2ecf20Sopenharmony_ci		 * might even be back in swap cache on another swap area. But
22568c2ecf20Sopenharmony_ci		 * that is okay, try_to_free_swap() only removes stale pages.
22578c2ecf20Sopenharmony_ci		 */
22588c2ecf20Sopenharmony_ci		lock_page(page);
22598c2ecf20Sopenharmony_ci		wait_on_page_writeback(page);
22608c2ecf20Sopenharmony_ci		try_to_free_swap(page);
22618c2ecf20Sopenharmony_ci		unlock_page(page);
22628c2ecf20Sopenharmony_ci		put_page(page);
22638c2ecf20Sopenharmony_ci
22648c2ecf20Sopenharmony_ci		/*
22658c2ecf20Sopenharmony_ci		 * For frontswap, we just need to unuse pages_to_unuse, if
22668c2ecf20Sopenharmony_ci		 * it was specified. Need not check frontswap again here as
22678c2ecf20Sopenharmony_ci		 * we already zeroed out pages_to_unuse if not frontswap.
22688c2ecf20Sopenharmony_ci		 */
22698c2ecf20Sopenharmony_ci		if (pages_to_unuse && --pages_to_unuse == 0)
22708c2ecf20Sopenharmony_ci			goto out;
22718c2ecf20Sopenharmony_ci	}
22728c2ecf20Sopenharmony_ci
22738c2ecf20Sopenharmony_ci	/*
22748c2ecf20Sopenharmony_ci	 * Lets check again to see if there are still swap entries in the map.
22758c2ecf20Sopenharmony_ci	 * If yes, we would need to do retry the unuse logic again.
22768c2ecf20Sopenharmony_ci	 * Under global memory pressure, swap entries can be reinserted back
22778c2ecf20Sopenharmony_ci	 * into process space after the mmlist loop above passes over them.
22788c2ecf20Sopenharmony_ci	 *
22798c2ecf20Sopenharmony_ci	 * Limit the number of retries? No: when mmget_not_zero() above fails,
22808c2ecf20Sopenharmony_ci	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
22818c2ecf20Sopenharmony_ci	 * at its own independent pace; and even shmem_writepage() could have
22828c2ecf20Sopenharmony_ci	 * been preempted after get_swap_page(), temporarily hiding that swap.
22838c2ecf20Sopenharmony_ci	 * It's easy and robust (though cpu-intensive) just to keep retrying.
22848c2ecf20Sopenharmony_ci	 */
22858c2ecf20Sopenharmony_ci	if (READ_ONCE(si->inuse_pages)) {
22868c2ecf20Sopenharmony_ci		if (!signal_pending(current))
22878c2ecf20Sopenharmony_ci			goto retry;
22888c2ecf20Sopenharmony_ci		retval = -EINTR;
22898c2ecf20Sopenharmony_ci	}
22908c2ecf20Sopenharmony_ciout:
22918c2ecf20Sopenharmony_ci	return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
22928c2ecf20Sopenharmony_ci}
22938c2ecf20Sopenharmony_ci
22948c2ecf20Sopenharmony_ci/*
22958c2ecf20Sopenharmony_ci * After a successful try_to_unuse, if no swap is now in use, we know
22968c2ecf20Sopenharmony_ci * we can empty the mmlist.  swap_lock must be held on entry and exit.
22978c2ecf20Sopenharmony_ci * Note that mmlist_lock nests inside swap_lock, and an mm must be
22988c2ecf20Sopenharmony_ci * added to the mmlist just after page_duplicate - before would be racy.
22998c2ecf20Sopenharmony_ci */
23008c2ecf20Sopenharmony_cistatic void drain_mmlist(void)
23018c2ecf20Sopenharmony_ci{
23028c2ecf20Sopenharmony_ci	struct list_head *p, *next;
23038c2ecf20Sopenharmony_ci	unsigned int type;
23048c2ecf20Sopenharmony_ci
23058c2ecf20Sopenharmony_ci	for (type = 0; type < nr_swapfiles; type++)
23068c2ecf20Sopenharmony_ci		if (swap_info[type]->inuse_pages)
23078c2ecf20Sopenharmony_ci			return;
23088c2ecf20Sopenharmony_ci	spin_lock(&mmlist_lock);
23098c2ecf20Sopenharmony_ci	list_for_each_safe(p, next, &init_mm.mmlist)
23108c2ecf20Sopenharmony_ci		list_del_init(p);
23118c2ecf20Sopenharmony_ci	spin_unlock(&mmlist_lock);
23128c2ecf20Sopenharmony_ci}
23138c2ecf20Sopenharmony_ci
23148c2ecf20Sopenharmony_ci/*
23158c2ecf20Sopenharmony_ci * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
23168c2ecf20Sopenharmony_ci * corresponds to page offset for the specified swap entry.
23178c2ecf20Sopenharmony_ci * Note that the type of this function is sector_t, but it returns page offset
23188c2ecf20Sopenharmony_ci * into the bdev, not sector offset.
23198c2ecf20Sopenharmony_ci */
23208c2ecf20Sopenharmony_cistatic sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
23218c2ecf20Sopenharmony_ci{
23228c2ecf20Sopenharmony_ci	struct swap_info_struct *sis;
23238c2ecf20Sopenharmony_ci	struct swap_extent *se;
23248c2ecf20Sopenharmony_ci	pgoff_t offset;
23258c2ecf20Sopenharmony_ci
23268c2ecf20Sopenharmony_ci	sis = swp_swap_info(entry);
23278c2ecf20Sopenharmony_ci	*bdev = sis->bdev;
23288c2ecf20Sopenharmony_ci
23298c2ecf20Sopenharmony_ci	offset = swp_offset(entry);
23308c2ecf20Sopenharmony_ci	se = offset_to_swap_extent(sis, offset);
23318c2ecf20Sopenharmony_ci	return se->start_block + (offset - se->start_page);
23328c2ecf20Sopenharmony_ci}
23338c2ecf20Sopenharmony_ci
23348c2ecf20Sopenharmony_ci/*
23358c2ecf20Sopenharmony_ci * Returns the page offset into bdev for the specified page's swap entry.
23368c2ecf20Sopenharmony_ci */
23378c2ecf20Sopenharmony_cisector_t map_swap_page(struct page *page, struct block_device **bdev)
23388c2ecf20Sopenharmony_ci{
23398c2ecf20Sopenharmony_ci	swp_entry_t entry;
23408c2ecf20Sopenharmony_ci	entry.val = page_private(page);
23418c2ecf20Sopenharmony_ci	return map_swap_entry(entry, bdev);
23428c2ecf20Sopenharmony_ci}
23438c2ecf20Sopenharmony_ci
23448c2ecf20Sopenharmony_ci/*
23458c2ecf20Sopenharmony_ci * Free all of a swapdev's extent information
23468c2ecf20Sopenharmony_ci */
23478c2ecf20Sopenharmony_cistatic void destroy_swap_extents(struct swap_info_struct *sis)
23488c2ecf20Sopenharmony_ci{
23498c2ecf20Sopenharmony_ci	while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
23508c2ecf20Sopenharmony_ci		struct rb_node *rb = sis->swap_extent_root.rb_node;
23518c2ecf20Sopenharmony_ci		struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
23528c2ecf20Sopenharmony_ci
23538c2ecf20Sopenharmony_ci		rb_erase(rb, &sis->swap_extent_root);
23548c2ecf20Sopenharmony_ci		kfree(se);
23558c2ecf20Sopenharmony_ci	}
23568c2ecf20Sopenharmony_ci
23578c2ecf20Sopenharmony_ci	if (sis->flags & SWP_ACTIVATED) {
23588c2ecf20Sopenharmony_ci		struct file *swap_file = sis->swap_file;
23598c2ecf20Sopenharmony_ci		struct address_space *mapping = swap_file->f_mapping;
23608c2ecf20Sopenharmony_ci
23618c2ecf20Sopenharmony_ci		sis->flags &= ~SWP_ACTIVATED;
23628c2ecf20Sopenharmony_ci		if (mapping->a_ops->swap_deactivate)
23638c2ecf20Sopenharmony_ci			mapping->a_ops->swap_deactivate(swap_file);
23648c2ecf20Sopenharmony_ci	}
23658c2ecf20Sopenharmony_ci}
23668c2ecf20Sopenharmony_ci
23678c2ecf20Sopenharmony_ci/*
23688c2ecf20Sopenharmony_ci * Add a block range (and the corresponding page range) into this swapdev's
23698c2ecf20Sopenharmony_ci * extent tree.
23708c2ecf20Sopenharmony_ci *
23718c2ecf20Sopenharmony_ci * This function rather assumes that it is called in ascending page order.
23728c2ecf20Sopenharmony_ci */
23738c2ecf20Sopenharmony_ciint
23748c2ecf20Sopenharmony_ciadd_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
23758c2ecf20Sopenharmony_ci		unsigned long nr_pages, sector_t start_block)
23768c2ecf20Sopenharmony_ci{
23778c2ecf20Sopenharmony_ci	struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
23788c2ecf20Sopenharmony_ci	struct swap_extent *se;
23798c2ecf20Sopenharmony_ci	struct swap_extent *new_se;
23808c2ecf20Sopenharmony_ci
23818c2ecf20Sopenharmony_ci	/*
23828c2ecf20Sopenharmony_ci	 * place the new node at the right most since the
23838c2ecf20Sopenharmony_ci	 * function is called in ascending page order.
23848c2ecf20Sopenharmony_ci	 */
23858c2ecf20Sopenharmony_ci	while (*link) {
23868c2ecf20Sopenharmony_ci		parent = *link;
23878c2ecf20Sopenharmony_ci		link = &parent->rb_right;
23888c2ecf20Sopenharmony_ci	}
23898c2ecf20Sopenharmony_ci
23908c2ecf20Sopenharmony_ci	if (parent) {
23918c2ecf20Sopenharmony_ci		se = rb_entry(parent, struct swap_extent, rb_node);
23928c2ecf20Sopenharmony_ci		BUG_ON(se->start_page + se->nr_pages != start_page);
23938c2ecf20Sopenharmony_ci		if (se->start_block + se->nr_pages == start_block) {
23948c2ecf20Sopenharmony_ci			/* Merge it */
23958c2ecf20Sopenharmony_ci			se->nr_pages += nr_pages;
23968c2ecf20Sopenharmony_ci			return 0;
23978c2ecf20Sopenharmony_ci		}
23988c2ecf20Sopenharmony_ci	}
23998c2ecf20Sopenharmony_ci
24008c2ecf20Sopenharmony_ci	/* No merge, insert a new extent. */
24018c2ecf20Sopenharmony_ci	new_se = kmalloc(sizeof(*se), GFP_KERNEL);
24028c2ecf20Sopenharmony_ci	if (new_se == NULL)
24038c2ecf20Sopenharmony_ci		return -ENOMEM;
24048c2ecf20Sopenharmony_ci	new_se->start_page = start_page;
24058c2ecf20Sopenharmony_ci	new_se->nr_pages = nr_pages;
24068c2ecf20Sopenharmony_ci	new_se->start_block = start_block;
24078c2ecf20Sopenharmony_ci
24088c2ecf20Sopenharmony_ci	rb_link_node(&new_se->rb_node, parent, link);
24098c2ecf20Sopenharmony_ci	rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
24108c2ecf20Sopenharmony_ci	return 1;
24118c2ecf20Sopenharmony_ci}
24128c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(add_swap_extent);
24138c2ecf20Sopenharmony_ci
/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  An ordered list of swap extents
 * is built at swapon time and is then used at swap_writepage/swap_readpage
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent list operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into an ordered extent list, in PAGE_SIZE chunks.  If
 * some stray blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which will corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the list.  To avoid much list walking, we cache the previous
 * search location in `curr_swap_extent', and start new searches from there.
 * This is extremely effective.  The average number of iterations in
 * map_swap_page() has been measured at about 0.3 per page.  - akpm.
 */
24448c2ecf20Sopenharmony_cistatic int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
24458c2ecf20Sopenharmony_ci{
24468c2ecf20Sopenharmony_ci	struct file *swap_file = sis->swap_file;
24478c2ecf20Sopenharmony_ci	struct address_space *mapping = swap_file->f_mapping;
24488c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
24498c2ecf20Sopenharmony_ci	int ret;
24508c2ecf20Sopenharmony_ci
24518c2ecf20Sopenharmony_ci	if (S_ISBLK(inode->i_mode)) {
24528c2ecf20Sopenharmony_ci		ret = add_swap_extent(sis, 0, sis->max, 0);
24538c2ecf20Sopenharmony_ci		*span = sis->pages;
24548c2ecf20Sopenharmony_ci		return ret;
24558c2ecf20Sopenharmony_ci	}
24568c2ecf20Sopenharmony_ci
24578c2ecf20Sopenharmony_ci	if (mapping->a_ops->swap_activate) {
24588c2ecf20Sopenharmony_ci		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
24598c2ecf20Sopenharmony_ci		if (ret >= 0)
24608c2ecf20Sopenharmony_ci			sis->flags |= SWP_ACTIVATED;
24618c2ecf20Sopenharmony_ci		if (!ret) {
24628c2ecf20Sopenharmony_ci			sis->flags |= SWP_FS_OPS;
24638c2ecf20Sopenharmony_ci			ret = add_swap_extent(sis, 0, sis->max, 0);
24648c2ecf20Sopenharmony_ci			*span = sis->pages;
24658c2ecf20Sopenharmony_ci		}
24668c2ecf20Sopenharmony_ci		return ret;
24678c2ecf20Sopenharmony_ci	}
24688c2ecf20Sopenharmony_ci
24698c2ecf20Sopenharmony_ci	return generic_swapfile_activate(sis, swap_file, span);
24708c2ecf20Sopenharmony_ci}
24718c2ecf20Sopenharmony_ci
24728c2ecf20Sopenharmony_cistatic int swap_node(struct swap_info_struct *p)
24738c2ecf20Sopenharmony_ci{
24748c2ecf20Sopenharmony_ci	struct block_device *bdev;
24758c2ecf20Sopenharmony_ci
24768c2ecf20Sopenharmony_ci	if (p->bdev)
24778c2ecf20Sopenharmony_ci		bdev = p->bdev;
24788c2ecf20Sopenharmony_ci	else
24798c2ecf20Sopenharmony_ci		bdev = p->swap_file->f_inode->i_sb->s_bdev;
24808c2ecf20Sopenharmony_ci
24818c2ecf20Sopenharmony_ci	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
24828c2ecf20Sopenharmony_ci}
24838c2ecf20Sopenharmony_ci
24848c2ecf20Sopenharmony_cistatic void setup_swap_info(struct swap_info_struct *p, int prio,
24858c2ecf20Sopenharmony_ci			    unsigned char *swap_map,
24868c2ecf20Sopenharmony_ci			    struct swap_cluster_info *cluster_info)
24878c2ecf20Sopenharmony_ci{
24888c2ecf20Sopenharmony_ci	int i;
24898c2ecf20Sopenharmony_ci
24908c2ecf20Sopenharmony_ci	if (prio >= 0)
24918c2ecf20Sopenharmony_ci		p->prio = prio;
24928c2ecf20Sopenharmony_ci	else
24938c2ecf20Sopenharmony_ci		p->prio = --least_priority;
24948c2ecf20Sopenharmony_ci	/*
24958c2ecf20Sopenharmony_ci	 * the plist prio is negated because plist ordering is
24968c2ecf20Sopenharmony_ci	 * low-to-high, while swap ordering is high-to-low
24978c2ecf20Sopenharmony_ci	 */
24988c2ecf20Sopenharmony_ci	p->list.prio = -p->prio;
24998c2ecf20Sopenharmony_ci	for_each_node(i) {
25008c2ecf20Sopenharmony_ci		if (p->prio >= 0)
25018c2ecf20Sopenharmony_ci			p->avail_lists[i].prio = -p->prio;
25028c2ecf20Sopenharmony_ci		else {
25038c2ecf20Sopenharmony_ci			if (swap_node(p) == i)
25048c2ecf20Sopenharmony_ci				p->avail_lists[i].prio = 1;
25058c2ecf20Sopenharmony_ci			else
25068c2ecf20Sopenharmony_ci				p->avail_lists[i].prio = -p->prio;
25078c2ecf20Sopenharmony_ci		}
25088c2ecf20Sopenharmony_ci	}
25098c2ecf20Sopenharmony_ci	p->swap_map = swap_map;
25108c2ecf20Sopenharmony_ci	p->cluster_info = cluster_info;
25118c2ecf20Sopenharmony_ci}
25128c2ecf20Sopenharmony_ci
25138c2ecf20Sopenharmony_cistatic void _enable_swap_info(struct swap_info_struct *p)
25148c2ecf20Sopenharmony_ci{
25158c2ecf20Sopenharmony_ci	p->flags |= SWP_WRITEOK | SWP_VALID;
25168c2ecf20Sopenharmony_ci	atomic_long_add(p->pages, &nr_swap_pages);
25178c2ecf20Sopenharmony_ci	total_swap_pages += p->pages;
25188c2ecf20Sopenharmony_ci
25198c2ecf20Sopenharmony_ci	assert_spin_locked(&swap_lock);
25208c2ecf20Sopenharmony_ci	/*
25218c2ecf20Sopenharmony_ci	 * both lists are plists, and thus priority ordered.
25228c2ecf20Sopenharmony_ci	 * swap_active_head needs to be priority ordered for swapoff(),
25238c2ecf20Sopenharmony_ci	 * which on removal of any swap_info_struct with an auto-assigned
25248c2ecf20Sopenharmony_ci	 * (i.e. negative) priority increments the auto-assigned priority
25258c2ecf20Sopenharmony_ci	 * of any lower-priority swap_info_structs.
25268c2ecf20Sopenharmony_ci	 * swap_avail_head needs to be priority ordered for get_swap_page(),
25278c2ecf20Sopenharmony_ci	 * which allocates swap pages from the highest available priority
25288c2ecf20Sopenharmony_ci	 * swap_info_struct.
25298c2ecf20Sopenharmony_ci	 */
25308c2ecf20Sopenharmony_ci	plist_add(&p->list, &swap_active_head);
25318c2ecf20Sopenharmony_ci	add_to_avail_list(p);
25328c2ecf20Sopenharmony_ci}
25338c2ecf20Sopenharmony_ci
25348c2ecf20Sopenharmony_cistatic void enable_swap_info(struct swap_info_struct *p, int prio,
25358c2ecf20Sopenharmony_ci				unsigned char *swap_map,
25368c2ecf20Sopenharmony_ci				struct swap_cluster_info *cluster_info,
25378c2ecf20Sopenharmony_ci				unsigned long *frontswap_map)
25388c2ecf20Sopenharmony_ci{
25398c2ecf20Sopenharmony_ci	frontswap_init(p->type, frontswap_map);
25408c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
25418c2ecf20Sopenharmony_ci	spin_lock(&p->lock);
25428c2ecf20Sopenharmony_ci	setup_swap_info(p, prio, swap_map, cluster_info);
25438c2ecf20Sopenharmony_ci	spin_unlock(&p->lock);
25448c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
25458c2ecf20Sopenharmony_ci	/*
25468c2ecf20Sopenharmony_ci	 * Guarantee swap_map, cluster_info, etc. fields are valid
25478c2ecf20Sopenharmony_ci	 * between get/put_swap_device() if SWP_VALID bit is set
25488c2ecf20Sopenharmony_ci	 */
25498c2ecf20Sopenharmony_ci	synchronize_rcu();
25508c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
25518c2ecf20Sopenharmony_ci	spin_lock(&p->lock);
25528c2ecf20Sopenharmony_ci	_enable_swap_info(p);
25538c2ecf20Sopenharmony_ci	spin_unlock(&p->lock);
25548c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
25558c2ecf20Sopenharmony_ci}
25568c2ecf20Sopenharmony_ci
25578c2ecf20Sopenharmony_cistatic void reinsert_swap_info(struct swap_info_struct *p)
25588c2ecf20Sopenharmony_ci{
25598c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
25608c2ecf20Sopenharmony_ci	spin_lock(&p->lock);
25618c2ecf20Sopenharmony_ci	setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
25628c2ecf20Sopenharmony_ci	_enable_swap_info(p);
25638c2ecf20Sopenharmony_ci	spin_unlock(&p->lock);
25648c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
25658c2ecf20Sopenharmony_ci}
25668c2ecf20Sopenharmony_ci
25678c2ecf20Sopenharmony_cibool has_usable_swap(void)
25688c2ecf20Sopenharmony_ci{
25698c2ecf20Sopenharmony_ci	bool ret = true;
25708c2ecf20Sopenharmony_ci
25718c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
25728c2ecf20Sopenharmony_ci	if (plist_head_empty(&swap_active_head))
25738c2ecf20Sopenharmony_ci		ret = false;
25748c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
25758c2ecf20Sopenharmony_ci	return ret;
25768c2ecf20Sopenharmony_ci}
25778c2ecf20Sopenharmony_ci
25788c2ecf20Sopenharmony_ciSYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
25798c2ecf20Sopenharmony_ci{
25808c2ecf20Sopenharmony_ci	struct swap_info_struct *p = NULL;
25818c2ecf20Sopenharmony_ci	unsigned char *swap_map;
25828c2ecf20Sopenharmony_ci	struct swap_cluster_info *cluster_info;
25838c2ecf20Sopenharmony_ci	unsigned long *frontswap_map;
25848c2ecf20Sopenharmony_ci	struct file *swap_file, *victim;
25858c2ecf20Sopenharmony_ci	struct address_space *mapping;
25868c2ecf20Sopenharmony_ci	struct inode *inode;
25878c2ecf20Sopenharmony_ci	struct filename *pathname;
25888c2ecf20Sopenharmony_ci	int err, found = 0;
25898c2ecf20Sopenharmony_ci	unsigned int old_block_size;
25908c2ecf20Sopenharmony_ci
25918c2ecf20Sopenharmony_ci	if (!capable(CAP_SYS_ADMIN))
25928c2ecf20Sopenharmony_ci		return -EPERM;
25938c2ecf20Sopenharmony_ci
25948c2ecf20Sopenharmony_ci	BUG_ON(!current->mm);
25958c2ecf20Sopenharmony_ci
25968c2ecf20Sopenharmony_ci	pathname = getname(specialfile);
25978c2ecf20Sopenharmony_ci	if (IS_ERR(pathname))
25988c2ecf20Sopenharmony_ci		return PTR_ERR(pathname);
25998c2ecf20Sopenharmony_ci
26008c2ecf20Sopenharmony_ci	victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
26018c2ecf20Sopenharmony_ci	err = PTR_ERR(victim);
26028c2ecf20Sopenharmony_ci	if (IS_ERR(victim))
26038c2ecf20Sopenharmony_ci		goto out;
26048c2ecf20Sopenharmony_ci
26058c2ecf20Sopenharmony_ci	mapping = victim->f_mapping;
26068c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
26078c2ecf20Sopenharmony_ci	plist_for_each_entry(p, &swap_active_head, list) {
26088c2ecf20Sopenharmony_ci		if (p->flags & SWP_WRITEOK) {
26098c2ecf20Sopenharmony_ci			if (p->swap_file->f_mapping == mapping) {
26108c2ecf20Sopenharmony_ci				found = 1;
26118c2ecf20Sopenharmony_ci				break;
26128c2ecf20Sopenharmony_ci			}
26138c2ecf20Sopenharmony_ci		}
26148c2ecf20Sopenharmony_ci	}
26158c2ecf20Sopenharmony_ci	if (!found) {
26168c2ecf20Sopenharmony_ci		err = -EINVAL;
26178c2ecf20Sopenharmony_ci		spin_unlock(&swap_lock);
26188c2ecf20Sopenharmony_ci		goto out_dput;
26198c2ecf20Sopenharmony_ci	}
26208c2ecf20Sopenharmony_ci	if (!security_vm_enough_memory_mm(current->mm, p->pages))
26218c2ecf20Sopenharmony_ci		vm_unacct_memory(p->pages);
26228c2ecf20Sopenharmony_ci	else {
26238c2ecf20Sopenharmony_ci		err = -ENOMEM;
26248c2ecf20Sopenharmony_ci		spin_unlock(&swap_lock);
26258c2ecf20Sopenharmony_ci		goto out_dput;
26268c2ecf20Sopenharmony_ci	}
26278c2ecf20Sopenharmony_ci	spin_lock(&p->lock);
26288c2ecf20Sopenharmony_ci	del_from_avail_list(p);
26298c2ecf20Sopenharmony_ci	if (p->prio < 0) {
26308c2ecf20Sopenharmony_ci		struct swap_info_struct *si = p;
26318c2ecf20Sopenharmony_ci		int nid;
26328c2ecf20Sopenharmony_ci
26338c2ecf20Sopenharmony_ci		plist_for_each_entry_continue(si, &swap_active_head, list) {
26348c2ecf20Sopenharmony_ci			si->prio++;
26358c2ecf20Sopenharmony_ci			si->list.prio--;
26368c2ecf20Sopenharmony_ci			for_each_node(nid) {
26378c2ecf20Sopenharmony_ci				if (si->avail_lists[nid].prio != 1)
26388c2ecf20Sopenharmony_ci					si->avail_lists[nid].prio--;
26398c2ecf20Sopenharmony_ci			}
26408c2ecf20Sopenharmony_ci		}
26418c2ecf20Sopenharmony_ci		least_priority++;
26428c2ecf20Sopenharmony_ci	}
26438c2ecf20Sopenharmony_ci	plist_del(&p->list, &swap_active_head);
26448c2ecf20Sopenharmony_ci	atomic_long_sub(p->pages, &nr_swap_pages);
26458c2ecf20Sopenharmony_ci	total_swap_pages -= p->pages;
26468c2ecf20Sopenharmony_ci	p->flags &= ~SWP_WRITEOK;
26478c2ecf20Sopenharmony_ci	spin_unlock(&p->lock);
26488c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
26498c2ecf20Sopenharmony_ci
26508c2ecf20Sopenharmony_ci	disable_swap_slots_cache_lock();
26518c2ecf20Sopenharmony_ci
26528c2ecf20Sopenharmony_ci	set_current_oom_origin();
26538c2ecf20Sopenharmony_ci	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
26548c2ecf20Sopenharmony_ci	clear_current_oom_origin();
26558c2ecf20Sopenharmony_ci
26568c2ecf20Sopenharmony_ci	if (err) {
26578c2ecf20Sopenharmony_ci		/* re-insert swap space back into swap_list */
26588c2ecf20Sopenharmony_ci		reinsert_swap_info(p);
26598c2ecf20Sopenharmony_ci		reenable_swap_slots_cache_unlock();
26608c2ecf20Sopenharmony_ci		goto out_dput;
26618c2ecf20Sopenharmony_ci	}
26628c2ecf20Sopenharmony_ci
26638c2ecf20Sopenharmony_ci	reenable_swap_slots_cache_unlock();
26648c2ecf20Sopenharmony_ci
26658c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
26668c2ecf20Sopenharmony_ci	spin_lock(&p->lock);
26678c2ecf20Sopenharmony_ci	p->flags &= ~SWP_VALID;		/* mark swap device as invalid */
26688c2ecf20Sopenharmony_ci	spin_unlock(&p->lock);
26698c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
26708c2ecf20Sopenharmony_ci	/*
26718c2ecf20Sopenharmony_ci	 * wait for swap operations protected by get/put_swap_device()
26728c2ecf20Sopenharmony_ci	 * to complete
26738c2ecf20Sopenharmony_ci	 */
26748c2ecf20Sopenharmony_ci	synchronize_rcu();
26758c2ecf20Sopenharmony_ci
26768c2ecf20Sopenharmony_ci	flush_work(&p->discard_work);
26778c2ecf20Sopenharmony_ci
26788c2ecf20Sopenharmony_ci	destroy_swap_extents(p);
26798c2ecf20Sopenharmony_ci	if (p->flags & SWP_CONTINUED)
26808c2ecf20Sopenharmony_ci		free_swap_count_continuations(p);
26818c2ecf20Sopenharmony_ci
26828c2ecf20Sopenharmony_ci	if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
26838c2ecf20Sopenharmony_ci		atomic_dec(&nr_rotate_swap);
26848c2ecf20Sopenharmony_ci
26858c2ecf20Sopenharmony_ci	mutex_lock(&swapon_mutex);
26868c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
26878c2ecf20Sopenharmony_ci	spin_lock(&p->lock);
26888c2ecf20Sopenharmony_ci	drain_mmlist();
26898c2ecf20Sopenharmony_ci
26908c2ecf20Sopenharmony_ci	/* wait for anyone still in scan_swap_map */
26918c2ecf20Sopenharmony_ci	p->highest_bit = 0;		/* cuts scans short */
26928c2ecf20Sopenharmony_ci	while (p->flags >= SWP_SCANNING) {
26938c2ecf20Sopenharmony_ci		spin_unlock(&p->lock);
26948c2ecf20Sopenharmony_ci		spin_unlock(&swap_lock);
26958c2ecf20Sopenharmony_ci		schedule_timeout_uninterruptible(1);
26968c2ecf20Sopenharmony_ci		spin_lock(&swap_lock);
26978c2ecf20Sopenharmony_ci		spin_lock(&p->lock);
26988c2ecf20Sopenharmony_ci	}
26998c2ecf20Sopenharmony_ci
27008c2ecf20Sopenharmony_ci	swap_file = p->swap_file;
27018c2ecf20Sopenharmony_ci	old_block_size = p->old_block_size;
27028c2ecf20Sopenharmony_ci	p->swap_file = NULL;
27038c2ecf20Sopenharmony_ci	p->max = 0;
27048c2ecf20Sopenharmony_ci	swap_map = p->swap_map;
27058c2ecf20Sopenharmony_ci	p->swap_map = NULL;
27068c2ecf20Sopenharmony_ci	cluster_info = p->cluster_info;
27078c2ecf20Sopenharmony_ci	p->cluster_info = NULL;
27088c2ecf20Sopenharmony_ci	frontswap_map = frontswap_map_get(p);
27098c2ecf20Sopenharmony_ci	spin_unlock(&p->lock);
27108c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
27118c2ecf20Sopenharmony_ci	arch_swap_invalidate_area(p->type);
27128c2ecf20Sopenharmony_ci	frontswap_invalidate_area(p->type);
27138c2ecf20Sopenharmony_ci	frontswap_map_set(p, NULL);
27148c2ecf20Sopenharmony_ci	mutex_unlock(&swapon_mutex);
27158c2ecf20Sopenharmony_ci	free_percpu(p->percpu_cluster);
27168c2ecf20Sopenharmony_ci	p->percpu_cluster = NULL;
27178c2ecf20Sopenharmony_ci	free_percpu(p->cluster_next_cpu);
27188c2ecf20Sopenharmony_ci	p->cluster_next_cpu = NULL;
27198c2ecf20Sopenharmony_ci	vfree(swap_map);
27208c2ecf20Sopenharmony_ci	kvfree(cluster_info);
27218c2ecf20Sopenharmony_ci	kvfree(frontswap_map);
27228c2ecf20Sopenharmony_ci	/* Destroy swap account information */
27238c2ecf20Sopenharmony_ci	swap_cgroup_swapoff(p->type);
27248c2ecf20Sopenharmony_ci	exit_swap_address_space(p->type);
27258c2ecf20Sopenharmony_ci
27268c2ecf20Sopenharmony_ci	inode = mapping->host;
27278c2ecf20Sopenharmony_ci	if (S_ISBLK(inode->i_mode)) {
27288c2ecf20Sopenharmony_ci		struct block_device *bdev = I_BDEV(inode);
27298c2ecf20Sopenharmony_ci
27308c2ecf20Sopenharmony_ci		set_blocksize(bdev, old_block_size);
27318c2ecf20Sopenharmony_ci		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
27328c2ecf20Sopenharmony_ci	}
27338c2ecf20Sopenharmony_ci
27348c2ecf20Sopenharmony_ci	inode_lock(inode);
27358c2ecf20Sopenharmony_ci	inode->i_flags &= ~S_SWAPFILE;
27368c2ecf20Sopenharmony_ci	inode_unlock(inode);
27378c2ecf20Sopenharmony_ci	filp_close(swap_file, NULL);
27388c2ecf20Sopenharmony_ci
27398c2ecf20Sopenharmony_ci	/*
27408c2ecf20Sopenharmony_ci	 * Clear the SWP_USED flag after all resources are freed so that swapon
27418c2ecf20Sopenharmony_ci	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
27428c2ecf20Sopenharmony_ci	 * not hold p->lock after we cleared its SWP_WRITEOK.
27438c2ecf20Sopenharmony_ci	 */
27448c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
27458c2ecf20Sopenharmony_ci	p->flags = 0;
27468c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
27478c2ecf20Sopenharmony_ci
27488c2ecf20Sopenharmony_ci	err = 0;
27498c2ecf20Sopenharmony_ci	atomic_inc(&proc_poll_event);
27508c2ecf20Sopenharmony_ci	wake_up_interruptible(&proc_poll_wait);
27518c2ecf20Sopenharmony_ci
27528c2ecf20Sopenharmony_ciout_dput:
27538c2ecf20Sopenharmony_ci	filp_close(victim, NULL);
27548c2ecf20Sopenharmony_ciout:
27558c2ecf20Sopenharmony_ci	putname(pathname);
27568c2ecf20Sopenharmony_ci	return err;
27578c2ecf20Sopenharmony_ci}
27588c2ecf20Sopenharmony_ci
27598c2ecf20Sopenharmony_ci#ifdef CONFIG_PROC_FS
27608c2ecf20Sopenharmony_cistatic __poll_t swaps_poll(struct file *file, poll_table *wait)
27618c2ecf20Sopenharmony_ci{
27628c2ecf20Sopenharmony_ci	struct seq_file *seq = file->private_data;
27638c2ecf20Sopenharmony_ci
27648c2ecf20Sopenharmony_ci	poll_wait(file, &proc_poll_wait, wait);
27658c2ecf20Sopenharmony_ci
27668c2ecf20Sopenharmony_ci	if (seq->poll_event != atomic_read(&proc_poll_event)) {
27678c2ecf20Sopenharmony_ci		seq->poll_event = atomic_read(&proc_poll_event);
27688c2ecf20Sopenharmony_ci		return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
27698c2ecf20Sopenharmony_ci	}
27708c2ecf20Sopenharmony_ci
27718c2ecf20Sopenharmony_ci	return EPOLLIN | EPOLLRDNORM;
27728c2ecf20Sopenharmony_ci}
27738c2ecf20Sopenharmony_ci
27748c2ecf20Sopenharmony_ci/* iterator */
27758c2ecf20Sopenharmony_cistatic void *swap_start(struct seq_file *swap, loff_t *pos)
27768c2ecf20Sopenharmony_ci{
27778c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
27788c2ecf20Sopenharmony_ci	int type;
27798c2ecf20Sopenharmony_ci	loff_t l = *pos;
27808c2ecf20Sopenharmony_ci
27818c2ecf20Sopenharmony_ci	mutex_lock(&swapon_mutex);
27828c2ecf20Sopenharmony_ci
27838c2ecf20Sopenharmony_ci	if (!l)
27848c2ecf20Sopenharmony_ci		return SEQ_START_TOKEN;
27858c2ecf20Sopenharmony_ci
27868c2ecf20Sopenharmony_ci	for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
27878c2ecf20Sopenharmony_ci		if (!(si->flags & SWP_USED) || !si->swap_map)
27888c2ecf20Sopenharmony_ci			continue;
27898c2ecf20Sopenharmony_ci		if (!--l)
27908c2ecf20Sopenharmony_ci			return si;
27918c2ecf20Sopenharmony_ci	}
27928c2ecf20Sopenharmony_ci
27938c2ecf20Sopenharmony_ci	return NULL;
27948c2ecf20Sopenharmony_ci}
27958c2ecf20Sopenharmony_ci
27968c2ecf20Sopenharmony_cistatic void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
27978c2ecf20Sopenharmony_ci{
27988c2ecf20Sopenharmony_ci	struct swap_info_struct *si = v;
27998c2ecf20Sopenharmony_ci	int type;
28008c2ecf20Sopenharmony_ci
28018c2ecf20Sopenharmony_ci	if (v == SEQ_START_TOKEN)
28028c2ecf20Sopenharmony_ci		type = 0;
28038c2ecf20Sopenharmony_ci	else
28048c2ecf20Sopenharmony_ci		type = si->type + 1;
28058c2ecf20Sopenharmony_ci
28068c2ecf20Sopenharmony_ci	++(*pos);
28078c2ecf20Sopenharmony_ci	for (; (si = swap_type_to_swap_info(type)); type++) {
28088c2ecf20Sopenharmony_ci		if (!(si->flags & SWP_USED) || !si->swap_map)
28098c2ecf20Sopenharmony_ci			continue;
28108c2ecf20Sopenharmony_ci		return si;
28118c2ecf20Sopenharmony_ci	}
28128c2ecf20Sopenharmony_ci
28138c2ecf20Sopenharmony_ci	return NULL;
28148c2ecf20Sopenharmony_ci}
28158c2ecf20Sopenharmony_ci
28168c2ecf20Sopenharmony_cistatic void swap_stop(struct seq_file *swap, void *v)
28178c2ecf20Sopenharmony_ci{
28188c2ecf20Sopenharmony_ci	mutex_unlock(&swapon_mutex);
28198c2ecf20Sopenharmony_ci}
28208c2ecf20Sopenharmony_ci
28218c2ecf20Sopenharmony_cistatic int swap_show(struct seq_file *swap, void *v)
28228c2ecf20Sopenharmony_ci{
28238c2ecf20Sopenharmony_ci	struct swap_info_struct *si = v;
28248c2ecf20Sopenharmony_ci	struct file *file;
28258c2ecf20Sopenharmony_ci	int len;
28268c2ecf20Sopenharmony_ci	unsigned int bytes, inuse;
28278c2ecf20Sopenharmony_ci
28288c2ecf20Sopenharmony_ci	if (si == SEQ_START_TOKEN) {
28298c2ecf20Sopenharmony_ci		seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
28308c2ecf20Sopenharmony_ci		return 0;
28318c2ecf20Sopenharmony_ci	}
28328c2ecf20Sopenharmony_ci
28338c2ecf20Sopenharmony_ci	bytes = si->pages << (PAGE_SHIFT - 10);
28348c2ecf20Sopenharmony_ci	inuse = si->inuse_pages << (PAGE_SHIFT - 10);
28358c2ecf20Sopenharmony_ci
28368c2ecf20Sopenharmony_ci	file = si->swap_file;
28378c2ecf20Sopenharmony_ci	len = seq_file_path(swap, file, " \t\n\\");
28388c2ecf20Sopenharmony_ci	seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
28398c2ecf20Sopenharmony_ci			len < 40 ? 40 - len : 1, " ",
28408c2ecf20Sopenharmony_ci			S_ISBLK(file_inode(file)->i_mode) ?
28418c2ecf20Sopenharmony_ci				"partition" : "file\t",
28428c2ecf20Sopenharmony_ci			bytes, bytes < 10000000 ? "\t" : "",
28438c2ecf20Sopenharmony_ci			inuse, inuse < 10000000 ? "\t" : "",
28448c2ecf20Sopenharmony_ci			si->prio);
28458c2ecf20Sopenharmony_ci	return 0;
28468c2ecf20Sopenharmony_ci}
28478c2ecf20Sopenharmony_ci
28488c2ecf20Sopenharmony_cistatic const struct seq_operations swaps_op = {
28498c2ecf20Sopenharmony_ci	.start =	swap_start,
28508c2ecf20Sopenharmony_ci	.next =		swap_next,
28518c2ecf20Sopenharmony_ci	.stop =		swap_stop,
28528c2ecf20Sopenharmony_ci	.show =		swap_show
28538c2ecf20Sopenharmony_ci};
28548c2ecf20Sopenharmony_ci
28558c2ecf20Sopenharmony_cistatic int swaps_open(struct inode *inode, struct file *file)
28568c2ecf20Sopenharmony_ci{
28578c2ecf20Sopenharmony_ci	struct seq_file *seq;
28588c2ecf20Sopenharmony_ci	int ret;
28598c2ecf20Sopenharmony_ci
28608c2ecf20Sopenharmony_ci	ret = seq_open(file, &swaps_op);
28618c2ecf20Sopenharmony_ci	if (ret)
28628c2ecf20Sopenharmony_ci		return ret;
28638c2ecf20Sopenharmony_ci
28648c2ecf20Sopenharmony_ci	seq = file->private_data;
28658c2ecf20Sopenharmony_ci	seq->poll_event = atomic_read(&proc_poll_event);
28668c2ecf20Sopenharmony_ci	return 0;
28678c2ecf20Sopenharmony_ci}
28688c2ecf20Sopenharmony_ci
28698c2ecf20Sopenharmony_cistatic const struct proc_ops swaps_proc_ops = {
28708c2ecf20Sopenharmony_ci	.proc_flags	= PROC_ENTRY_PERMANENT,
28718c2ecf20Sopenharmony_ci	.proc_open	= swaps_open,
28728c2ecf20Sopenharmony_ci	.proc_read	= seq_read,
28738c2ecf20Sopenharmony_ci	.proc_lseek	= seq_lseek,
28748c2ecf20Sopenharmony_ci	.proc_release	= seq_release,
28758c2ecf20Sopenharmony_ci	.proc_poll	= swaps_poll,
28768c2ecf20Sopenharmony_ci};
28778c2ecf20Sopenharmony_ci
28788c2ecf20Sopenharmony_cistatic int __init procswaps_init(void)
28798c2ecf20Sopenharmony_ci{
28808c2ecf20Sopenharmony_ci	proc_create("swaps", 0, NULL, &swaps_proc_ops);
28818c2ecf20Sopenharmony_ci	return 0;
28828c2ecf20Sopenharmony_ci}
28838c2ecf20Sopenharmony_ci__initcall(procswaps_init);
28848c2ecf20Sopenharmony_ci#endif /* CONFIG_PROC_FS */
28858c2ecf20Sopenharmony_ci
28868c2ecf20Sopenharmony_ci#ifdef MAX_SWAPFILES_CHECK
28878c2ecf20Sopenharmony_cistatic int __init max_swapfiles_check(void)
28888c2ecf20Sopenharmony_ci{
28898c2ecf20Sopenharmony_ci	MAX_SWAPFILES_CHECK();
28908c2ecf20Sopenharmony_ci	return 0;
28918c2ecf20Sopenharmony_ci}
28928c2ecf20Sopenharmony_cilate_initcall(max_swapfiles_check);
28938c2ecf20Sopenharmony_ci#endif
28948c2ecf20Sopenharmony_ci
28958c2ecf20Sopenharmony_cistatic struct swap_info_struct *alloc_swap_info(void)
28968c2ecf20Sopenharmony_ci{
28978c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
28988c2ecf20Sopenharmony_ci	struct swap_info_struct *defer = NULL;
28998c2ecf20Sopenharmony_ci	unsigned int type;
29008c2ecf20Sopenharmony_ci	int i;
29018c2ecf20Sopenharmony_ci
29028c2ecf20Sopenharmony_ci	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
29038c2ecf20Sopenharmony_ci	if (!p)
29048c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
29058c2ecf20Sopenharmony_ci
29068c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
29078c2ecf20Sopenharmony_ci	for (type = 0; type < nr_swapfiles; type++) {
29088c2ecf20Sopenharmony_ci		if (!(swap_info[type]->flags & SWP_USED))
29098c2ecf20Sopenharmony_ci			break;
29108c2ecf20Sopenharmony_ci	}
29118c2ecf20Sopenharmony_ci	if (type >= MAX_SWAPFILES) {
29128c2ecf20Sopenharmony_ci		spin_unlock(&swap_lock);
29138c2ecf20Sopenharmony_ci		kvfree(p);
29148c2ecf20Sopenharmony_ci		return ERR_PTR(-EPERM);
29158c2ecf20Sopenharmony_ci	}
29168c2ecf20Sopenharmony_ci	if (type >= nr_swapfiles) {
29178c2ecf20Sopenharmony_ci		p->type = type;
29188c2ecf20Sopenharmony_ci		WRITE_ONCE(swap_info[type], p);
29198c2ecf20Sopenharmony_ci		/*
29208c2ecf20Sopenharmony_ci		 * Write swap_info[type] before nr_swapfiles, in case a
29218c2ecf20Sopenharmony_ci		 * racing procfs swap_start() or swap_next() is reading them.
29228c2ecf20Sopenharmony_ci		 * (We never shrink nr_swapfiles, we never free this entry.)
29238c2ecf20Sopenharmony_ci		 */
29248c2ecf20Sopenharmony_ci		smp_wmb();
29258c2ecf20Sopenharmony_ci		WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
29268c2ecf20Sopenharmony_ci	} else {
29278c2ecf20Sopenharmony_ci		defer = p;
29288c2ecf20Sopenharmony_ci		p = swap_info[type];
29298c2ecf20Sopenharmony_ci		/*
29308c2ecf20Sopenharmony_ci		 * Do not memset this entry: a racing procfs swap_next()
29318c2ecf20Sopenharmony_ci		 * would be relying on p->type to remain valid.
29328c2ecf20Sopenharmony_ci		 */
29338c2ecf20Sopenharmony_ci	}
29348c2ecf20Sopenharmony_ci	p->swap_extent_root = RB_ROOT;
29358c2ecf20Sopenharmony_ci	plist_node_init(&p->list, 0);
29368c2ecf20Sopenharmony_ci	for_each_node(i)
29378c2ecf20Sopenharmony_ci		plist_node_init(&p->avail_lists[i], 0);
29388c2ecf20Sopenharmony_ci	p->flags = SWP_USED;
29398c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
29408c2ecf20Sopenharmony_ci	kvfree(defer);
29418c2ecf20Sopenharmony_ci	spin_lock_init(&p->lock);
29428c2ecf20Sopenharmony_ci	spin_lock_init(&p->cont_lock);
29438c2ecf20Sopenharmony_ci
29448c2ecf20Sopenharmony_ci	return p;
29458c2ecf20Sopenharmony_ci}
29468c2ecf20Sopenharmony_ci
29478c2ecf20Sopenharmony_cistatic int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
29488c2ecf20Sopenharmony_ci{
29498c2ecf20Sopenharmony_ci	int error;
29508c2ecf20Sopenharmony_ci
29518c2ecf20Sopenharmony_ci	if (S_ISBLK(inode->i_mode)) {
29528c2ecf20Sopenharmony_ci		p->bdev = blkdev_get_by_dev(inode->i_rdev,
29538c2ecf20Sopenharmony_ci				   FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
29548c2ecf20Sopenharmony_ci		if (IS_ERR(p->bdev)) {
29558c2ecf20Sopenharmony_ci			error = PTR_ERR(p->bdev);
29568c2ecf20Sopenharmony_ci			p->bdev = NULL;
29578c2ecf20Sopenharmony_ci			return error;
29588c2ecf20Sopenharmony_ci		}
29598c2ecf20Sopenharmony_ci		p->old_block_size = block_size(p->bdev);
29608c2ecf20Sopenharmony_ci		error = set_blocksize(p->bdev, PAGE_SIZE);
29618c2ecf20Sopenharmony_ci		if (error < 0)
29628c2ecf20Sopenharmony_ci			return error;
29638c2ecf20Sopenharmony_ci		/*
29648c2ecf20Sopenharmony_ci		 * Zoned block devices contain zones that have a sequential
29658c2ecf20Sopenharmony_ci		 * write only restriction.  Hence zoned block devices are not
29668c2ecf20Sopenharmony_ci		 * suitable for swapping.  Disallow them here.
29678c2ecf20Sopenharmony_ci		 */
29688c2ecf20Sopenharmony_ci		if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
29698c2ecf20Sopenharmony_ci			return -EINVAL;
29708c2ecf20Sopenharmony_ci		p->flags |= SWP_BLKDEV;
29718c2ecf20Sopenharmony_ci	} else if (S_ISREG(inode->i_mode)) {
29728c2ecf20Sopenharmony_ci		p->bdev = inode->i_sb->s_bdev;
29738c2ecf20Sopenharmony_ci	}
29748c2ecf20Sopenharmony_ci
29758c2ecf20Sopenharmony_ci	return 0;
29768c2ecf20Sopenharmony_ci}
29778c2ecf20Sopenharmony_ci
29788c2ecf20Sopenharmony_ci
/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
29958c2ecf20Sopenharmony_ciunsigned long generic_max_swapfile_size(void)
29968c2ecf20Sopenharmony_ci{
29978c2ecf20Sopenharmony_ci	return swp_offset(pte_to_swp_entry(
29988c2ecf20Sopenharmony_ci			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
29998c2ecf20Sopenharmony_ci}
30008c2ecf20Sopenharmony_ci
30018c2ecf20Sopenharmony_ci/* Can be overridden by an architecture for additional checks. */
30028c2ecf20Sopenharmony_ci__weak unsigned long max_swapfile_size(void)
30038c2ecf20Sopenharmony_ci{
30048c2ecf20Sopenharmony_ci	return generic_max_swapfile_size();
30058c2ecf20Sopenharmony_ci}
30068c2ecf20Sopenharmony_ci
30078c2ecf20Sopenharmony_cistatic unsigned long read_swap_header(struct swap_info_struct *p,
30088c2ecf20Sopenharmony_ci					union swap_header *swap_header,
30098c2ecf20Sopenharmony_ci					struct inode *inode)
30108c2ecf20Sopenharmony_ci{
30118c2ecf20Sopenharmony_ci	int i;
30128c2ecf20Sopenharmony_ci	unsigned long maxpages;
30138c2ecf20Sopenharmony_ci	unsigned long swapfilepages;
30148c2ecf20Sopenharmony_ci	unsigned long last_page;
30158c2ecf20Sopenharmony_ci
30168c2ecf20Sopenharmony_ci	if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
30178c2ecf20Sopenharmony_ci		pr_err("Unable to find swap-space signature\n");
30188c2ecf20Sopenharmony_ci		return 0;
30198c2ecf20Sopenharmony_ci	}
30208c2ecf20Sopenharmony_ci
30218c2ecf20Sopenharmony_ci	/* swap partition endianess hack... */
30228c2ecf20Sopenharmony_ci	if (swab32(swap_header->info.version) == 1) {
30238c2ecf20Sopenharmony_ci		swab32s(&swap_header->info.version);
30248c2ecf20Sopenharmony_ci		swab32s(&swap_header->info.last_page);
30258c2ecf20Sopenharmony_ci		swab32s(&swap_header->info.nr_badpages);
30268c2ecf20Sopenharmony_ci		if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
30278c2ecf20Sopenharmony_ci			return 0;
30288c2ecf20Sopenharmony_ci		for (i = 0; i < swap_header->info.nr_badpages; i++)
30298c2ecf20Sopenharmony_ci			swab32s(&swap_header->info.badpages[i]);
30308c2ecf20Sopenharmony_ci	}
30318c2ecf20Sopenharmony_ci	/* Check the swap header's sub-version */
30328c2ecf20Sopenharmony_ci	if (swap_header->info.version != 1) {
30338c2ecf20Sopenharmony_ci		pr_warn("Unable to handle swap header version %d\n",
30348c2ecf20Sopenharmony_ci			swap_header->info.version);
30358c2ecf20Sopenharmony_ci		return 0;
30368c2ecf20Sopenharmony_ci	}
30378c2ecf20Sopenharmony_ci
30388c2ecf20Sopenharmony_ci	p->lowest_bit  = 1;
30398c2ecf20Sopenharmony_ci	p->cluster_next = 1;
30408c2ecf20Sopenharmony_ci	p->cluster_nr = 0;
30418c2ecf20Sopenharmony_ci
30428c2ecf20Sopenharmony_ci	maxpages = max_swapfile_size();
30438c2ecf20Sopenharmony_ci	last_page = swap_header->info.last_page;
30448c2ecf20Sopenharmony_ci	if (!last_page) {
30458c2ecf20Sopenharmony_ci		pr_warn("Empty swap-file\n");
30468c2ecf20Sopenharmony_ci		return 0;
30478c2ecf20Sopenharmony_ci	}
30488c2ecf20Sopenharmony_ci	if (last_page > maxpages) {
30498c2ecf20Sopenharmony_ci		pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
30508c2ecf20Sopenharmony_ci			maxpages << (PAGE_SHIFT - 10),
30518c2ecf20Sopenharmony_ci			last_page << (PAGE_SHIFT - 10));
30528c2ecf20Sopenharmony_ci	}
30538c2ecf20Sopenharmony_ci	if (maxpages > last_page) {
30548c2ecf20Sopenharmony_ci		maxpages = last_page + 1;
30558c2ecf20Sopenharmony_ci		/* p->max is an unsigned int: don't overflow it */
30568c2ecf20Sopenharmony_ci		if ((unsigned int)maxpages == 0)
30578c2ecf20Sopenharmony_ci			maxpages = UINT_MAX;
30588c2ecf20Sopenharmony_ci	}
30598c2ecf20Sopenharmony_ci	p->highest_bit = maxpages - 1;
30608c2ecf20Sopenharmony_ci
30618c2ecf20Sopenharmony_ci	if (!maxpages)
30628c2ecf20Sopenharmony_ci		return 0;
30638c2ecf20Sopenharmony_ci	swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
30648c2ecf20Sopenharmony_ci	if (swapfilepages && maxpages > swapfilepages) {
30658c2ecf20Sopenharmony_ci		pr_warn("Swap area shorter than signature indicates\n");
30668c2ecf20Sopenharmony_ci		return 0;
30678c2ecf20Sopenharmony_ci	}
30688c2ecf20Sopenharmony_ci	if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
30698c2ecf20Sopenharmony_ci		return 0;
30708c2ecf20Sopenharmony_ci	if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
30718c2ecf20Sopenharmony_ci		return 0;
30728c2ecf20Sopenharmony_ci
30738c2ecf20Sopenharmony_ci	return maxpages;
30748c2ecf20Sopenharmony_ci}
30758c2ecf20Sopenharmony_ci
30768c2ecf20Sopenharmony_ci#define SWAP_CLUSTER_INFO_COLS						\
30778c2ecf20Sopenharmony_ci	DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
30788c2ecf20Sopenharmony_ci#define SWAP_CLUSTER_SPACE_COLS						\
30798c2ecf20Sopenharmony_ci	DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
30808c2ecf20Sopenharmony_ci#define SWAP_CLUSTER_COLS						\
30818c2ecf20Sopenharmony_ci	max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
30828c2ecf20Sopenharmony_ci
30838c2ecf20Sopenharmony_cistatic int setup_swap_map_and_extents(struct swap_info_struct *p,
30848c2ecf20Sopenharmony_ci					union swap_header *swap_header,
30858c2ecf20Sopenharmony_ci					unsigned char *swap_map,
30868c2ecf20Sopenharmony_ci					struct swap_cluster_info *cluster_info,
30878c2ecf20Sopenharmony_ci					unsigned long maxpages,
30888c2ecf20Sopenharmony_ci					sector_t *span)
30898c2ecf20Sopenharmony_ci{
30908c2ecf20Sopenharmony_ci	unsigned int j, k;
30918c2ecf20Sopenharmony_ci	unsigned int nr_good_pages;
30928c2ecf20Sopenharmony_ci	int nr_extents;
30938c2ecf20Sopenharmony_ci	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
30948c2ecf20Sopenharmony_ci	unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
30958c2ecf20Sopenharmony_ci	unsigned long i, idx;
30968c2ecf20Sopenharmony_ci
30978c2ecf20Sopenharmony_ci	nr_good_pages = maxpages - 1;	/* omit header page */
30988c2ecf20Sopenharmony_ci
30998c2ecf20Sopenharmony_ci	cluster_list_init(&p->free_clusters);
31008c2ecf20Sopenharmony_ci	cluster_list_init(&p->discard_clusters);
31018c2ecf20Sopenharmony_ci
31028c2ecf20Sopenharmony_ci	for (i = 0; i < swap_header->info.nr_badpages; i++) {
31038c2ecf20Sopenharmony_ci		unsigned int page_nr = swap_header->info.badpages[i];
31048c2ecf20Sopenharmony_ci		if (page_nr == 0 || page_nr > swap_header->info.last_page)
31058c2ecf20Sopenharmony_ci			return -EINVAL;
31068c2ecf20Sopenharmony_ci		if (page_nr < maxpages) {
31078c2ecf20Sopenharmony_ci			swap_map[page_nr] = SWAP_MAP_BAD;
31088c2ecf20Sopenharmony_ci			nr_good_pages--;
31098c2ecf20Sopenharmony_ci			/*
31108c2ecf20Sopenharmony_ci			 * Haven't marked the cluster free yet, no list
31118c2ecf20Sopenharmony_ci			 * operation involved
31128c2ecf20Sopenharmony_ci			 */
31138c2ecf20Sopenharmony_ci			inc_cluster_info_page(p, cluster_info, page_nr);
31148c2ecf20Sopenharmony_ci		}
31158c2ecf20Sopenharmony_ci	}
31168c2ecf20Sopenharmony_ci
31178c2ecf20Sopenharmony_ci	/* Haven't marked the cluster free yet, no list operation involved */
31188c2ecf20Sopenharmony_ci	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
31198c2ecf20Sopenharmony_ci		inc_cluster_info_page(p, cluster_info, i);
31208c2ecf20Sopenharmony_ci
31218c2ecf20Sopenharmony_ci	if (nr_good_pages) {
31228c2ecf20Sopenharmony_ci		swap_map[0] = SWAP_MAP_BAD;
31238c2ecf20Sopenharmony_ci		/*
31248c2ecf20Sopenharmony_ci		 * Not mark the cluster free yet, no list
31258c2ecf20Sopenharmony_ci		 * operation involved
31268c2ecf20Sopenharmony_ci		 */
31278c2ecf20Sopenharmony_ci		inc_cluster_info_page(p, cluster_info, 0);
31288c2ecf20Sopenharmony_ci		p->max = maxpages;
31298c2ecf20Sopenharmony_ci		p->pages = nr_good_pages;
31308c2ecf20Sopenharmony_ci		nr_extents = setup_swap_extents(p, span);
31318c2ecf20Sopenharmony_ci		if (nr_extents < 0)
31328c2ecf20Sopenharmony_ci			return nr_extents;
31338c2ecf20Sopenharmony_ci		nr_good_pages = p->pages;
31348c2ecf20Sopenharmony_ci	}
31358c2ecf20Sopenharmony_ci	if (!nr_good_pages) {
31368c2ecf20Sopenharmony_ci		pr_warn("Empty swap-file\n");
31378c2ecf20Sopenharmony_ci		return -EINVAL;
31388c2ecf20Sopenharmony_ci	}
31398c2ecf20Sopenharmony_ci
31408c2ecf20Sopenharmony_ci	if (!cluster_info)
31418c2ecf20Sopenharmony_ci		return nr_extents;
31428c2ecf20Sopenharmony_ci
31438c2ecf20Sopenharmony_ci
31448c2ecf20Sopenharmony_ci	/*
31458c2ecf20Sopenharmony_ci	 * Reduce false cache line sharing between cluster_info and
31468c2ecf20Sopenharmony_ci	 * sharing same address space.
31478c2ecf20Sopenharmony_ci	 */
31488c2ecf20Sopenharmony_ci	for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
31498c2ecf20Sopenharmony_ci		j = (k + col) % SWAP_CLUSTER_COLS;
31508c2ecf20Sopenharmony_ci		for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
31518c2ecf20Sopenharmony_ci			idx = i * SWAP_CLUSTER_COLS + j;
31528c2ecf20Sopenharmony_ci			if (idx >= nr_clusters)
31538c2ecf20Sopenharmony_ci				continue;
31548c2ecf20Sopenharmony_ci			if (cluster_count(&cluster_info[idx]))
31558c2ecf20Sopenharmony_ci				continue;
31568c2ecf20Sopenharmony_ci			cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
31578c2ecf20Sopenharmony_ci			cluster_list_add_tail(&p->free_clusters, cluster_info,
31588c2ecf20Sopenharmony_ci					      idx);
31598c2ecf20Sopenharmony_ci		}
31608c2ecf20Sopenharmony_ci	}
31618c2ecf20Sopenharmony_ci	return nr_extents;
31628c2ecf20Sopenharmony_ci}
31638c2ecf20Sopenharmony_ci
31648c2ecf20Sopenharmony_ci/*
31658c2ecf20Sopenharmony_ci * Helper to sys_swapon determining if a given swap
31668c2ecf20Sopenharmony_ci * backing device queue supports DISCARD operations.
31678c2ecf20Sopenharmony_ci */
31688c2ecf20Sopenharmony_cistatic bool swap_discardable(struct swap_info_struct *si)
31698c2ecf20Sopenharmony_ci{
31708c2ecf20Sopenharmony_ci	struct request_queue *q = bdev_get_queue(si->bdev);
31718c2ecf20Sopenharmony_ci
31728c2ecf20Sopenharmony_ci	if (!q || !blk_queue_discard(q))
31738c2ecf20Sopenharmony_ci		return false;
31748c2ecf20Sopenharmony_ci
31758c2ecf20Sopenharmony_ci	return true;
31768c2ecf20Sopenharmony_ci}
31778c2ecf20Sopenharmony_ci
31788c2ecf20Sopenharmony_ciSYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
31798c2ecf20Sopenharmony_ci{
31808c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
31818c2ecf20Sopenharmony_ci	struct filename *name;
31828c2ecf20Sopenharmony_ci	struct file *swap_file = NULL;
31838c2ecf20Sopenharmony_ci	struct address_space *mapping;
31848c2ecf20Sopenharmony_ci	int prio;
31858c2ecf20Sopenharmony_ci	int error;
31868c2ecf20Sopenharmony_ci	union swap_header *swap_header;
31878c2ecf20Sopenharmony_ci	int nr_extents;
31888c2ecf20Sopenharmony_ci	sector_t span;
31898c2ecf20Sopenharmony_ci	unsigned long maxpages;
31908c2ecf20Sopenharmony_ci	unsigned char *swap_map = NULL;
31918c2ecf20Sopenharmony_ci	struct swap_cluster_info *cluster_info = NULL;
31928c2ecf20Sopenharmony_ci	unsigned long *frontswap_map = NULL;
31938c2ecf20Sopenharmony_ci	struct page *page = NULL;
31948c2ecf20Sopenharmony_ci	struct inode *inode = NULL;
31958c2ecf20Sopenharmony_ci	bool inced_nr_rotate_swap = false;
31968c2ecf20Sopenharmony_ci
31978c2ecf20Sopenharmony_ci	if (swap_flags & ~SWAP_FLAGS_VALID)
31988c2ecf20Sopenharmony_ci		return -EINVAL;
31998c2ecf20Sopenharmony_ci
32008c2ecf20Sopenharmony_ci	if (!capable(CAP_SYS_ADMIN))
32018c2ecf20Sopenharmony_ci		return -EPERM;
32028c2ecf20Sopenharmony_ci
32038c2ecf20Sopenharmony_ci	if (!swap_avail_heads)
32048c2ecf20Sopenharmony_ci		return -ENOMEM;
32058c2ecf20Sopenharmony_ci
32068c2ecf20Sopenharmony_ci	p = alloc_swap_info();
32078c2ecf20Sopenharmony_ci	if (IS_ERR(p))
32088c2ecf20Sopenharmony_ci		return PTR_ERR(p);
32098c2ecf20Sopenharmony_ci
32108c2ecf20Sopenharmony_ci	INIT_WORK(&p->discard_work, swap_discard_work);
32118c2ecf20Sopenharmony_ci
32128c2ecf20Sopenharmony_ci	name = getname(specialfile);
32138c2ecf20Sopenharmony_ci	if (IS_ERR(name)) {
32148c2ecf20Sopenharmony_ci		error = PTR_ERR(name);
32158c2ecf20Sopenharmony_ci		name = NULL;
32168c2ecf20Sopenharmony_ci		goto bad_swap;
32178c2ecf20Sopenharmony_ci	}
32188c2ecf20Sopenharmony_ci	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
32198c2ecf20Sopenharmony_ci	if (IS_ERR(swap_file)) {
32208c2ecf20Sopenharmony_ci		error = PTR_ERR(swap_file);
32218c2ecf20Sopenharmony_ci		swap_file = NULL;
32228c2ecf20Sopenharmony_ci		goto bad_swap;
32238c2ecf20Sopenharmony_ci	}
32248c2ecf20Sopenharmony_ci
32258c2ecf20Sopenharmony_ci	p->swap_file = swap_file;
32268c2ecf20Sopenharmony_ci	mapping = swap_file->f_mapping;
32278c2ecf20Sopenharmony_ci	inode = mapping->host;
32288c2ecf20Sopenharmony_ci
32298c2ecf20Sopenharmony_ci	error = claim_swapfile(p, inode);
32308c2ecf20Sopenharmony_ci	if (unlikely(error))
32318c2ecf20Sopenharmony_ci		goto bad_swap;
32328c2ecf20Sopenharmony_ci
32338c2ecf20Sopenharmony_ci	inode_lock(inode);
32348c2ecf20Sopenharmony_ci	if (IS_SWAPFILE(inode)) {
32358c2ecf20Sopenharmony_ci		error = -EBUSY;
32368c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
32378c2ecf20Sopenharmony_ci	}
32388c2ecf20Sopenharmony_ci
32398c2ecf20Sopenharmony_ci	/*
32408c2ecf20Sopenharmony_ci	 * Read the swap header.
32418c2ecf20Sopenharmony_ci	 */
32428c2ecf20Sopenharmony_ci	if (!mapping->a_ops->readpage) {
32438c2ecf20Sopenharmony_ci		error = -EINVAL;
32448c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
32458c2ecf20Sopenharmony_ci	}
32468c2ecf20Sopenharmony_ci	page = read_mapping_page(mapping, 0, swap_file);
32478c2ecf20Sopenharmony_ci	if (IS_ERR(page)) {
32488c2ecf20Sopenharmony_ci		error = PTR_ERR(page);
32498c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
32508c2ecf20Sopenharmony_ci	}
32518c2ecf20Sopenharmony_ci	swap_header = kmap(page);
32528c2ecf20Sopenharmony_ci
32538c2ecf20Sopenharmony_ci	maxpages = read_swap_header(p, swap_header, inode);
32548c2ecf20Sopenharmony_ci	if (unlikely(!maxpages)) {
32558c2ecf20Sopenharmony_ci		error = -EINVAL;
32568c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
32578c2ecf20Sopenharmony_ci	}
32588c2ecf20Sopenharmony_ci
32598c2ecf20Sopenharmony_ci	/* OK, set up the swap map and apply the bad block list */
32608c2ecf20Sopenharmony_ci	swap_map = vzalloc(maxpages);
32618c2ecf20Sopenharmony_ci	if (!swap_map) {
32628c2ecf20Sopenharmony_ci		error = -ENOMEM;
32638c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
32648c2ecf20Sopenharmony_ci	}
32658c2ecf20Sopenharmony_ci
32668c2ecf20Sopenharmony_ci	if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
32678c2ecf20Sopenharmony_ci		p->flags |= SWP_STABLE_WRITES;
32688c2ecf20Sopenharmony_ci
32698c2ecf20Sopenharmony_ci	if (p->bdev && p->bdev->bd_disk->fops->rw_page)
32708c2ecf20Sopenharmony_ci		p->flags |= SWP_SYNCHRONOUS_IO;
32718c2ecf20Sopenharmony_ci
32728c2ecf20Sopenharmony_ci	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
32738c2ecf20Sopenharmony_ci		int cpu;
32748c2ecf20Sopenharmony_ci		unsigned long ci, nr_cluster;
32758c2ecf20Sopenharmony_ci
32768c2ecf20Sopenharmony_ci		p->flags |= SWP_SOLIDSTATE;
32778c2ecf20Sopenharmony_ci		p->cluster_next_cpu = alloc_percpu(unsigned int);
32788c2ecf20Sopenharmony_ci		if (!p->cluster_next_cpu) {
32798c2ecf20Sopenharmony_ci			error = -ENOMEM;
32808c2ecf20Sopenharmony_ci			goto bad_swap_unlock_inode;
32818c2ecf20Sopenharmony_ci		}
32828c2ecf20Sopenharmony_ci		/*
32838c2ecf20Sopenharmony_ci		 * select a random position to start with to help wear leveling
32848c2ecf20Sopenharmony_ci		 * SSD
32858c2ecf20Sopenharmony_ci		 */
32868c2ecf20Sopenharmony_ci		for_each_possible_cpu(cpu) {
32878c2ecf20Sopenharmony_ci			per_cpu(*p->cluster_next_cpu, cpu) =
32888c2ecf20Sopenharmony_ci				1 + prandom_u32_max(p->highest_bit);
32898c2ecf20Sopenharmony_ci		}
32908c2ecf20Sopenharmony_ci		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
32918c2ecf20Sopenharmony_ci
32928c2ecf20Sopenharmony_ci		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
32938c2ecf20Sopenharmony_ci					GFP_KERNEL);
32948c2ecf20Sopenharmony_ci		if (!cluster_info) {
32958c2ecf20Sopenharmony_ci			error = -ENOMEM;
32968c2ecf20Sopenharmony_ci			goto bad_swap_unlock_inode;
32978c2ecf20Sopenharmony_ci		}
32988c2ecf20Sopenharmony_ci
32998c2ecf20Sopenharmony_ci		for (ci = 0; ci < nr_cluster; ci++)
33008c2ecf20Sopenharmony_ci			spin_lock_init(&((cluster_info + ci)->lock));
33018c2ecf20Sopenharmony_ci
33028c2ecf20Sopenharmony_ci		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
33038c2ecf20Sopenharmony_ci		if (!p->percpu_cluster) {
33048c2ecf20Sopenharmony_ci			error = -ENOMEM;
33058c2ecf20Sopenharmony_ci			goto bad_swap_unlock_inode;
33068c2ecf20Sopenharmony_ci		}
33078c2ecf20Sopenharmony_ci		for_each_possible_cpu(cpu) {
33088c2ecf20Sopenharmony_ci			struct percpu_cluster *cluster;
33098c2ecf20Sopenharmony_ci			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
33108c2ecf20Sopenharmony_ci			cluster_set_null(&cluster->index);
33118c2ecf20Sopenharmony_ci		}
33128c2ecf20Sopenharmony_ci	} else {
33138c2ecf20Sopenharmony_ci		atomic_inc(&nr_rotate_swap);
33148c2ecf20Sopenharmony_ci		inced_nr_rotate_swap = true;
33158c2ecf20Sopenharmony_ci	}
33168c2ecf20Sopenharmony_ci
33178c2ecf20Sopenharmony_ci	error = swap_cgroup_swapon(p->type, maxpages);
33188c2ecf20Sopenharmony_ci	if (error)
33198c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
33208c2ecf20Sopenharmony_ci
33218c2ecf20Sopenharmony_ci	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
33228c2ecf20Sopenharmony_ci		cluster_info, maxpages, &span);
33238c2ecf20Sopenharmony_ci	if (unlikely(nr_extents < 0)) {
33248c2ecf20Sopenharmony_ci		error = nr_extents;
33258c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
33268c2ecf20Sopenharmony_ci	}
33278c2ecf20Sopenharmony_ci	/* frontswap enabled? set up bit-per-page map for frontswap */
33288c2ecf20Sopenharmony_ci	if (IS_ENABLED(CONFIG_FRONTSWAP))
33298c2ecf20Sopenharmony_ci		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
33308c2ecf20Sopenharmony_ci					 sizeof(long),
33318c2ecf20Sopenharmony_ci					 GFP_KERNEL);
33328c2ecf20Sopenharmony_ci
33338c2ecf20Sopenharmony_ci	if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
33348c2ecf20Sopenharmony_ci		/*
33358c2ecf20Sopenharmony_ci		 * When discard is enabled for swap with no particular
33368c2ecf20Sopenharmony_ci		 * policy flagged, we set all swap discard flags here in
33378c2ecf20Sopenharmony_ci		 * order to sustain backward compatibility with older
33388c2ecf20Sopenharmony_ci		 * swapon(8) releases.
33398c2ecf20Sopenharmony_ci		 */
33408c2ecf20Sopenharmony_ci		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
33418c2ecf20Sopenharmony_ci			     SWP_PAGE_DISCARD);
33428c2ecf20Sopenharmony_ci
33438c2ecf20Sopenharmony_ci		/*
33448c2ecf20Sopenharmony_ci		 * By flagging sys_swapon, a sysadmin can tell us to
33458c2ecf20Sopenharmony_ci		 * either do single-time area discards only, or to just
33468c2ecf20Sopenharmony_ci		 * perform discards for released swap page-clusters.
33478c2ecf20Sopenharmony_ci		 * Now it's time to adjust the p->flags accordingly.
33488c2ecf20Sopenharmony_ci		 */
33498c2ecf20Sopenharmony_ci		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
33508c2ecf20Sopenharmony_ci			p->flags &= ~SWP_PAGE_DISCARD;
33518c2ecf20Sopenharmony_ci		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
33528c2ecf20Sopenharmony_ci			p->flags &= ~SWP_AREA_DISCARD;
33538c2ecf20Sopenharmony_ci
33548c2ecf20Sopenharmony_ci		/* issue a swapon-time discard if it's still required */
33558c2ecf20Sopenharmony_ci		if (p->flags & SWP_AREA_DISCARD) {
33568c2ecf20Sopenharmony_ci			int err = discard_swap(p);
33578c2ecf20Sopenharmony_ci			if (unlikely(err))
33588c2ecf20Sopenharmony_ci				pr_err("swapon: discard_swap(%p): %d\n",
33598c2ecf20Sopenharmony_ci					p, err);
33608c2ecf20Sopenharmony_ci		}
33618c2ecf20Sopenharmony_ci	}
33628c2ecf20Sopenharmony_ci
33638c2ecf20Sopenharmony_ci	error = init_swap_address_space(p->type, maxpages);
33648c2ecf20Sopenharmony_ci	if (error)
33658c2ecf20Sopenharmony_ci		goto bad_swap_unlock_inode;
33668c2ecf20Sopenharmony_ci
33678c2ecf20Sopenharmony_ci	/*
33688c2ecf20Sopenharmony_ci	 * Flush any pending IO and dirty mappings before we start using this
33698c2ecf20Sopenharmony_ci	 * swap device.
33708c2ecf20Sopenharmony_ci	 */
33718c2ecf20Sopenharmony_ci	inode->i_flags |= S_SWAPFILE;
33728c2ecf20Sopenharmony_ci	error = inode_drain_writes(inode);
33738c2ecf20Sopenharmony_ci	if (error) {
33748c2ecf20Sopenharmony_ci		inode->i_flags &= ~S_SWAPFILE;
33758c2ecf20Sopenharmony_ci		goto free_swap_address_space;
33768c2ecf20Sopenharmony_ci	}
33778c2ecf20Sopenharmony_ci
33788c2ecf20Sopenharmony_ci	mutex_lock(&swapon_mutex);
33798c2ecf20Sopenharmony_ci	prio = -1;
33808c2ecf20Sopenharmony_ci	if (swap_flags & SWAP_FLAG_PREFER)
33818c2ecf20Sopenharmony_ci		prio =
33828c2ecf20Sopenharmony_ci		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
33838c2ecf20Sopenharmony_ci	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
33848c2ecf20Sopenharmony_ci
33858c2ecf20Sopenharmony_ci	pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
33868c2ecf20Sopenharmony_ci		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
33878c2ecf20Sopenharmony_ci		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
33888c2ecf20Sopenharmony_ci		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
33898c2ecf20Sopenharmony_ci		(p->flags & SWP_DISCARDABLE) ? "D" : "",
33908c2ecf20Sopenharmony_ci		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
33918c2ecf20Sopenharmony_ci		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
33928c2ecf20Sopenharmony_ci		(frontswap_map) ? "FS" : "");
33938c2ecf20Sopenharmony_ci
33948c2ecf20Sopenharmony_ci	mutex_unlock(&swapon_mutex);
33958c2ecf20Sopenharmony_ci	atomic_inc(&proc_poll_event);
33968c2ecf20Sopenharmony_ci	wake_up_interruptible(&proc_poll_wait);
33978c2ecf20Sopenharmony_ci
33988c2ecf20Sopenharmony_ci	error = 0;
33998c2ecf20Sopenharmony_ci	goto out;
34008c2ecf20Sopenharmony_cifree_swap_address_space:
34018c2ecf20Sopenharmony_ci	exit_swap_address_space(p->type);
34028c2ecf20Sopenharmony_cibad_swap_unlock_inode:
34038c2ecf20Sopenharmony_ci	inode_unlock(inode);
34048c2ecf20Sopenharmony_cibad_swap:
34058c2ecf20Sopenharmony_ci	free_percpu(p->percpu_cluster);
34068c2ecf20Sopenharmony_ci	p->percpu_cluster = NULL;
34078c2ecf20Sopenharmony_ci	free_percpu(p->cluster_next_cpu);
34088c2ecf20Sopenharmony_ci	p->cluster_next_cpu = NULL;
34098c2ecf20Sopenharmony_ci	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
34108c2ecf20Sopenharmony_ci		set_blocksize(p->bdev, p->old_block_size);
34118c2ecf20Sopenharmony_ci		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
34128c2ecf20Sopenharmony_ci	}
34138c2ecf20Sopenharmony_ci	inode = NULL;
34148c2ecf20Sopenharmony_ci	destroy_swap_extents(p);
34158c2ecf20Sopenharmony_ci	swap_cgroup_swapoff(p->type);
34168c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
34178c2ecf20Sopenharmony_ci	p->swap_file = NULL;
34188c2ecf20Sopenharmony_ci	p->flags = 0;
34198c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
34208c2ecf20Sopenharmony_ci	vfree(swap_map);
34218c2ecf20Sopenharmony_ci	kvfree(cluster_info);
34228c2ecf20Sopenharmony_ci	kvfree(frontswap_map);
34238c2ecf20Sopenharmony_ci	if (inced_nr_rotate_swap)
34248c2ecf20Sopenharmony_ci		atomic_dec(&nr_rotate_swap);
34258c2ecf20Sopenharmony_ci	if (swap_file)
34268c2ecf20Sopenharmony_ci		filp_close(swap_file, NULL);
34278c2ecf20Sopenharmony_ciout:
34288c2ecf20Sopenharmony_ci	if (page && !IS_ERR(page)) {
34298c2ecf20Sopenharmony_ci		kunmap(page);
34308c2ecf20Sopenharmony_ci		put_page(page);
34318c2ecf20Sopenharmony_ci	}
34328c2ecf20Sopenharmony_ci	if (name)
34338c2ecf20Sopenharmony_ci		putname(name);
34348c2ecf20Sopenharmony_ci	if (inode)
34358c2ecf20Sopenharmony_ci		inode_unlock(inode);
34368c2ecf20Sopenharmony_ci	if (!error)
34378c2ecf20Sopenharmony_ci		enable_swap_slots_cache();
34388c2ecf20Sopenharmony_ci	return error;
34398c2ecf20Sopenharmony_ci}
34408c2ecf20Sopenharmony_ci
34418c2ecf20Sopenharmony_civoid si_swapinfo(struct sysinfo *val)
34428c2ecf20Sopenharmony_ci{
34438c2ecf20Sopenharmony_ci	unsigned int type;
34448c2ecf20Sopenharmony_ci	unsigned long nr_to_be_unused = 0;
34458c2ecf20Sopenharmony_ci
34468c2ecf20Sopenharmony_ci	spin_lock(&swap_lock);
34478c2ecf20Sopenharmony_ci	for (type = 0; type < nr_swapfiles; type++) {
34488c2ecf20Sopenharmony_ci		struct swap_info_struct *si = swap_info[type];
34498c2ecf20Sopenharmony_ci
34508c2ecf20Sopenharmony_ci		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
34518c2ecf20Sopenharmony_ci			nr_to_be_unused += si->inuse_pages;
34528c2ecf20Sopenharmony_ci	}
34538c2ecf20Sopenharmony_ci	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
34548c2ecf20Sopenharmony_ci	val->totalswap = total_swap_pages + nr_to_be_unused;
34558c2ecf20Sopenharmony_ci	spin_unlock(&swap_lock);
34568c2ecf20Sopenharmony_ci}
34578c2ecf20Sopenharmony_ci
#ifdef CONFIG_HYPERHOLD_ZSWAPD
/*
 * Report whether free swap has dropped below get_free_swap_threshold().
 *
 * Free swap is nr_swap_pages plus the in-use pages of areas that are
 * SWP_USED but not SWP_WRITEOK, the same sum si_swapinfo() stores in
 * freeswap.  It is sampled under swap_lock; the comparison happens after
 * the lock is dropped.
 */
bool free_swap_is_low(void)
{
	unsigned long long freeswap;
	unsigned long pending = 0;
	unsigned int i;

	spin_lock(&swap_lock);
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *area = swap_info[i];

		if (!(area->flags & SWP_USED) || (area->flags & SWP_WRITEOK))
			continue;
		pending += area->inuse_pages;
	}
	freeswap = atomic_long_read(&nr_swap_pages) + pending;
	spin_unlock(&swap_lock);

	return (freeswap < get_free_swap_threshold());
}
EXPORT_SYMBOL(free_swap_is_low);
#endif
34798c2ecf20Sopenharmony_ci
34808c2ecf20Sopenharmony_ci/*
34818c2ecf20Sopenharmony_ci * Verify that a swap entry is valid and increment its swap map count.
34828c2ecf20Sopenharmony_ci *
34838c2ecf20Sopenharmony_ci * Returns error code in following case.
34848c2ecf20Sopenharmony_ci * - success -> 0
34858c2ecf20Sopenharmony_ci * - swp_entry is invalid -> EINVAL
34868c2ecf20Sopenharmony_ci * - swp_entry is migration entry -> EINVAL
34878c2ecf20Sopenharmony_ci * - swap-cache reference is requested but there is already one. -> EEXIST
34888c2ecf20Sopenharmony_ci * - swap-cache reference is requested but the entry is not used. -> ENOENT
34898c2ecf20Sopenharmony_ci * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
34908c2ecf20Sopenharmony_ci */
34918c2ecf20Sopenharmony_cistatic int __swap_duplicate(swp_entry_t entry, unsigned char usage)
34928c2ecf20Sopenharmony_ci{
34938c2ecf20Sopenharmony_ci	struct swap_info_struct *p;
34948c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
34958c2ecf20Sopenharmony_ci	unsigned long offset;
34968c2ecf20Sopenharmony_ci	unsigned char count;
34978c2ecf20Sopenharmony_ci	unsigned char has_cache;
34988c2ecf20Sopenharmony_ci	int err = -EINVAL;
34998c2ecf20Sopenharmony_ci
35008c2ecf20Sopenharmony_ci	p = get_swap_device(entry);
35018c2ecf20Sopenharmony_ci	if (!p)
35028c2ecf20Sopenharmony_ci		goto out;
35038c2ecf20Sopenharmony_ci
35048c2ecf20Sopenharmony_ci	offset = swp_offset(entry);
35058c2ecf20Sopenharmony_ci	ci = lock_cluster_or_swap_info(p, offset);
35068c2ecf20Sopenharmony_ci
35078c2ecf20Sopenharmony_ci	count = p->swap_map[offset];
35088c2ecf20Sopenharmony_ci
35098c2ecf20Sopenharmony_ci	/*
35108c2ecf20Sopenharmony_ci	 * swapin_readahead() doesn't check if a swap entry is valid, so the
35118c2ecf20Sopenharmony_ci	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
35128c2ecf20Sopenharmony_ci	 */
35138c2ecf20Sopenharmony_ci	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
35148c2ecf20Sopenharmony_ci		err = -ENOENT;
35158c2ecf20Sopenharmony_ci		goto unlock_out;
35168c2ecf20Sopenharmony_ci	}
35178c2ecf20Sopenharmony_ci
35188c2ecf20Sopenharmony_ci	has_cache = count & SWAP_HAS_CACHE;
35198c2ecf20Sopenharmony_ci	count &= ~SWAP_HAS_CACHE;
35208c2ecf20Sopenharmony_ci	err = 0;
35218c2ecf20Sopenharmony_ci
35228c2ecf20Sopenharmony_ci	if (usage == SWAP_HAS_CACHE) {
35238c2ecf20Sopenharmony_ci
35248c2ecf20Sopenharmony_ci		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
35258c2ecf20Sopenharmony_ci		if (!has_cache && count)
35268c2ecf20Sopenharmony_ci			has_cache = SWAP_HAS_CACHE;
35278c2ecf20Sopenharmony_ci		else if (has_cache)		/* someone else added cache */
35288c2ecf20Sopenharmony_ci			err = -EEXIST;
35298c2ecf20Sopenharmony_ci		else				/* no users remaining */
35308c2ecf20Sopenharmony_ci			err = -ENOENT;
35318c2ecf20Sopenharmony_ci
35328c2ecf20Sopenharmony_ci	} else if (count || has_cache) {
35338c2ecf20Sopenharmony_ci
35348c2ecf20Sopenharmony_ci		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
35358c2ecf20Sopenharmony_ci			count += usage;
35368c2ecf20Sopenharmony_ci		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
35378c2ecf20Sopenharmony_ci			err = -EINVAL;
35388c2ecf20Sopenharmony_ci		else if (swap_count_continued(p, offset, count))
35398c2ecf20Sopenharmony_ci			count = COUNT_CONTINUED;
35408c2ecf20Sopenharmony_ci		else
35418c2ecf20Sopenharmony_ci			err = -ENOMEM;
35428c2ecf20Sopenharmony_ci	} else
35438c2ecf20Sopenharmony_ci		err = -ENOENT;			/* unused swap entry */
35448c2ecf20Sopenharmony_ci
35458c2ecf20Sopenharmony_ci	WRITE_ONCE(p->swap_map[offset], count | has_cache);
35468c2ecf20Sopenharmony_ci
35478c2ecf20Sopenharmony_ciunlock_out:
35488c2ecf20Sopenharmony_ci	unlock_cluster_or_swap_info(p, ci);
35498c2ecf20Sopenharmony_ciout:
35508c2ecf20Sopenharmony_ci	if (p)
35518c2ecf20Sopenharmony_ci		put_swap_device(p);
35528c2ecf20Sopenharmony_ci	return err;
35538c2ecf20Sopenharmony_ci}
35548c2ecf20Sopenharmony_ci
35558c2ecf20Sopenharmony_ci/*
35568c2ecf20Sopenharmony_ci * Help swapoff by noting that swap entry belongs to shmem/tmpfs
35578c2ecf20Sopenharmony_ci * (in which case its reference count is never incremented).
35588c2ecf20Sopenharmony_ci */
35598c2ecf20Sopenharmony_civoid swap_shmem_alloc(swp_entry_t entry)
35608c2ecf20Sopenharmony_ci{
35618c2ecf20Sopenharmony_ci	__swap_duplicate(entry, SWAP_MAP_SHMEM);
35628c2ecf20Sopenharmony_ci}
35638c2ecf20Sopenharmony_ci
35648c2ecf20Sopenharmony_ci/*
35658c2ecf20Sopenharmony_ci * Increase reference count of swap entry by 1.
35668c2ecf20Sopenharmony_ci * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
35678c2ecf20Sopenharmony_ci * but could not be atomically allocated.  Returns 0, just as if it succeeded,
35688c2ecf20Sopenharmony_ci * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
35698c2ecf20Sopenharmony_ci * might occur if a page table entry has got corrupted.
35708c2ecf20Sopenharmony_ci */
35718c2ecf20Sopenharmony_ciint swap_duplicate(swp_entry_t entry)
35728c2ecf20Sopenharmony_ci{
35738c2ecf20Sopenharmony_ci	int err = 0;
35748c2ecf20Sopenharmony_ci
35758c2ecf20Sopenharmony_ci	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
35768c2ecf20Sopenharmony_ci		err = add_swap_count_continuation(entry, GFP_ATOMIC);
35778c2ecf20Sopenharmony_ci	return err;
35788c2ecf20Sopenharmony_ci}
35798c2ecf20Sopenharmony_ci
35808c2ecf20Sopenharmony_ci/*
35818c2ecf20Sopenharmony_ci * @entry: swap entry for which we allocate swap cache.
35828c2ecf20Sopenharmony_ci *
35838c2ecf20Sopenharmony_ci * Called when allocating swap cache for existing swap entry,
35848c2ecf20Sopenharmony_ci * This can return error codes. Returns 0 at success.
35858c2ecf20Sopenharmony_ci * -EEXIST means there is a swap cache.
35868c2ecf20Sopenharmony_ci * Note: return code is different from swap_duplicate().
35878c2ecf20Sopenharmony_ci */
35888c2ecf20Sopenharmony_ciint swapcache_prepare(swp_entry_t entry)
35898c2ecf20Sopenharmony_ci{
35908c2ecf20Sopenharmony_ci	return __swap_duplicate(entry, SWAP_HAS_CACHE);
35918c2ecf20Sopenharmony_ci}
35928c2ecf20Sopenharmony_ci
35938c2ecf20Sopenharmony_cistruct swap_info_struct *swp_swap_info(swp_entry_t entry)
35948c2ecf20Sopenharmony_ci{
35958c2ecf20Sopenharmony_ci	return swap_type_to_swap_info(swp_type(entry));
35968c2ecf20Sopenharmony_ci}
35978c2ecf20Sopenharmony_ci
35988c2ecf20Sopenharmony_cistruct swap_info_struct *page_swap_info(struct page *page)
35998c2ecf20Sopenharmony_ci{
36008c2ecf20Sopenharmony_ci	swp_entry_t entry = { .val = page_private(page) };
36018c2ecf20Sopenharmony_ci	return swp_swap_info(entry);
36028c2ecf20Sopenharmony_ci}
36038c2ecf20Sopenharmony_ci
36048c2ecf20Sopenharmony_ci/*
36058c2ecf20Sopenharmony_ci * out-of-line __page_file_ methods to avoid include hell.
36068c2ecf20Sopenharmony_ci */
36078c2ecf20Sopenharmony_cistruct address_space *__page_file_mapping(struct page *page)
36088c2ecf20Sopenharmony_ci{
36098c2ecf20Sopenharmony_ci	return page_swap_info(page)->swap_file->f_mapping;
36108c2ecf20Sopenharmony_ci}
36118c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__page_file_mapping);
36128c2ecf20Sopenharmony_ci
36138c2ecf20Sopenharmony_cipgoff_t __page_file_index(struct page *page)
36148c2ecf20Sopenharmony_ci{
36158c2ecf20Sopenharmony_ci	swp_entry_t swap = { .val = page_private(page) };
36168c2ecf20Sopenharmony_ci	return swp_offset(swap);
36178c2ecf20Sopenharmony_ci}
36188c2ecf20Sopenharmony_ciEXPORT_SYMBOL_GPL(__page_file_index);
36198c2ecf20Sopenharmony_ci
36208c2ecf20Sopenharmony_ci/*
36218c2ecf20Sopenharmony_ci * add_swap_count_continuation - called when a swap count is duplicated
36228c2ecf20Sopenharmony_ci * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
36238c2ecf20Sopenharmony_ci * page of the original vmalloc'ed swap_map, to hold the continuation count
36248c2ecf20Sopenharmony_ci * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
36258c2ecf20Sopenharmony_ci * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
36268c2ecf20Sopenharmony_ci *
36278c2ecf20Sopenharmony_ci * These continuation pages are seldom referenced: the common paths all work
36288c2ecf20Sopenharmony_ci * on the original swap_map, only referring to a continuation page when the
36298c2ecf20Sopenharmony_ci * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
36308c2ecf20Sopenharmony_ci *
36318c2ecf20Sopenharmony_ci * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
36328c2ecf20Sopenharmony_ci * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
36338c2ecf20Sopenharmony_ci * can be called after dropping locks.
36348c2ecf20Sopenharmony_ci */
36358c2ecf20Sopenharmony_ciint add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
36368c2ecf20Sopenharmony_ci{
36378c2ecf20Sopenharmony_ci	struct swap_info_struct *si;
36388c2ecf20Sopenharmony_ci	struct swap_cluster_info *ci;
36398c2ecf20Sopenharmony_ci	struct page *head;
36408c2ecf20Sopenharmony_ci	struct page *page;
36418c2ecf20Sopenharmony_ci	struct page *list_page;
36428c2ecf20Sopenharmony_ci	pgoff_t offset;
36438c2ecf20Sopenharmony_ci	unsigned char count;
36448c2ecf20Sopenharmony_ci	int ret = 0;
36458c2ecf20Sopenharmony_ci
36468c2ecf20Sopenharmony_ci	/*
36478c2ecf20Sopenharmony_ci	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
36488c2ecf20Sopenharmony_ci	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
36498c2ecf20Sopenharmony_ci	 */
36508c2ecf20Sopenharmony_ci	page = alloc_page(gfp_mask | __GFP_HIGHMEM);
36518c2ecf20Sopenharmony_ci
36528c2ecf20Sopenharmony_ci	si = get_swap_device(entry);
36538c2ecf20Sopenharmony_ci	if (!si) {
36548c2ecf20Sopenharmony_ci		/*
36558c2ecf20Sopenharmony_ci		 * An acceptable race has occurred since the failing
36568c2ecf20Sopenharmony_ci		 * __swap_duplicate(): the swap device may be swapoff
36578c2ecf20Sopenharmony_ci		 */
36588c2ecf20Sopenharmony_ci		goto outer;
36598c2ecf20Sopenharmony_ci	}
36608c2ecf20Sopenharmony_ci	spin_lock(&si->lock);
36618c2ecf20Sopenharmony_ci
36628c2ecf20Sopenharmony_ci	offset = swp_offset(entry);
36638c2ecf20Sopenharmony_ci
36648c2ecf20Sopenharmony_ci	ci = lock_cluster(si, offset);
36658c2ecf20Sopenharmony_ci
36668c2ecf20Sopenharmony_ci	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
36678c2ecf20Sopenharmony_ci
36688c2ecf20Sopenharmony_ci	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
36698c2ecf20Sopenharmony_ci		/*
36708c2ecf20Sopenharmony_ci		 * The higher the swap count, the more likely it is that tasks
36718c2ecf20Sopenharmony_ci		 * will race to add swap count continuation: we need to avoid
36728c2ecf20Sopenharmony_ci		 * over-provisioning.
36738c2ecf20Sopenharmony_ci		 */
36748c2ecf20Sopenharmony_ci		goto out;
36758c2ecf20Sopenharmony_ci	}
36768c2ecf20Sopenharmony_ci
36778c2ecf20Sopenharmony_ci	if (!page) {
36788c2ecf20Sopenharmony_ci		ret = -ENOMEM;
36798c2ecf20Sopenharmony_ci		goto out;
36808c2ecf20Sopenharmony_ci	}
36818c2ecf20Sopenharmony_ci
36828c2ecf20Sopenharmony_ci	/*
36838c2ecf20Sopenharmony_ci	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
36848c2ecf20Sopenharmony_ci	 * no architecture is using highmem pages for kernel page tables: so it
36858c2ecf20Sopenharmony_ci	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
36868c2ecf20Sopenharmony_ci	 */
36878c2ecf20Sopenharmony_ci	head = vmalloc_to_page(si->swap_map + offset);
36888c2ecf20Sopenharmony_ci	offset &= ~PAGE_MASK;
36898c2ecf20Sopenharmony_ci
36908c2ecf20Sopenharmony_ci	spin_lock(&si->cont_lock);
36918c2ecf20Sopenharmony_ci	/*
36928c2ecf20Sopenharmony_ci	 * Page allocation does not initialize the page's lru field,
36938c2ecf20Sopenharmony_ci	 * but it does always reset its private field.
36948c2ecf20Sopenharmony_ci	 */
36958c2ecf20Sopenharmony_ci	if (!page_private(head)) {
36968c2ecf20Sopenharmony_ci		BUG_ON(count & COUNT_CONTINUED);
36978c2ecf20Sopenharmony_ci		INIT_LIST_HEAD(&head->lru);
36988c2ecf20Sopenharmony_ci		set_page_private(head, SWP_CONTINUED);
36998c2ecf20Sopenharmony_ci		si->flags |= SWP_CONTINUED;
37008c2ecf20Sopenharmony_ci	}
37018c2ecf20Sopenharmony_ci
37028c2ecf20Sopenharmony_ci	list_for_each_entry(list_page, &head->lru, lru) {
37038c2ecf20Sopenharmony_ci		unsigned char *map;
37048c2ecf20Sopenharmony_ci
37058c2ecf20Sopenharmony_ci		/*
37068c2ecf20Sopenharmony_ci		 * If the previous map said no continuation, but we've found
37078c2ecf20Sopenharmony_ci		 * a continuation page, free our allocation and use this one.
37088c2ecf20Sopenharmony_ci		 */
37098c2ecf20Sopenharmony_ci		if (!(count & COUNT_CONTINUED))
37108c2ecf20Sopenharmony_ci			goto out_unlock_cont;
37118c2ecf20Sopenharmony_ci
37128c2ecf20Sopenharmony_ci		map = kmap_atomic(list_page) + offset;
37138c2ecf20Sopenharmony_ci		count = *map;
37148c2ecf20Sopenharmony_ci		kunmap_atomic(map);
37158c2ecf20Sopenharmony_ci
37168c2ecf20Sopenharmony_ci		/*
37178c2ecf20Sopenharmony_ci		 * If this continuation count now has some space in it,
37188c2ecf20Sopenharmony_ci		 * free our allocation and use this one.
37198c2ecf20Sopenharmony_ci		 */
37208c2ecf20Sopenharmony_ci		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
37218c2ecf20Sopenharmony_ci			goto out_unlock_cont;
37228c2ecf20Sopenharmony_ci	}
37238c2ecf20Sopenharmony_ci
37248c2ecf20Sopenharmony_ci	list_add_tail(&page->lru, &head->lru);
37258c2ecf20Sopenharmony_ci	page = NULL;			/* now it's attached, don't free it */
37268c2ecf20Sopenharmony_ciout_unlock_cont:
37278c2ecf20Sopenharmony_ci	spin_unlock(&si->cont_lock);
37288c2ecf20Sopenharmony_ciout:
37298c2ecf20Sopenharmony_ci	unlock_cluster(ci);
37308c2ecf20Sopenharmony_ci	spin_unlock(&si->lock);
37318c2ecf20Sopenharmony_ci	put_swap_device(si);
37328c2ecf20Sopenharmony_ciouter:
37338c2ecf20Sopenharmony_ci	if (page)
37348c2ecf20Sopenharmony_ci		__free_page(page);
37358c2ecf20Sopenharmony_ci	return ret;
37368c2ecf20Sopenharmony_ci}
37378c2ecf20Sopenharmony_ci
37388c2ecf20Sopenharmony_ci/*
37398c2ecf20Sopenharmony_ci * swap_count_continued - when the original swap_map count is incremented
37408c2ecf20Sopenharmony_ci * from SWAP_MAP_MAX, check if there is already a continuation page to carry
37418c2ecf20Sopenharmony_ci * into, carry if so, or else fail until a new continuation page is allocated;
37428c2ecf20Sopenharmony_ci * when the original swap_map count is decremented from 0 with continuation,
37438c2ecf20Sopenharmony_ci * borrow from the continuation and report whether it still holds more.
37448c2ecf20Sopenharmony_ci * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
37458c2ecf20Sopenharmony_ci * lock.
37468c2ecf20Sopenharmony_ci */
37478c2ecf20Sopenharmony_cistatic bool swap_count_continued(struct swap_info_struct *si,
37488c2ecf20Sopenharmony_ci				 pgoff_t offset, unsigned char count)
37498c2ecf20Sopenharmony_ci{
37508c2ecf20Sopenharmony_ci	struct page *head;
37518c2ecf20Sopenharmony_ci	struct page *page;
37528c2ecf20Sopenharmony_ci	unsigned char *map;
37538c2ecf20Sopenharmony_ci	bool ret;
37548c2ecf20Sopenharmony_ci
37558c2ecf20Sopenharmony_ci	head = vmalloc_to_page(si->swap_map + offset);
37568c2ecf20Sopenharmony_ci	if (page_private(head) != SWP_CONTINUED) {
37578c2ecf20Sopenharmony_ci		BUG_ON(count & COUNT_CONTINUED);
37588c2ecf20Sopenharmony_ci		return false;		/* need to add count continuation */
37598c2ecf20Sopenharmony_ci	}
37608c2ecf20Sopenharmony_ci
37618c2ecf20Sopenharmony_ci	spin_lock(&si->cont_lock);
37628c2ecf20Sopenharmony_ci	offset &= ~PAGE_MASK;
37638c2ecf20Sopenharmony_ci	page = list_next_entry(head, lru);
37648c2ecf20Sopenharmony_ci	map = kmap_atomic(page) + offset;
37658c2ecf20Sopenharmony_ci
37668c2ecf20Sopenharmony_ci	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
37678c2ecf20Sopenharmony_ci		goto init_map;		/* jump over SWAP_CONT_MAX checks */
37688c2ecf20Sopenharmony_ci
37698c2ecf20Sopenharmony_ci	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
37708c2ecf20Sopenharmony_ci		/*
37718c2ecf20Sopenharmony_ci		 * Think of how you add 1 to 999
37728c2ecf20Sopenharmony_ci		 */
37738c2ecf20Sopenharmony_ci		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
37748c2ecf20Sopenharmony_ci			kunmap_atomic(map);
37758c2ecf20Sopenharmony_ci			page = list_next_entry(page, lru);
37768c2ecf20Sopenharmony_ci			BUG_ON(page == head);
37778c2ecf20Sopenharmony_ci			map = kmap_atomic(page) + offset;
37788c2ecf20Sopenharmony_ci		}
37798c2ecf20Sopenharmony_ci		if (*map == SWAP_CONT_MAX) {
37808c2ecf20Sopenharmony_ci			kunmap_atomic(map);
37818c2ecf20Sopenharmony_ci			page = list_next_entry(page, lru);
37828c2ecf20Sopenharmony_ci			if (page == head) {
37838c2ecf20Sopenharmony_ci				ret = false;	/* add count continuation */
37848c2ecf20Sopenharmony_ci				goto out;
37858c2ecf20Sopenharmony_ci			}
37868c2ecf20Sopenharmony_ci			map = kmap_atomic(page) + offset;
37878c2ecf20Sopenharmony_ciinit_map:		*map = 0;		/* we didn't zero the page */
37888c2ecf20Sopenharmony_ci		}
37898c2ecf20Sopenharmony_ci		*map += 1;
37908c2ecf20Sopenharmony_ci		kunmap_atomic(map);
37918c2ecf20Sopenharmony_ci		while ((page = list_prev_entry(page, lru)) != head) {
37928c2ecf20Sopenharmony_ci			map = kmap_atomic(page) + offset;
37938c2ecf20Sopenharmony_ci			*map = COUNT_CONTINUED;
37948c2ecf20Sopenharmony_ci			kunmap_atomic(map);
37958c2ecf20Sopenharmony_ci		}
37968c2ecf20Sopenharmony_ci		ret = true;			/* incremented */
37978c2ecf20Sopenharmony_ci
37988c2ecf20Sopenharmony_ci	} else {				/* decrementing */
37998c2ecf20Sopenharmony_ci		/*
38008c2ecf20Sopenharmony_ci		 * Think of how you subtract 1 from 1000
38018c2ecf20Sopenharmony_ci		 */
38028c2ecf20Sopenharmony_ci		BUG_ON(count != COUNT_CONTINUED);
38038c2ecf20Sopenharmony_ci		while (*map == COUNT_CONTINUED) {
38048c2ecf20Sopenharmony_ci			kunmap_atomic(map);
38058c2ecf20Sopenharmony_ci			page = list_next_entry(page, lru);
38068c2ecf20Sopenharmony_ci			BUG_ON(page == head);
38078c2ecf20Sopenharmony_ci			map = kmap_atomic(page) + offset;
38088c2ecf20Sopenharmony_ci		}
38098c2ecf20Sopenharmony_ci		BUG_ON(*map == 0);
38108c2ecf20Sopenharmony_ci		*map -= 1;
38118c2ecf20Sopenharmony_ci		if (*map == 0)
38128c2ecf20Sopenharmony_ci			count = 0;
38138c2ecf20Sopenharmony_ci		kunmap_atomic(map);
38148c2ecf20Sopenharmony_ci		while ((page = list_prev_entry(page, lru)) != head) {
38158c2ecf20Sopenharmony_ci			map = kmap_atomic(page) + offset;
38168c2ecf20Sopenharmony_ci			*map = SWAP_CONT_MAX | count;
38178c2ecf20Sopenharmony_ci			count = COUNT_CONTINUED;
38188c2ecf20Sopenharmony_ci			kunmap_atomic(map);
38198c2ecf20Sopenharmony_ci		}
38208c2ecf20Sopenharmony_ci		ret = count == COUNT_CONTINUED;
38218c2ecf20Sopenharmony_ci	}
38228c2ecf20Sopenharmony_ciout:
38238c2ecf20Sopenharmony_ci	spin_unlock(&si->cont_lock);
38248c2ecf20Sopenharmony_ci	return ret;
38258c2ecf20Sopenharmony_ci}
38268c2ecf20Sopenharmony_ci
38278c2ecf20Sopenharmony_ci/*
38288c2ecf20Sopenharmony_ci * free_swap_count_continuations - swapoff free all the continuation pages
38298c2ecf20Sopenharmony_ci * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
38308c2ecf20Sopenharmony_ci */
38318c2ecf20Sopenharmony_cistatic void free_swap_count_continuations(struct swap_info_struct *si)
38328c2ecf20Sopenharmony_ci{
38338c2ecf20Sopenharmony_ci	pgoff_t offset;
38348c2ecf20Sopenharmony_ci
38358c2ecf20Sopenharmony_ci	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
38368c2ecf20Sopenharmony_ci		struct page *head;
38378c2ecf20Sopenharmony_ci		head = vmalloc_to_page(si->swap_map + offset);
38388c2ecf20Sopenharmony_ci		if (page_private(head)) {
38398c2ecf20Sopenharmony_ci			struct page *page, *next;
38408c2ecf20Sopenharmony_ci
38418c2ecf20Sopenharmony_ci			list_for_each_entry_safe(page, next, &head->lru, lru) {
38428c2ecf20Sopenharmony_ci				list_del(&page->lru);
38438c2ecf20Sopenharmony_ci				__free_page(page);
38448c2ecf20Sopenharmony_ci			}
38458c2ecf20Sopenharmony_ci		}
38468c2ecf20Sopenharmony_ci	}
38478c2ecf20Sopenharmony_ci}
38488c2ecf20Sopenharmony_ci
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
/*
 * cgroup_throttle_swaprate - schedule a blkcg throttle against a swap device
 * @page: page whose node selects which list of available swap devices to scan
 * @gfp_mask: allocation flags of the caller; nothing is done without __GFP_IO
 *
 * Schedules a throttle on the request queue of the first block-device-backed
 * swap area found on the node's available list.
 */
void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;
	int nid = page_to_nid(page);

	/*
	 * Nothing to do if the caller may not do I/O, or if the blk cgroup
	 * is not congested.  If a throttle has already been scheduled for
	 * this task, bail out too, to avoid taking the global swap lock.
	 */
	if (!(gfp_mask & __GFP_IO) || !blk_cgroup_congested() ||
	    current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		/* Skip swap areas with no block device to throttle on. */
		if (!si->bdev)
			continue;

		blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
		break;
	}
	spin_unlock(&swap_avail_lock);
}
#endif
38798c2ecf20Sopenharmony_ci
38808c2ecf20Sopenharmony_cistatic int __init swapfile_init(void)
38818c2ecf20Sopenharmony_ci{
38828c2ecf20Sopenharmony_ci	int nid;
38838c2ecf20Sopenharmony_ci
38848c2ecf20Sopenharmony_ci	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
38858c2ecf20Sopenharmony_ci					 GFP_KERNEL);
38868c2ecf20Sopenharmony_ci	if (!swap_avail_heads) {
38878c2ecf20Sopenharmony_ci		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
38888c2ecf20Sopenharmony_ci		return -ENOMEM;
38898c2ecf20Sopenharmony_ci	}
38908c2ecf20Sopenharmony_ci
38918c2ecf20Sopenharmony_ci	for_each_node(nid)
38928c2ecf20Sopenharmony_ci		plist_head_init(&swap_avail_heads[nid]);
38938c2ecf20Sopenharmony_ci
38948c2ecf20Sopenharmony_ci	return 0;
38958c2ecf20Sopenharmony_ci}
38968c2ecf20Sopenharmony_cisubsys_initcall(swapfile_init);
3897