162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Memory merging support.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * This code enables dynamic sharing of identical pages found in different
662306a36Sopenharmony_ci * memory areas, even if they are not shared by fork()
762306a36Sopenharmony_ci *
862306a36Sopenharmony_ci * Copyright (C) 2008-2009 Red Hat, Inc.
962306a36Sopenharmony_ci * Authors:
1062306a36Sopenharmony_ci *	Izik Eidus
1162306a36Sopenharmony_ci *	Andrea Arcangeli
1262306a36Sopenharmony_ci *	Chris Wright
1362306a36Sopenharmony_ci *	Hugh Dickins
1462306a36Sopenharmony_ci */
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci#include <linux/errno.h>
1762306a36Sopenharmony_ci#include <linux/mm.h>
1862306a36Sopenharmony_ci#include <linux/mm_inline.h>
1962306a36Sopenharmony_ci#include <linux/fs.h>
2062306a36Sopenharmony_ci#include <linux/mman.h>
2162306a36Sopenharmony_ci#include <linux/sched.h>
2262306a36Sopenharmony_ci#include <linux/sched/mm.h>
2362306a36Sopenharmony_ci#include <linux/sched/coredump.h>
2462306a36Sopenharmony_ci#include <linux/rwsem.h>
2562306a36Sopenharmony_ci#include <linux/pagemap.h>
2662306a36Sopenharmony_ci#include <linux/rmap.h>
2762306a36Sopenharmony_ci#include <linux/spinlock.h>
2862306a36Sopenharmony_ci#include <linux/xxhash.h>
2962306a36Sopenharmony_ci#include <linux/delay.h>
3062306a36Sopenharmony_ci#include <linux/kthread.h>
3162306a36Sopenharmony_ci#include <linux/wait.h>
3262306a36Sopenharmony_ci#include <linux/slab.h>
3362306a36Sopenharmony_ci#include <linux/rbtree.h>
3462306a36Sopenharmony_ci#include <linux/memory.h>
3562306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
3662306a36Sopenharmony_ci#include <linux/swap.h>
3762306a36Sopenharmony_ci#include <linux/ksm.h>
3862306a36Sopenharmony_ci#include <linux/hashtable.h>
3962306a36Sopenharmony_ci#include <linux/freezer.h>
4062306a36Sopenharmony_ci#include <linux/oom.h>
4162306a36Sopenharmony_ci#include <linux/numa.h>
4262306a36Sopenharmony_ci#include <linux/pagewalk.h>
4362306a36Sopenharmony_ci
4462306a36Sopenharmony_ci#include <asm/tlbflush.h>
4562306a36Sopenharmony_ci#include "internal.h"
4662306a36Sopenharmony_ci#include "mm_slot.h"
4762306a36Sopenharmony_ci
4862306a36Sopenharmony_ci#define CREATE_TRACE_POINTS
4962306a36Sopenharmony_ci#include <trace/events/ksm.h>
5062306a36Sopenharmony_ci
#ifdef CONFIG_NUMA
/* NUMA(x) yields the node id x; DO_NUMA(x) evaluates x for its side effects. */
#define NUMA(x)		(x)
#define DO_NUMA(x)	do { (x); } while (0)
#else
/* Without NUMA there is a single node 0 and per-node bookkeeping is a no-op. */
#define NUMA(x)		(0)
#define DO_NUMA(x)	do { } while (0)
#endif
5862306a36Sopenharmony_ci
5962306a36Sopenharmony_ci/**
6062306a36Sopenharmony_ci * DOC: Overview
6162306a36Sopenharmony_ci *
6262306a36Sopenharmony_ci * A few notes about the KSM scanning process,
6362306a36Sopenharmony_ci * to make it easier to understand the data structures below:
6462306a36Sopenharmony_ci *
6562306a36Sopenharmony_ci * In order to reduce excessive scanning, KSM sorts the memory pages by their
6662306a36Sopenharmony_ci * contents into a data structure that holds pointers to the pages' locations.
6762306a36Sopenharmony_ci *
6862306a36Sopenharmony_ci * Since the contents of the pages may change at any moment, KSM cannot just
6962306a36Sopenharmony_ci * insert the pages into a normal sorted tree and expect it to find anything.
7062306a36Sopenharmony_ci * Therefore KSM uses two data structures - the stable and the unstable tree.
7162306a36Sopenharmony_ci *
7262306a36Sopenharmony_ci * The stable tree holds pointers to all the merged pages (ksm pages), sorted
7362306a36Sopenharmony_ci * by their contents.  Because each such page is write-protected, searching on
7462306a36Sopenharmony_ci * this tree is fully assured to be working (except when pages are unmapped),
7562306a36Sopenharmony_ci * and therefore this tree is called the stable tree.
7662306a36Sopenharmony_ci *
7762306a36Sopenharmony_ci * The stable tree node includes information required for reverse
7862306a36Sopenharmony_ci * mapping from a KSM page to virtual addresses that map this page.
7962306a36Sopenharmony_ci *
8062306a36Sopenharmony_ci * In order to avoid large latencies of the rmap walks on KSM pages,
8162306a36Sopenharmony_ci * KSM maintains two types of nodes in the stable tree:
8262306a36Sopenharmony_ci *
8362306a36Sopenharmony_ci * * the regular nodes that keep the reverse mapping structures in a
8462306a36Sopenharmony_ci *   linked list
8562306a36Sopenharmony_ci * * the "chains" that link nodes ("dups") that represent the same
8662306a36Sopenharmony_ci *   write protected memory content, but each "dup" corresponds to a
8762306a36Sopenharmony_ci *   different KSM page copy of that content
8862306a36Sopenharmony_ci *
8962306a36Sopenharmony_ci * Internally, the regular nodes, "dups" and "chains" are represented
9062306a36Sopenharmony_ci * using the same struct ksm_stable_node structure.
9162306a36Sopenharmony_ci *
9262306a36Sopenharmony_ci * In addition to the stable tree, KSM uses a second data structure called the
9362306a36Sopenharmony_ci * unstable tree: this tree holds pointers to pages which have been found to
9462306a36Sopenharmony_ci * be "unchanged for a period of time".  The unstable tree sorts these pages
9562306a36Sopenharmony_ci * by their contents, but since they are not write-protected, KSM cannot rely
9662306a36Sopenharmony_ci * upon the unstable tree to work correctly - the unstable tree is liable to
9762306a36Sopenharmony_ci * be corrupted as its contents are modified, and so it is called unstable.
9862306a36Sopenharmony_ci *
9962306a36Sopenharmony_ci * KSM solves this problem by several techniques:
10062306a36Sopenharmony_ci *
10162306a36Sopenharmony_ci * 1) The unstable tree is flushed every time KSM completes scanning all
10262306a36Sopenharmony_ci *    memory areas, and then the tree is rebuilt again from the beginning.
10362306a36Sopenharmony_ci * 2) KSM will only insert into the unstable tree, pages whose hash value
10462306a36Sopenharmony_ci *    has not changed since the previous scan of all memory areas.
10562306a36Sopenharmony_ci * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
10662306a36Sopenharmony_ci *    colors of the nodes and not on their contents, assuring that even when
10762306a36Sopenharmony_ci *    the tree gets "corrupted" it won't get out of balance, so scanning time
10862306a36Sopenharmony_ci *    remains the same (also, searching and inserting nodes in an rbtree uses
10962306a36Sopenharmony_ci *    the same algorithm, so we have no overhead when we flush and rebuild).
11062306a36Sopenharmony_ci * 4) KSM never flushes the stable tree, which means that even if it were to
11162306a36Sopenharmony_ci *    take 10 attempts to find a page in the unstable tree, once it is found,
11262306a36Sopenharmony_ci *    it is secured in the stable tree.  (When we scan a new page, we first
11362306a36Sopenharmony_ci *    compare it against the stable tree, and then against the unstable tree.)
11462306a36Sopenharmony_ci *
11562306a36Sopenharmony_ci * If the merge_across_nodes tunable is unset, then KSM maintains multiple
11662306a36Sopenharmony_ci * stable trees and multiple unstable trees: one of each for each NUMA node.
11762306a36Sopenharmony_ci */
11862306a36Sopenharmony_ci
/**
 * struct ksm_mm_slot - ksm information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 */
struct ksm_mm_slot {
	struct mm_slot slot;			/* generic mm_slot (hash + list linkage) */
	struct ksm_rmap_item *rmap_list;	/* rmap_items for this mm, in scan order */
};
12862306a36Sopenharmony_ci
/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct ksm_mm_slot *mm_slot;
	unsigned long address;
	struct ksm_rmap_item **rmap_list;	/* double indirection into the singly-linked rmap_list */
	unsigned long seqnr;
};
14462306a36Sopenharmony_ci
/**
 * struct ksm_stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct ksm_stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;		/* regular node or dup: pfn of the ksm page */
		unsigned long chain_prune_time;	/* chain head: last garbage collection (see kerneldoc) */
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};
18462306a36Sopenharmony_ci
/**
 * struct ksm_rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct ksm_rmap_item {
	struct ksm_rmap_item *rmap_list;
	union {		/* which member is live depends on the tree membership below */
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct ksm_stable_node *head;
			struct hlist_node hlist;
		};
	};
};
21662306a36Sopenharmony_ci
/* Flags stored in the low bits of ksm_rmap_item.address */
#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
/* Sentinel value: a stable_node whose ->head equals this is a "dup" in a chain */
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
23062306a36Sopenharmony_ci
/* Hash from mm_struct to its ksm_mm_slot */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Dummy head anchoring the circular list of mm_slots being scanned */
static struct ksm_mm_slot ksm_mm_head = {
	.slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
/* The single scan cursor, starting at the list head */
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

/* Slab caches, created by ksm_slab_init() */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;
24462306a36Sopenharmony_ci
/* The number of pages scanned */
static unsigned long ksm_pages_scanned;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

/* The number of zero pages which is placed by KSM */
unsigned long ksm_zero_pages;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

/* ksmd run states; KSM_RUN_OFFLINE presumably or'ed in while memory is
 * offlining — see wait_while_offlining(), confirm against its definition. */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/* Create a slab cache named after and sized/aligned for struct __struct */
#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_cistatic int __init ksm_slab_init(void)
31362306a36Sopenharmony_ci{
31462306a36Sopenharmony_ci	rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
31562306a36Sopenharmony_ci	if (!rmap_item_cache)
31662306a36Sopenharmony_ci		goto out;
31762306a36Sopenharmony_ci
31862306a36Sopenharmony_ci	stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
31962306a36Sopenharmony_ci	if (!stable_node_cache)
32062306a36Sopenharmony_ci		goto out_free1;
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci	mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
32362306a36Sopenharmony_ci	if (!mm_slot_cache)
32462306a36Sopenharmony_ci		goto out_free2;
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_ci	return 0;
32762306a36Sopenharmony_ci
32862306a36Sopenharmony_ciout_free2:
32962306a36Sopenharmony_ci	kmem_cache_destroy(stable_node_cache);
33062306a36Sopenharmony_ciout_free1:
33162306a36Sopenharmony_ci	kmem_cache_destroy(rmap_item_cache);
33262306a36Sopenharmony_ciout:
33362306a36Sopenharmony_ci	return -ENOMEM;
33462306a36Sopenharmony_ci}
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_cistatic void __init ksm_slab_free(void)
33762306a36Sopenharmony_ci{
33862306a36Sopenharmony_ci	kmem_cache_destroy(mm_slot_cache);
33962306a36Sopenharmony_ci	kmem_cache_destroy(stable_node_cache);
34062306a36Sopenharmony_ci	kmem_cache_destroy(rmap_item_cache);
34162306a36Sopenharmony_ci	mm_slot_cache = NULL;
34262306a36Sopenharmony_ci}
34362306a36Sopenharmony_ci
34462306a36Sopenharmony_cistatic __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
34562306a36Sopenharmony_ci{
34662306a36Sopenharmony_ci	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
34762306a36Sopenharmony_ci}
34862306a36Sopenharmony_ci
34962306a36Sopenharmony_cistatic __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
35062306a36Sopenharmony_ci{
35162306a36Sopenharmony_ci	return dup->head == STABLE_NODE_DUP_HEAD;
35262306a36Sopenharmony_ci}
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_cistatic inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
35562306a36Sopenharmony_ci					     struct ksm_stable_node *chain)
35662306a36Sopenharmony_ci{
35762306a36Sopenharmony_ci	VM_BUG_ON(is_stable_node_dup(dup));
35862306a36Sopenharmony_ci	dup->head = STABLE_NODE_DUP_HEAD;
35962306a36Sopenharmony_ci	VM_BUG_ON(!is_stable_node_chain(chain));
36062306a36Sopenharmony_ci	hlist_add_head(&dup->hlist_dup, &chain->hlist);
36162306a36Sopenharmony_ci	ksm_stable_node_dups++;
36262306a36Sopenharmony_ci}
36362306a36Sopenharmony_ci
/* Unlink a dup from its chain's hlist and drop the global dup count. */
static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}
37062306a36Sopenharmony_ci
/*
 * Remove a stable node that is either a dup (unlink from its chain) or a
 * regular node (erase from its per-node stable rbtree).  Must not be
 * called on a chain head.
 */
static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;	/* poison ->head to catch use after removal */
#endif
}
38262306a36Sopenharmony_ci
38362306a36Sopenharmony_cistatic inline struct ksm_rmap_item *alloc_rmap_item(void)
38462306a36Sopenharmony_ci{
38562306a36Sopenharmony_ci	struct ksm_rmap_item *rmap_item;
38662306a36Sopenharmony_ci
38762306a36Sopenharmony_ci	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
38862306a36Sopenharmony_ci						__GFP_NORETRY | __GFP_NOWARN);
38962306a36Sopenharmony_ci	if (rmap_item)
39062306a36Sopenharmony_ci		ksm_rmap_items++;
39162306a36Sopenharmony_ci	return rmap_item;
39262306a36Sopenharmony_ci}
39362306a36Sopenharmony_ci
/*
 * Free a ksm_rmap_item, dropping both the global count and the owning
 * mm's ksm_rmap_items count.  The mm pointer must still be valid here.
 */
static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm->ksm_rmap_items--;	/* per-mm accounting */
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}
40162306a36Sopenharmony_ci
/* Allocate a ksm_stable_node; returns NULL on failure. */
static inline struct ksm_stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is under
	 * pressure, which may lead to hung task warnings.  Adding __GFP_HIGH
	 * grants access to memory reserves, helping to avoid this problem.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}
41162306a36Sopenharmony_ci
/*
 * Free a stable node.  A non-chain node must have an empty hlist
 * (rmap_hlist_len == 0) by the time it is freed.
 */
static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci/*
42062306a36Sopenharmony_ci * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
42162306a36Sopenharmony_ci * page tables after it has passed through ksm_exit() - which, if necessary,
42262306a36Sopenharmony_ci * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
42362306a36Sopenharmony_ci * a special flag: they can just back out as soon as mm_users goes to zero.
42462306a36Sopenharmony_ci * ksm_test_exit() is used throughout to make this test for exit: in some
42562306a36Sopenharmony_ci * places for correctness, in some places just to avoid unnecessary work.
42662306a36Sopenharmony_ci */
42762306a36Sopenharmony_cistatic inline bool ksm_test_exit(struct mm_struct *mm)
42862306a36Sopenharmony_ci{
42962306a36Sopenharmony_ci	return atomic_read(&mm->mm_users) == 0;
43062306a36Sopenharmony_ci}
43162306a36Sopenharmony_ci
43262306a36Sopenharmony_cistatic int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
43362306a36Sopenharmony_ci			struct mm_walk *walk)
43462306a36Sopenharmony_ci{
43562306a36Sopenharmony_ci	struct page *page = NULL;
43662306a36Sopenharmony_ci	spinlock_t *ptl;
43762306a36Sopenharmony_ci	pte_t *pte;
43862306a36Sopenharmony_ci	pte_t ptent;
43962306a36Sopenharmony_ci	int ret;
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
44262306a36Sopenharmony_ci	if (!pte)
44362306a36Sopenharmony_ci		return 0;
44462306a36Sopenharmony_ci	ptent = ptep_get(pte);
44562306a36Sopenharmony_ci	if (pte_present(ptent)) {
44662306a36Sopenharmony_ci		page = vm_normal_page(walk->vma, addr, ptent);
44762306a36Sopenharmony_ci	} else if (!pte_none(ptent)) {
44862306a36Sopenharmony_ci		swp_entry_t entry = pte_to_swp_entry(ptent);
44962306a36Sopenharmony_ci
45062306a36Sopenharmony_ci		/*
45162306a36Sopenharmony_ci		 * As KSM pages remain KSM pages until freed, no need to wait
45262306a36Sopenharmony_ci		 * here for migration to end.
45362306a36Sopenharmony_ci		 */
45462306a36Sopenharmony_ci		if (is_migration_entry(entry))
45562306a36Sopenharmony_ci			page = pfn_swap_entry_to_page(entry);
45662306a36Sopenharmony_ci	}
45762306a36Sopenharmony_ci	/* return 1 if the page is an normal ksm page or KSM-placed zero page */
45862306a36Sopenharmony_ci	ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte);
45962306a36Sopenharmony_ci	pte_unmap_unlock(pte, ptl);
46062306a36Sopenharmony_ci	return ret;
46162306a36Sopenharmony_ci}
46262306a36Sopenharmony_ci
/*
 * Two walker variants share the same pmd_entry callback; walk_lock selects
 * the mmap-lock mode the walk requires, chosen by break_ksm()'s lock_vma
 * argument.
 */
static const struct mm_walk_ops break_ksm_ops = {
	.pmd_entry = break_ksm_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};

static const struct mm_walk_ops break_ksm_lock_vma_ops = {
	.pmd_entry = break_ksm_pmd_entry,
	.walk_lock = PGWALK_WRLOCK,
};
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci/*
47462306a36Sopenharmony_ci * We use break_ksm to break COW on a ksm page by triggering unsharing,
47562306a36Sopenharmony_ci * such that the ksm page will get replaced by an exclusive anonymous page.
47662306a36Sopenharmony_ci *
47762306a36Sopenharmony_ci * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
47862306a36Sopenharmony_ci * in case the application has unmapped and remapped mm,addr meanwhile.
47962306a36Sopenharmony_ci * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
48062306a36Sopenharmony_ci * mmap of /dev/mem, where we would not want to touch it.
48162306a36Sopenharmony_ci *
48262306a36Sopenharmony_ci * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
48362306a36Sopenharmony_ci * of the process that owns 'vma'.  We also do not want to enforce
48462306a36Sopenharmony_ci * protection keys here anyway.
48562306a36Sopenharmony_ci */
/* See the block comment above for the contract and caveats of break_ksm(). */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
	vm_fault_t ret = 0;
	const struct mm_walk_ops *ops = lock_vma ?
				&break_ksm_lock_vma_ops : &break_ksm_ops;

	do {
		int ksm_page;

		cond_resched();
		/* Probe just this one address for a KSM page (walk of addr..addr+1). */
		ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
		if (WARN_ON_ONCE(ksm_page < 0))
			return ksm_page;
		if (!ksm_page)
			return 0;
		/* Unshare: fault in an exclusive anonymous copy in place of the KSM page. */
		ret = handle_mm_fault(vma, addr,
				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
				      NULL);
	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop until we no longer find a KSM page because
	 * handle_mm_fault() may back out if there's any difficulty e.g. if
	 * pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_cistatic bool vma_ksm_compatible(struct vm_area_struct *vma)
53462306a36Sopenharmony_ci{
53562306a36Sopenharmony_ci	if (vma->vm_flags & (VM_SHARED  | VM_MAYSHARE   | VM_PFNMAP  |
53662306a36Sopenharmony_ci			     VM_IO      | VM_DONTEXPAND | VM_HUGETLB |
53762306a36Sopenharmony_ci			     VM_MIXEDMAP))
53862306a36Sopenharmony_ci		return false;		/* just ignore the advice */
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ci	if (vma_is_dax(vma))
54162306a36Sopenharmony_ci		return false;
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_ci#ifdef VM_SAO
54462306a36Sopenharmony_ci	if (vma->vm_flags & VM_SAO)
54562306a36Sopenharmony_ci		return false;
54662306a36Sopenharmony_ci#endif
54762306a36Sopenharmony_ci#ifdef VM_SPARC_ADI
54862306a36Sopenharmony_ci	if (vma->vm_flags & VM_SPARC_ADI)
54962306a36Sopenharmony_ci		return false;
55062306a36Sopenharmony_ci#endif
55162306a36Sopenharmony_ci
55262306a36Sopenharmony_ci	return true;
55362306a36Sopenharmony_ci}
55462306a36Sopenharmony_ci
55562306a36Sopenharmony_cistatic struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
55662306a36Sopenharmony_ci		unsigned long addr)
55762306a36Sopenharmony_ci{
55862306a36Sopenharmony_ci	struct vm_area_struct *vma;
55962306a36Sopenharmony_ci	if (ksm_test_exit(mm))
56062306a36Sopenharmony_ci		return NULL;
56162306a36Sopenharmony_ci	vma = vma_lookup(mm, addr);
56262306a36Sopenharmony_ci	if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
56362306a36Sopenharmony_ci		return NULL;
56462306a36Sopenharmony_ci	return vma;
56562306a36Sopenharmony_ci}
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_cistatic void break_cow(struct ksm_rmap_item *rmap_item)
56862306a36Sopenharmony_ci{
56962306a36Sopenharmony_ci	struct mm_struct *mm = rmap_item->mm;
57062306a36Sopenharmony_ci	unsigned long addr = rmap_item->address;
57162306a36Sopenharmony_ci	struct vm_area_struct *vma;
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci	/*
57462306a36Sopenharmony_ci	 * It is not an accident that whenever we want to break COW
57562306a36Sopenharmony_ci	 * to undo, we also need to drop a reference to the anon_vma.
57662306a36Sopenharmony_ci	 */
57762306a36Sopenharmony_ci	put_anon_vma(rmap_item->anon_vma);
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci	mmap_read_lock(mm);
58062306a36Sopenharmony_ci	vma = find_mergeable_vma(mm, addr);
58162306a36Sopenharmony_ci	if (vma)
58262306a36Sopenharmony_ci		break_ksm(vma, addr, false);
58362306a36Sopenharmony_ci	mmap_read_unlock(mm);
58462306a36Sopenharmony_ci}
58562306a36Sopenharmony_ci
58662306a36Sopenharmony_cistatic struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
58762306a36Sopenharmony_ci{
58862306a36Sopenharmony_ci	struct mm_struct *mm = rmap_item->mm;
58962306a36Sopenharmony_ci	unsigned long addr = rmap_item->address;
59062306a36Sopenharmony_ci	struct vm_area_struct *vma;
59162306a36Sopenharmony_ci	struct page *page;
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_ci	mmap_read_lock(mm);
59462306a36Sopenharmony_ci	vma = find_mergeable_vma(mm, addr);
59562306a36Sopenharmony_ci	if (!vma)
59662306a36Sopenharmony_ci		goto out;
59762306a36Sopenharmony_ci
59862306a36Sopenharmony_ci	page = follow_page(vma, addr, FOLL_GET);
59962306a36Sopenharmony_ci	if (IS_ERR_OR_NULL(page))
60062306a36Sopenharmony_ci		goto out;
60162306a36Sopenharmony_ci	if (is_zone_device_page(page))
60262306a36Sopenharmony_ci		goto out_putpage;
60362306a36Sopenharmony_ci	if (PageAnon(page)) {
60462306a36Sopenharmony_ci		flush_anon_page(vma, page, addr);
60562306a36Sopenharmony_ci		flush_dcache_page(page);
60662306a36Sopenharmony_ci	} else {
60762306a36Sopenharmony_ciout_putpage:
60862306a36Sopenharmony_ci		put_page(page);
60962306a36Sopenharmony_ciout:
61062306a36Sopenharmony_ci		page = NULL;
61162306a36Sopenharmony_ci	}
61262306a36Sopenharmony_ci	mmap_read_unlock(mm);
61362306a36Sopenharmony_ci	return page;
61462306a36Sopenharmony_ci}
61562306a36Sopenharmony_ci
61662306a36Sopenharmony_ci/*
61762306a36Sopenharmony_ci * This helper is used for getting right index into array of tree roots.
61862306a36Sopenharmony_ci * When merge_across_nodes knob is set to 1, there are only two rb-trees for
61962306a36Sopenharmony_ci * stable and unstable pages from all nodes with roots in index 0. Otherwise,
62062306a36Sopenharmony_ci * every node has its own stable and unstable tree.
62162306a36Sopenharmony_ci */
62262306a36Sopenharmony_cistatic inline int get_kpfn_nid(unsigned long kpfn)
62362306a36Sopenharmony_ci{
62462306a36Sopenharmony_ci	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
62562306a36Sopenharmony_ci}
62662306a36Sopenharmony_ci
62762306a36Sopenharmony_cistatic struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
62862306a36Sopenharmony_ci						   struct rb_root *root)
62962306a36Sopenharmony_ci{
63062306a36Sopenharmony_ci	struct ksm_stable_node *chain = alloc_stable_node();
63162306a36Sopenharmony_ci	VM_BUG_ON(is_stable_node_chain(dup));
63262306a36Sopenharmony_ci	if (likely(chain)) {
63362306a36Sopenharmony_ci		INIT_HLIST_HEAD(&chain->hlist);
63462306a36Sopenharmony_ci		chain->chain_prune_time = jiffies;
63562306a36Sopenharmony_ci		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
63662306a36Sopenharmony_ci#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
63762306a36Sopenharmony_ci		chain->nid = NUMA_NO_NODE; /* debug */
63862306a36Sopenharmony_ci#endif
63962306a36Sopenharmony_ci		ksm_stable_node_chains++;
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci		/*
64262306a36Sopenharmony_ci		 * Put the stable node chain in the first dimension of
64362306a36Sopenharmony_ci		 * the stable tree and at the same time remove the old
64462306a36Sopenharmony_ci		 * stable node.
64562306a36Sopenharmony_ci		 */
64662306a36Sopenharmony_ci		rb_replace_node(&dup->node, &chain->node, root);
64762306a36Sopenharmony_ci
64862306a36Sopenharmony_ci		/*
64962306a36Sopenharmony_ci		 * Move the old stable node to the second dimension
65062306a36Sopenharmony_ci		 * queued in the hlist_dup. The invariant is that all
65162306a36Sopenharmony_ci		 * dup stable_nodes in the chain->hlist point to pages
65262306a36Sopenharmony_ci		 * that are write protected and have the exact same
65362306a36Sopenharmony_ci		 * content.
65462306a36Sopenharmony_ci		 */
65562306a36Sopenharmony_ci		stable_node_chain_add_dup(dup, chain);
65662306a36Sopenharmony_ci	}
65762306a36Sopenharmony_ci	return chain;
65862306a36Sopenharmony_ci}
65962306a36Sopenharmony_ci
66062306a36Sopenharmony_cistatic inline void free_stable_node_chain(struct ksm_stable_node *chain,
66162306a36Sopenharmony_ci					  struct rb_root *root)
66262306a36Sopenharmony_ci{
66362306a36Sopenharmony_ci	rb_erase(&chain->node, root);
66462306a36Sopenharmony_ci	free_stable_node(chain);
66562306a36Sopenharmony_ci	ksm_stable_node_chains--;
66662306a36Sopenharmony_ci}
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_cistatic void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
66962306a36Sopenharmony_ci{
67062306a36Sopenharmony_ci	struct ksm_rmap_item *rmap_item;
67162306a36Sopenharmony_ci
67262306a36Sopenharmony_ci	/* check it's not STABLE_NODE_CHAIN or negative */
67362306a36Sopenharmony_ci	BUG_ON(stable_node->rmap_hlist_len < 0);
67462306a36Sopenharmony_ci
67562306a36Sopenharmony_ci	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
67662306a36Sopenharmony_ci		if (rmap_item->hlist.next) {
67762306a36Sopenharmony_ci			ksm_pages_sharing--;
67862306a36Sopenharmony_ci			trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
67962306a36Sopenharmony_ci		} else {
68062306a36Sopenharmony_ci			ksm_pages_shared--;
68162306a36Sopenharmony_ci		}
68262306a36Sopenharmony_ci
68362306a36Sopenharmony_ci		rmap_item->mm->ksm_merging_pages--;
68462306a36Sopenharmony_ci
68562306a36Sopenharmony_ci		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
68662306a36Sopenharmony_ci		stable_node->rmap_hlist_len--;
68762306a36Sopenharmony_ci		put_anon_vma(rmap_item->anon_vma);
68862306a36Sopenharmony_ci		rmap_item->address &= PAGE_MASK;
68962306a36Sopenharmony_ci		cond_resched();
69062306a36Sopenharmony_ci	}
69162306a36Sopenharmony_ci
69262306a36Sopenharmony_ci	/*
69362306a36Sopenharmony_ci	 * We need the second aligned pointer of the migrate_nodes
69462306a36Sopenharmony_ci	 * list_head to stay clear from the rb_parent_color union
69562306a36Sopenharmony_ci	 * (aligned and different than any node) and also different
69662306a36Sopenharmony_ci	 * from &migrate_nodes. This will verify that future list.h changes
69762306a36Sopenharmony_ci	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
69862306a36Sopenharmony_ci	 */
69962306a36Sopenharmony_ci	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
70062306a36Sopenharmony_ci	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci	trace_ksm_remove_ksm_page(stable_node->kpfn);
70362306a36Sopenharmony_ci	if (stable_node->head == &migrate_nodes)
70462306a36Sopenharmony_ci		list_del(&stable_node->list);
70562306a36Sopenharmony_ci	else
70662306a36Sopenharmony_ci		stable_node_dup_del(stable_node);
70762306a36Sopenharmony_ci	free_stable_node(stable_node);
70862306a36Sopenharmony_ci}
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_cienum get_ksm_page_flags {
71162306a36Sopenharmony_ci	GET_KSM_PAGE_NOLOCK,
71262306a36Sopenharmony_ci	GET_KSM_PAGE_LOCK,
71362306a36Sopenharmony_ci	GET_KSM_PAGE_TRYLOCK
71462306a36Sopenharmony_ci};
71562306a36Sopenharmony_ci
71662306a36Sopenharmony_ci/*
71762306a36Sopenharmony_ci * get_ksm_page: checks if the page indicated by the stable node
71862306a36Sopenharmony_ci * is still its ksm page, despite having held no reference to it.
71962306a36Sopenharmony_ci * In which case we can trust the content of the page, and it
72062306a36Sopenharmony_ci * returns the gotten page; but if the page has now been zapped,
72162306a36Sopenharmony_ci * remove the stale node from the stable tree and return NULL.
72262306a36Sopenharmony_ci * But beware, the stable node's page might be being migrated.
72362306a36Sopenharmony_ci *
72462306a36Sopenharmony_ci * You would expect the stable_node to hold a reference to the ksm page.
72562306a36Sopenharmony_ci * But if it increments the page's count, swapping out has to wait for
72662306a36Sopenharmony_ci * ksmd to come around again before it can free the page, which may take
72762306a36Sopenharmony_ci * seconds or even minutes: much too unresponsive.  So instead we use a
72862306a36Sopenharmony_ci * "keyhole reference": access to the ksm page from the stable node peeps
72962306a36Sopenharmony_ci * out through its keyhole to see if that page still holds the right key,
73062306a36Sopenharmony_ci * pointing back to this stable node.  This relies on freeing a PageAnon
73162306a36Sopenharmony_ci * page to reset its page->mapping to NULL, and relies on no other use of
73262306a36Sopenharmony_ci * a page to put something that might look like our key in page->mapping.
73362306a36Sopenharmony_ci * is on its way to being freed; but it is an anomaly to bear in mind.
73462306a36Sopenharmony_ci */
73562306a36Sopenharmony_cistatic struct page *get_ksm_page(struct ksm_stable_node *stable_node,
73662306a36Sopenharmony_ci				 enum get_ksm_page_flags flags)
73762306a36Sopenharmony_ci{
73862306a36Sopenharmony_ci	struct page *page;
73962306a36Sopenharmony_ci	void *expected_mapping;
74062306a36Sopenharmony_ci	unsigned long kpfn;
74162306a36Sopenharmony_ci
74262306a36Sopenharmony_ci	expected_mapping = (void *)((unsigned long)stable_node |
74362306a36Sopenharmony_ci					PAGE_MAPPING_KSM);
74462306a36Sopenharmony_ciagain:
74562306a36Sopenharmony_ci	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
74662306a36Sopenharmony_ci	page = pfn_to_page(kpfn);
74762306a36Sopenharmony_ci	if (READ_ONCE(page->mapping) != expected_mapping)
74862306a36Sopenharmony_ci		goto stale;
74962306a36Sopenharmony_ci
75062306a36Sopenharmony_ci	/*
75162306a36Sopenharmony_ci	 * We cannot do anything with the page while its refcount is 0.
75262306a36Sopenharmony_ci	 * Usually 0 means free, or tail of a higher-order page: in which
75362306a36Sopenharmony_ci	 * case this node is no longer referenced, and should be freed;
75462306a36Sopenharmony_ci	 * however, it might mean that the page is under page_ref_freeze().
75562306a36Sopenharmony_ci	 * The __remove_mapping() case is easy, again the node is now stale;
75662306a36Sopenharmony_ci	 * the same is in reuse_ksm_page() case; but if page is swapcache
75762306a36Sopenharmony_ci	 * in folio_migrate_mapping(), it might still be our page,
75862306a36Sopenharmony_ci	 * in which case it's essential to keep the node.
75962306a36Sopenharmony_ci	 */
76062306a36Sopenharmony_ci	while (!get_page_unless_zero(page)) {
76162306a36Sopenharmony_ci		/*
76262306a36Sopenharmony_ci		 * Another check for page->mapping != expected_mapping would
76362306a36Sopenharmony_ci		 * work here too.  We have chosen the !PageSwapCache test to
76462306a36Sopenharmony_ci		 * optimize the common case, when the page is or is about to
76562306a36Sopenharmony_ci		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
76662306a36Sopenharmony_ci		 * in the ref_freeze section of __remove_mapping(); but Anon
76762306a36Sopenharmony_ci		 * page->mapping reset to NULL later, in free_pages_prepare().
76862306a36Sopenharmony_ci		 */
76962306a36Sopenharmony_ci		if (!PageSwapCache(page))
77062306a36Sopenharmony_ci			goto stale;
77162306a36Sopenharmony_ci		cpu_relax();
77262306a36Sopenharmony_ci	}
77362306a36Sopenharmony_ci
77462306a36Sopenharmony_ci	if (READ_ONCE(page->mapping) != expected_mapping) {
77562306a36Sopenharmony_ci		put_page(page);
77662306a36Sopenharmony_ci		goto stale;
77762306a36Sopenharmony_ci	}
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci	if (flags == GET_KSM_PAGE_TRYLOCK) {
78062306a36Sopenharmony_ci		if (!trylock_page(page)) {
78162306a36Sopenharmony_ci			put_page(page);
78262306a36Sopenharmony_ci			return ERR_PTR(-EBUSY);
78362306a36Sopenharmony_ci		}
78462306a36Sopenharmony_ci	} else if (flags == GET_KSM_PAGE_LOCK)
78562306a36Sopenharmony_ci		lock_page(page);
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	if (flags != GET_KSM_PAGE_NOLOCK) {
78862306a36Sopenharmony_ci		if (READ_ONCE(page->mapping) != expected_mapping) {
78962306a36Sopenharmony_ci			unlock_page(page);
79062306a36Sopenharmony_ci			put_page(page);
79162306a36Sopenharmony_ci			goto stale;
79262306a36Sopenharmony_ci		}
79362306a36Sopenharmony_ci	}
79462306a36Sopenharmony_ci	return page;
79562306a36Sopenharmony_ci
79662306a36Sopenharmony_cistale:
79762306a36Sopenharmony_ci	/*
79862306a36Sopenharmony_ci	 * We come here from above when page->mapping or !PageSwapCache
79962306a36Sopenharmony_ci	 * suggests that the node is stale; but it might be under migration.
80062306a36Sopenharmony_ci	 * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
80162306a36Sopenharmony_ci	 * before checking whether node->kpfn has been changed.
80262306a36Sopenharmony_ci	 */
80362306a36Sopenharmony_ci	smp_rmb();
80462306a36Sopenharmony_ci	if (READ_ONCE(stable_node->kpfn) != kpfn)
80562306a36Sopenharmony_ci		goto again;
80662306a36Sopenharmony_ci	remove_node_from_stable_tree(stable_node);
80762306a36Sopenharmony_ci	return NULL;
80862306a36Sopenharmony_ci}
80962306a36Sopenharmony_ci
81062306a36Sopenharmony_ci/*
81162306a36Sopenharmony_ci * Removing rmap_item from stable or unstable tree.
81262306a36Sopenharmony_ci * This function will clean the information from the stable/unstable tree.
81362306a36Sopenharmony_ci */
81462306a36Sopenharmony_cistatic void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
81562306a36Sopenharmony_ci{
81662306a36Sopenharmony_ci	if (rmap_item->address & STABLE_FLAG) {
81762306a36Sopenharmony_ci		struct ksm_stable_node *stable_node;
81862306a36Sopenharmony_ci		struct page *page;
81962306a36Sopenharmony_ci
82062306a36Sopenharmony_ci		stable_node = rmap_item->head;
82162306a36Sopenharmony_ci		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
82262306a36Sopenharmony_ci		if (!page)
82362306a36Sopenharmony_ci			goto out;
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_ci		hlist_del(&rmap_item->hlist);
82662306a36Sopenharmony_ci		unlock_page(page);
82762306a36Sopenharmony_ci		put_page(page);
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci		if (!hlist_empty(&stable_node->hlist))
83062306a36Sopenharmony_ci			ksm_pages_sharing--;
83162306a36Sopenharmony_ci		else
83262306a36Sopenharmony_ci			ksm_pages_shared--;
83362306a36Sopenharmony_ci
83462306a36Sopenharmony_ci		rmap_item->mm->ksm_merging_pages--;
83562306a36Sopenharmony_ci
83662306a36Sopenharmony_ci		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
83762306a36Sopenharmony_ci		stable_node->rmap_hlist_len--;
83862306a36Sopenharmony_ci
83962306a36Sopenharmony_ci		put_anon_vma(rmap_item->anon_vma);
84062306a36Sopenharmony_ci		rmap_item->head = NULL;
84162306a36Sopenharmony_ci		rmap_item->address &= PAGE_MASK;
84262306a36Sopenharmony_ci
84362306a36Sopenharmony_ci	} else if (rmap_item->address & UNSTABLE_FLAG) {
84462306a36Sopenharmony_ci		unsigned char age;
84562306a36Sopenharmony_ci		/*
84662306a36Sopenharmony_ci		 * Usually ksmd can and must skip the rb_erase, because
84762306a36Sopenharmony_ci		 * root_unstable_tree was already reset to RB_ROOT.
84862306a36Sopenharmony_ci		 * But be careful when an mm is exiting: do the rb_erase
84962306a36Sopenharmony_ci		 * if this rmap_item was inserted by this scan, rather
85062306a36Sopenharmony_ci		 * than left over from before.
85162306a36Sopenharmony_ci		 */
85262306a36Sopenharmony_ci		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
85362306a36Sopenharmony_ci		BUG_ON(age > 1);
85462306a36Sopenharmony_ci		if (!age)
85562306a36Sopenharmony_ci			rb_erase(&rmap_item->node,
85662306a36Sopenharmony_ci				 root_unstable_tree + NUMA(rmap_item->nid));
85762306a36Sopenharmony_ci		ksm_pages_unshared--;
85862306a36Sopenharmony_ci		rmap_item->address &= PAGE_MASK;
85962306a36Sopenharmony_ci	}
86062306a36Sopenharmony_ciout:
86162306a36Sopenharmony_ci	cond_resched();		/* we're called from many long loops */
86262306a36Sopenharmony_ci}
86362306a36Sopenharmony_ci
86462306a36Sopenharmony_cistatic void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
86562306a36Sopenharmony_ci{
86662306a36Sopenharmony_ci	while (*rmap_list) {
86762306a36Sopenharmony_ci		struct ksm_rmap_item *rmap_item = *rmap_list;
86862306a36Sopenharmony_ci		*rmap_list = rmap_item->rmap_list;
86962306a36Sopenharmony_ci		remove_rmap_item_from_tree(rmap_item);
87062306a36Sopenharmony_ci		free_rmap_item(rmap_item);
87162306a36Sopenharmony_ci	}
87262306a36Sopenharmony_ci}
87362306a36Sopenharmony_ci
87462306a36Sopenharmony_ci/*
87562306a36Sopenharmony_ci * Though it's very tempting to unmerge rmap_items from stable tree rather
87662306a36Sopenharmony_ci * than check every pte of a given vma, the locking doesn't quite work for
87762306a36Sopenharmony_ci * that - an rmap_item is assigned to the stable tree after inserting ksm
87862306a36Sopenharmony_ci * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
87962306a36Sopenharmony_ci * rmap_items from parent to child at fork time (so as not to waste time
88062306a36Sopenharmony_ci * if exit comes before the next scan reaches it).
88162306a36Sopenharmony_ci *
88262306a36Sopenharmony_ci * Similarly, although we'd like to remove rmap_items (so updating counts
88362306a36Sopenharmony_ci * and freeing memory) when unmerging an area, it's easier to leave that
88462306a36Sopenharmony_ci * to the next pass of ksmd - consider, for example, how ksmd might be
88562306a36Sopenharmony_ci * in cmp_and_merge_page on one of the rmap_items we would be removing.
88662306a36Sopenharmony_ci */
88762306a36Sopenharmony_cistatic int unmerge_ksm_pages(struct vm_area_struct *vma,
88862306a36Sopenharmony_ci			     unsigned long start, unsigned long end, bool lock_vma)
88962306a36Sopenharmony_ci{
89062306a36Sopenharmony_ci	unsigned long addr;
89162306a36Sopenharmony_ci	int err = 0;
89262306a36Sopenharmony_ci
89362306a36Sopenharmony_ci	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
89462306a36Sopenharmony_ci		if (ksm_test_exit(vma->vm_mm))
89562306a36Sopenharmony_ci			break;
89662306a36Sopenharmony_ci		if (signal_pending(current))
89762306a36Sopenharmony_ci			err = -ERESTARTSYS;
89862306a36Sopenharmony_ci		else
89962306a36Sopenharmony_ci			err = break_ksm(vma, addr, lock_vma);
90062306a36Sopenharmony_ci	}
90162306a36Sopenharmony_ci	return err;
90262306a36Sopenharmony_ci}
90362306a36Sopenharmony_ci
90462306a36Sopenharmony_cistatic inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
90562306a36Sopenharmony_ci{
90662306a36Sopenharmony_ci	return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
90762306a36Sopenharmony_ci}
90862306a36Sopenharmony_ci
90962306a36Sopenharmony_cistatic inline struct ksm_stable_node *page_stable_node(struct page *page)
91062306a36Sopenharmony_ci{
91162306a36Sopenharmony_ci	return folio_stable_node(page_folio(page));
91262306a36Sopenharmony_ci}
91362306a36Sopenharmony_ci
91462306a36Sopenharmony_cistatic inline void set_page_stable_node(struct page *page,
91562306a36Sopenharmony_ci					struct ksm_stable_node *stable_node)
91662306a36Sopenharmony_ci{
91762306a36Sopenharmony_ci	VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
91862306a36Sopenharmony_ci	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
91962306a36Sopenharmony_ci}
92062306a36Sopenharmony_ci
92162306a36Sopenharmony_ci#ifdef CONFIG_SYSFS
92262306a36Sopenharmony_ci/*
92362306a36Sopenharmony_ci * Only called through the sysfs control interface:
92462306a36Sopenharmony_ci */
92562306a36Sopenharmony_cistatic int remove_stable_node(struct ksm_stable_node *stable_node)
92662306a36Sopenharmony_ci{
92762306a36Sopenharmony_ci	struct page *page;
92862306a36Sopenharmony_ci	int err;
92962306a36Sopenharmony_ci
93062306a36Sopenharmony_ci	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
93162306a36Sopenharmony_ci	if (!page) {
93262306a36Sopenharmony_ci		/*
93362306a36Sopenharmony_ci		 * get_ksm_page did remove_node_from_stable_tree itself.
93462306a36Sopenharmony_ci		 */
93562306a36Sopenharmony_ci		return 0;
93662306a36Sopenharmony_ci	}
93762306a36Sopenharmony_ci
93862306a36Sopenharmony_ci	/*
93962306a36Sopenharmony_ci	 * Page could be still mapped if this races with __mmput() running in
94062306a36Sopenharmony_ci	 * between ksm_exit() and exit_mmap(). Just refuse to let
94162306a36Sopenharmony_ci	 * merge_across_nodes/max_page_sharing be switched.
94262306a36Sopenharmony_ci	 */
94362306a36Sopenharmony_ci	err = -EBUSY;
94462306a36Sopenharmony_ci	if (!page_mapped(page)) {
94562306a36Sopenharmony_ci		/*
94662306a36Sopenharmony_ci		 * The stable node did not yet appear stale to get_ksm_page(),
94762306a36Sopenharmony_ci		 * since that allows for an unmapped ksm page to be recognized
94862306a36Sopenharmony_ci		 * right up until it is freed; but the node is safe to remove.
94962306a36Sopenharmony_ci		 * This page might be in an LRU cache waiting to be freed,
95062306a36Sopenharmony_ci		 * or it might be PageSwapCache (perhaps under writeback),
95162306a36Sopenharmony_ci		 * or it might have been removed from swapcache a moment ago.
95262306a36Sopenharmony_ci		 */
95362306a36Sopenharmony_ci		set_page_stable_node(page, NULL);
95462306a36Sopenharmony_ci		remove_node_from_stable_tree(stable_node);
95562306a36Sopenharmony_ci		err = 0;
95662306a36Sopenharmony_ci	}
95762306a36Sopenharmony_ci
95862306a36Sopenharmony_ci	unlock_page(page);
95962306a36Sopenharmony_ci	put_page(page);
96062306a36Sopenharmony_ci	return err;
96162306a36Sopenharmony_ci}
96262306a36Sopenharmony_ci
96362306a36Sopenharmony_cistatic int remove_stable_node_chain(struct ksm_stable_node *stable_node,
96462306a36Sopenharmony_ci				    struct rb_root *root)
96562306a36Sopenharmony_ci{
96662306a36Sopenharmony_ci	struct ksm_stable_node *dup;
96762306a36Sopenharmony_ci	struct hlist_node *hlist_safe;
96862306a36Sopenharmony_ci
96962306a36Sopenharmony_ci	if (!is_stable_node_chain(stable_node)) {
97062306a36Sopenharmony_ci		VM_BUG_ON(is_stable_node_dup(stable_node));
97162306a36Sopenharmony_ci		if (remove_stable_node(stable_node))
97262306a36Sopenharmony_ci			return true;
97362306a36Sopenharmony_ci		else
97462306a36Sopenharmony_ci			return false;
97562306a36Sopenharmony_ci	}
97662306a36Sopenharmony_ci
97762306a36Sopenharmony_ci	hlist_for_each_entry_safe(dup, hlist_safe,
97862306a36Sopenharmony_ci				  &stable_node->hlist, hlist_dup) {
97962306a36Sopenharmony_ci		VM_BUG_ON(!is_stable_node_dup(dup));
98062306a36Sopenharmony_ci		if (remove_stable_node(dup))
98162306a36Sopenharmony_ci			return true;
98262306a36Sopenharmony_ci	}
98362306a36Sopenharmony_ci	BUG_ON(!hlist_empty(&stable_node->hlist));
98462306a36Sopenharmony_ci	free_stable_node_chain(stable_node, root);
98562306a36Sopenharmony_ci	return false;
98662306a36Sopenharmony_ci}
98762306a36Sopenharmony_ci
98862306a36Sopenharmony_cistatic int remove_all_stable_nodes(void)
98962306a36Sopenharmony_ci{
99062306a36Sopenharmony_ci	struct ksm_stable_node *stable_node, *next;
99162306a36Sopenharmony_ci	int nid;
99262306a36Sopenharmony_ci	int err = 0;
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
99562306a36Sopenharmony_ci		while (root_stable_tree[nid].rb_node) {
99662306a36Sopenharmony_ci			stable_node = rb_entry(root_stable_tree[nid].rb_node,
99762306a36Sopenharmony_ci						struct ksm_stable_node, node);
99862306a36Sopenharmony_ci			if (remove_stable_node_chain(stable_node,
99962306a36Sopenharmony_ci						     root_stable_tree + nid)) {
100062306a36Sopenharmony_ci				err = -EBUSY;
100162306a36Sopenharmony_ci				break;	/* proceed to next nid */
100262306a36Sopenharmony_ci			}
100362306a36Sopenharmony_ci			cond_resched();
100462306a36Sopenharmony_ci		}
100562306a36Sopenharmony_ci	}
100662306a36Sopenharmony_ci	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
100762306a36Sopenharmony_ci		if (remove_stable_node(stable_node))
100862306a36Sopenharmony_ci			err = -EBUSY;
100962306a36Sopenharmony_ci		cond_resched();
101062306a36Sopenharmony_ci	}
101162306a36Sopenharmony_ci	return err;
101262306a36Sopenharmony_ci}
101362306a36Sopenharmony_ci
101462306a36Sopenharmony_cistatic int unmerge_and_remove_all_rmap_items(void)
101562306a36Sopenharmony_ci{
101662306a36Sopenharmony_ci	struct ksm_mm_slot *mm_slot;
101762306a36Sopenharmony_ci	struct mm_slot *slot;
101862306a36Sopenharmony_ci	struct mm_struct *mm;
101962306a36Sopenharmony_ci	struct vm_area_struct *vma;
102062306a36Sopenharmony_ci	int err = 0;
102162306a36Sopenharmony_ci
102262306a36Sopenharmony_ci	spin_lock(&ksm_mmlist_lock);
102362306a36Sopenharmony_ci	slot = list_entry(ksm_mm_head.slot.mm_node.next,
102462306a36Sopenharmony_ci			  struct mm_slot, mm_node);
102562306a36Sopenharmony_ci	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
102662306a36Sopenharmony_ci	spin_unlock(&ksm_mmlist_lock);
102762306a36Sopenharmony_ci
102862306a36Sopenharmony_ci	for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
102962306a36Sopenharmony_ci	     mm_slot = ksm_scan.mm_slot) {
103062306a36Sopenharmony_ci		VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);
103162306a36Sopenharmony_ci
103262306a36Sopenharmony_ci		mm = mm_slot->slot.mm;
103362306a36Sopenharmony_ci		mmap_read_lock(mm);
103462306a36Sopenharmony_ci
103562306a36Sopenharmony_ci		/*
103662306a36Sopenharmony_ci		 * Exit right away if mm is exiting to avoid lockdep issue in
103762306a36Sopenharmony_ci		 * the maple tree
103862306a36Sopenharmony_ci		 */
103962306a36Sopenharmony_ci		if (ksm_test_exit(mm))
104062306a36Sopenharmony_ci			goto mm_exiting;
104162306a36Sopenharmony_ci
104262306a36Sopenharmony_ci		for_each_vma(vmi, vma) {
104362306a36Sopenharmony_ci			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
104462306a36Sopenharmony_ci				continue;
104562306a36Sopenharmony_ci			err = unmerge_ksm_pages(vma,
104662306a36Sopenharmony_ci						vma->vm_start, vma->vm_end, false);
104762306a36Sopenharmony_ci			if (err)
104862306a36Sopenharmony_ci				goto error;
104962306a36Sopenharmony_ci		}
105062306a36Sopenharmony_ci
105162306a36Sopenharmony_cimm_exiting:
105262306a36Sopenharmony_ci		remove_trailing_rmap_items(&mm_slot->rmap_list);
105362306a36Sopenharmony_ci		mmap_read_unlock(mm);
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_ci		spin_lock(&ksm_mmlist_lock);
105662306a36Sopenharmony_ci		slot = list_entry(mm_slot->slot.mm_node.next,
105762306a36Sopenharmony_ci				  struct mm_slot, mm_node);
105862306a36Sopenharmony_ci		ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
105962306a36Sopenharmony_ci		if (ksm_test_exit(mm)) {
106062306a36Sopenharmony_ci			hash_del(&mm_slot->slot.hash);
106162306a36Sopenharmony_ci			list_del(&mm_slot->slot.mm_node);
106262306a36Sopenharmony_ci			spin_unlock(&ksm_mmlist_lock);
106362306a36Sopenharmony_ci
106462306a36Sopenharmony_ci			mm_slot_free(mm_slot_cache, mm_slot);
106562306a36Sopenharmony_ci			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
106662306a36Sopenharmony_ci			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
106762306a36Sopenharmony_ci			mmdrop(mm);
106862306a36Sopenharmony_ci		} else
106962306a36Sopenharmony_ci			spin_unlock(&ksm_mmlist_lock);
107062306a36Sopenharmony_ci	}
107162306a36Sopenharmony_ci
107262306a36Sopenharmony_ci	/* Clean up stable nodes, but don't worry if some are still busy */
107362306a36Sopenharmony_ci	remove_all_stable_nodes();
107462306a36Sopenharmony_ci	ksm_scan.seqnr = 0;
107562306a36Sopenharmony_ci	return 0;
107662306a36Sopenharmony_ci
107762306a36Sopenharmony_cierror:
107862306a36Sopenharmony_ci	mmap_read_unlock(mm);
107962306a36Sopenharmony_ci	spin_lock(&ksm_mmlist_lock);
108062306a36Sopenharmony_ci	ksm_scan.mm_slot = &ksm_mm_head;
108162306a36Sopenharmony_ci	spin_unlock(&ksm_mmlist_lock);
108262306a36Sopenharmony_ci	return err;
108362306a36Sopenharmony_ci}
108462306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */
108562306a36Sopenharmony_ci
108662306a36Sopenharmony_cistatic u32 calc_checksum(struct page *page)
108762306a36Sopenharmony_ci{
108862306a36Sopenharmony_ci	u32 checksum;
108962306a36Sopenharmony_ci	void *addr = kmap_atomic(page);
109062306a36Sopenharmony_ci	checksum = xxhash(addr, PAGE_SIZE, 0);
109162306a36Sopenharmony_ci	kunmap_atomic(addr);
109262306a36Sopenharmony_ci	return checksum;
109362306a36Sopenharmony_ci}
109462306a36Sopenharmony_ci
/*
 * write_protect_page - clean and write-protect the pte mapping @page.
 * @vma:      vma that holds the pte mapping @page
 * @page:     the anonymous page to write-protect (must not be compound)
 * @orig_pte: output: snapshot of the pte taken while the pte lock was held
 *
 * Locates the pte mapping @page in @vma and, if the pte is writable or
 * dirty, the page is anon-exclusive, or a TLB flush is pending, clears
 * the pte with a flush and re-installs it clean and write-protected.
 * Bails out if extra page references indicate racing I/O (e.g. O_DIRECT).
 *
 * Returns 0 on success (with *orig_pte filled in), -EFAULT on failure.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;
	bool anon_exclusive;
	pte_t entry;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	/* KSM only operates on small pages; callers split THPs first. */
	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	anon_exclusive = PageAnonExclusive(page);
	entry = ptep_get(pvmw.pte);
	if (pte_write(entry) || pte_dirty(entry) ||
	    anon_exclusive || mm_tlb_flush_pending(mm)) {
		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/mm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page: expected refs are one per pte mapping, one held by
		 * the caller, plus one if the page sits in the swap cache.
		 * Any surplus means someone else holds a reference; restore
		 * the pte and give up.
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}

		/* See page_try_share_anon_rmap(): clear PTE first. */
		if (anon_exclusive && page_try_share_anon_rmap(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}

		/* Preserve dirtiness: move the pte dirty bit to the page. */
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(entry);

		if (pte_write(entry))
			entry = pte_wrprotect(entry);

		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	/* entry is the (now clean, read-only) pte value under the pte lock. */
	*orig_pte = entry;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}
117662306a36Sopenharmony_ci
/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * @kpage may also be the shared zeropage (see the is_zero_pfn() branch).
 * Called with @page locked (see try_to_merge_one_page()); the pte is only
 * replaced if it still matches @orig_pte, otherwise we raced and bail out.
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	pmd_t *pmd;
	pmd_t pmde;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
	 * without holding anon_vma lock for write.  So when looking for a
	 * genuine pmde (in which to find pte), test present and !THP together.
	 */
	pmde = pmdp_get_lockless(pmd);
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!ptep)
		goto out_mn;
	/* The pte changed since write_protect_page() snapshotted it: bail. */
	if (!pte_same(ptep_get(ptep), orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}
	VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
	VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		/* Take a reference and an rmap entry for the new mapping. */
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		/*
		 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
		 * we can easily track all KSM-placed zero pages by checking if
		 * the dirty bit in zero page's PTE is set.
		 */
		newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
		ksm_zero_pages++;
		mm->ksm_zero_pages++;
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	/* Drop the old page's mapping and our reference to it. */
	folio = page_folio(page);
	page_remove_rmap(page, vma, false);
	if (!folio_mapped(folio))
		folio_free_swap(folio);
	folio_put(folio);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}
127962306a36Sopenharmony_ci
/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	/* Only anonymous pages are candidates for merging. */
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	/* KSM only merges small pages: split a THP before proceeding. */
	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			/* Contents still match: swap the mapping over. */
			err = replace_page(vma, page, kpage, orig_pte);
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}
134762306a36Sopenharmony_ci
/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * @kpage may be NULL, in which case @page itself is upgraded to a ksm
 * page (see try_to_merge_one_page()).
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	mmap_read_lock(mm);
	/* The vma may have gone away or become unmergeable since scanning. */
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_lock */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	mmap_read_unlock(mm);
	trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
				rmap_item, mm, err);
	return err;
}
138262306a36Sopenharmony_ci
138362306a36Sopenharmony_ci/*
138462306a36Sopenharmony_ci * try_to_merge_two_pages - take two identical pages and prepare them
138562306a36Sopenharmony_ci * to be merged into one page.
138662306a36Sopenharmony_ci *
138762306a36Sopenharmony_ci * This function returns the kpage if we successfully merged two identical
138862306a36Sopenharmony_ci * pages into one ksm page, NULL otherwise.
138962306a36Sopenharmony_ci *
139062306a36Sopenharmony_ci * Note that this function upgrades page to ksm page: if one of the pages
139162306a36Sopenharmony_ci * is already a ksm page, try_to_merge_with_ksm_page should be used.
139262306a36Sopenharmony_ci */
139362306a36Sopenharmony_cistatic struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
139462306a36Sopenharmony_ci					   struct page *page,
139562306a36Sopenharmony_ci					   struct ksm_rmap_item *tree_rmap_item,
139662306a36Sopenharmony_ci					   struct page *tree_page)
139762306a36Sopenharmony_ci{
139862306a36Sopenharmony_ci	int err;
139962306a36Sopenharmony_ci
140062306a36Sopenharmony_ci	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
140162306a36Sopenharmony_ci	if (!err) {
140262306a36Sopenharmony_ci		err = try_to_merge_with_ksm_page(tree_rmap_item,
140362306a36Sopenharmony_ci							tree_page, page);
140462306a36Sopenharmony_ci		/*
140562306a36Sopenharmony_ci		 * If that fails, we have a ksm page with only one pte
140662306a36Sopenharmony_ci		 * pointing to it: so break it.
140762306a36Sopenharmony_ci		 */
140862306a36Sopenharmony_ci		if (err)
140962306a36Sopenharmony_ci			break_cow(rmap_item);
141062306a36Sopenharmony_ci	}
141162306a36Sopenharmony_ci	return err ? NULL : page;
141262306a36Sopenharmony_ci}
141362306a36Sopenharmony_ci
141462306a36Sopenharmony_cistatic __always_inline
141562306a36Sopenharmony_cibool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
141662306a36Sopenharmony_ci{
141762306a36Sopenharmony_ci	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
141862306a36Sopenharmony_ci	/*
141962306a36Sopenharmony_ci	 * Check that at least one mapping still exists, otherwise
142062306a36Sopenharmony_ci	 * there's no much point to merge and share with this
142162306a36Sopenharmony_ci	 * stable_node, as the underlying tree_page of the other
142262306a36Sopenharmony_ci	 * sharer is going to be freed soon.
142362306a36Sopenharmony_ci	 */
142462306a36Sopenharmony_ci	return stable_node->rmap_hlist_len &&
142562306a36Sopenharmony_ci		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
142662306a36Sopenharmony_ci}
142762306a36Sopenharmony_ci
/* Can this stable_node accept one more merge right now? */
static __always_inline
bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}
143362306a36Sopenharmony_ci
/*
 * stable_node_dup - pick the best dup from a stable_node chain.
 *
 * Walks the dups hanging off the chain head *_stable_node and returns
 * (with a reference held) the KSM page of the dup with the largest
 * rmap_hlist_len that is still a page sharing candidate; that dup is
 * stored in *_stable_node_dup, or NULL if none qualifies.
 *
 * When @prune_stale_stable_nodes is true and the per-chain prune
 * interval has expired, every dup is visited so stale ones are dropped
 * by get_ksm_page(), and a chain reduced to a single dup is collapsed
 * back into a regular stable_node in @root (overwriting *_stable_node).
 */
static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
				    struct ksm_stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	/* Only prune at most once per prune interval per chain. */
	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dup to prune the stale
		 * stable nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			/* Prefer the dup with the most mappings already. */
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for found dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness, as stable_node is
			 * otherwise left as a stable pointer, the
			 * compiler shall optimize it away at build
			 * time.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist.first pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the position of the found dup in the chain,
			 * but the total number of dups in the chain.
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}
155262306a36Sopenharmony_ci
155362306a36Sopenharmony_cistatic struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
155462306a36Sopenharmony_ci					       struct rb_root *root)
155562306a36Sopenharmony_ci{
155662306a36Sopenharmony_ci	if (!is_stable_node_chain(stable_node))
155762306a36Sopenharmony_ci		return stable_node;
155862306a36Sopenharmony_ci	if (hlist_empty(&stable_node->hlist)) {
155962306a36Sopenharmony_ci		free_stable_node_chain(stable_node, root);
156062306a36Sopenharmony_ci		return NULL;
156162306a36Sopenharmony_ci	}
156262306a36Sopenharmony_ci	return hlist_entry(stable_node->hlist.first,
156362306a36Sopenharmony_ci			   typeof(*stable_node), hlist_dup);
156462306a36Sopenharmony_ci}
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci/*
156762306a36Sopenharmony_ci * Like for get_ksm_page, this function can free the *_stable_node and
156862306a36Sopenharmony_ci * *_stable_node_dup if the returned tree_page is NULL.
156962306a36Sopenharmony_ci *
157062306a36Sopenharmony_ci * It can also free and overwrite *_stable_node with the found
157162306a36Sopenharmony_ci * stable_node_dup if the chain is collapsed (in which case
157262306a36Sopenharmony_ci * *_stable_node will be equal to *_stable_node_dup like if the chain
157362306a36Sopenharmony_ci * never existed). It's up to the caller to verify tree_page is not
157462306a36Sopenharmony_ci * NULL before dereferencing *_stable_node or *_stable_node_dup.
157562306a36Sopenharmony_ci *
157662306a36Sopenharmony_ci * *_stable_node_dup is really a second output parameter of this
157762306a36Sopenharmony_ci * function and will be overwritten in all cases, the caller doesn't
157862306a36Sopenharmony_ci * need to initialize it.
157962306a36Sopenharmony_ci */
158062306a36Sopenharmony_cistatic struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
158162306a36Sopenharmony_ci					struct ksm_stable_node **_stable_node,
158262306a36Sopenharmony_ci					struct rb_root *root,
158362306a36Sopenharmony_ci					bool prune_stale_stable_nodes)
158462306a36Sopenharmony_ci{
158562306a36Sopenharmony_ci	struct ksm_stable_node *stable_node = *_stable_node;
158662306a36Sopenharmony_ci	if (!is_stable_node_chain(stable_node)) {
158762306a36Sopenharmony_ci		if (is_page_sharing_candidate(stable_node)) {
158862306a36Sopenharmony_ci			*_stable_node_dup = stable_node;
158962306a36Sopenharmony_ci			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
159062306a36Sopenharmony_ci		}
159162306a36Sopenharmony_ci		/*
159262306a36Sopenharmony_ci		 * _stable_node_dup set to NULL means the stable_node
159362306a36Sopenharmony_ci		 * reached the ksm_max_page_sharing limit.
159462306a36Sopenharmony_ci		 */
159562306a36Sopenharmony_ci		*_stable_node_dup = NULL;
159662306a36Sopenharmony_ci		return NULL;
159762306a36Sopenharmony_ci	}
159862306a36Sopenharmony_ci	return stable_node_dup(_stable_node_dup, _stable_node, root,
159962306a36Sopenharmony_ci			       prune_stale_stable_nodes);
160062306a36Sopenharmony_ci}
160162306a36Sopenharmony_ci
/*
 * chain_prune - look up the best dup while also pruning stale dups and
 * collapsing single-entry chains (see __stable_node_chain()).
 */
static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
						struct ksm_stable_node **s_n,
						struct rb_root *root)
{
	return __stable_node_chain(s_n_d, s_n, root, true);
}
160862306a36Sopenharmony_ci
160962306a36Sopenharmony_cistatic __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
161062306a36Sopenharmony_ci					  struct ksm_stable_node *s_n,
161162306a36Sopenharmony_ci					  struct rb_root *root)
161262306a36Sopenharmony_ci{
161362306a36Sopenharmony_ci	struct ksm_stable_node *old_stable_node = s_n;
161462306a36Sopenharmony_ci	struct page *tree_page;
161562306a36Sopenharmony_ci
161662306a36Sopenharmony_ci	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
161762306a36Sopenharmony_ci	/* not pruning dups so s_n cannot have changed */
161862306a36Sopenharmony_ci	VM_BUG_ON(s_n != old_stable_node);
161962306a36Sopenharmony_ci	return tree_page;
162062306a36Sopenharmony_ci}
162162306a36Sopenharmony_ci
162262306a36Sopenharmony_ci/*
162362306a36Sopenharmony_ci * stable_tree_search - search for page inside the stable tree
162462306a36Sopenharmony_ci *
162562306a36Sopenharmony_ci * This function checks if there is a page inside the stable tree
162662306a36Sopenharmony_ci * with identical content to the page that we are scanning right now.
162762306a36Sopenharmony_ci *
162862306a36Sopenharmony_ci * This function returns the stable tree node of identical content if found,
162962306a36Sopenharmony_ci * NULL otherwise.
163062306a36Sopenharmony_ci */
163162306a36Sopenharmony_cistatic struct page *stable_tree_search(struct page *page)
163262306a36Sopenharmony_ci{
163362306a36Sopenharmony_ci	int nid;
163462306a36Sopenharmony_ci	struct rb_root *root;
163562306a36Sopenharmony_ci	struct rb_node **new;
163662306a36Sopenharmony_ci	struct rb_node *parent;
163762306a36Sopenharmony_ci	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
163862306a36Sopenharmony_ci	struct ksm_stable_node *page_node;
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci	page_node = page_stable_node(page);
164162306a36Sopenharmony_ci	if (page_node && page_node->head != &migrate_nodes) {
164262306a36Sopenharmony_ci		/* ksm page forked */
164362306a36Sopenharmony_ci		get_page(page);
164462306a36Sopenharmony_ci		return page;
164562306a36Sopenharmony_ci	}
164662306a36Sopenharmony_ci
164762306a36Sopenharmony_ci	nid = get_kpfn_nid(page_to_pfn(page));
164862306a36Sopenharmony_ci	root = root_stable_tree + nid;
164962306a36Sopenharmony_ciagain:
165062306a36Sopenharmony_ci	new = &root->rb_node;
165162306a36Sopenharmony_ci	parent = NULL;
165262306a36Sopenharmony_ci
165362306a36Sopenharmony_ci	while (*new) {
165462306a36Sopenharmony_ci		struct page *tree_page;
165562306a36Sopenharmony_ci		int ret;
165662306a36Sopenharmony_ci
165762306a36Sopenharmony_ci		cond_resched();
165862306a36Sopenharmony_ci		stable_node = rb_entry(*new, struct ksm_stable_node, node);
165962306a36Sopenharmony_ci		stable_node_any = NULL;
166062306a36Sopenharmony_ci		tree_page = chain_prune(&stable_node_dup, &stable_node,	root);
166162306a36Sopenharmony_ci		/*
166262306a36Sopenharmony_ci		 * NOTE: stable_node may have been freed by
166362306a36Sopenharmony_ci		 * chain_prune() if the returned stable_node_dup is
166462306a36Sopenharmony_ci		 * not NULL. stable_node_dup may have been inserted in
166562306a36Sopenharmony_ci		 * the rbtree instead as a regular stable_node (in
166662306a36Sopenharmony_ci		 * order to collapse the stable_node chain if a single
166762306a36Sopenharmony_ci		 * stable_node dup was found in it). In such case the
166862306a36Sopenharmony_ci		 * stable_node is overwritten by the callee to point
166962306a36Sopenharmony_ci		 * to the stable_node_dup that was collapsed in the
167062306a36Sopenharmony_ci		 * stable rbtree and stable_node will be equal to
167162306a36Sopenharmony_ci		 * stable_node_dup like if the chain never existed.
167262306a36Sopenharmony_ci		 */
167362306a36Sopenharmony_ci		if (!stable_node_dup) {
167462306a36Sopenharmony_ci			/*
167562306a36Sopenharmony_ci			 * Either all stable_node dups were full in
167662306a36Sopenharmony_ci			 * this stable_node chain, or this chain was
167762306a36Sopenharmony_ci			 * empty and should be rb_erased.
167862306a36Sopenharmony_ci			 */
167962306a36Sopenharmony_ci			stable_node_any = stable_node_dup_any(stable_node,
168062306a36Sopenharmony_ci							      root);
168162306a36Sopenharmony_ci			if (!stable_node_any) {
168262306a36Sopenharmony_ci				/* rb_erase just run */
168362306a36Sopenharmony_ci				goto again;
168462306a36Sopenharmony_ci			}
168562306a36Sopenharmony_ci			/*
168662306a36Sopenharmony_ci			 * Take any of the stable_node dups page of
168762306a36Sopenharmony_ci			 * this stable_node chain to let the tree walk
168862306a36Sopenharmony_ci			 * continue. All KSM pages belonging to the
168962306a36Sopenharmony_ci			 * stable_node dups in a stable_node chain
169062306a36Sopenharmony_ci			 * have the same content and they're
169162306a36Sopenharmony_ci			 * write protected at all times. Any will work
169262306a36Sopenharmony_ci			 * fine to continue the walk.
169362306a36Sopenharmony_ci			 */
169462306a36Sopenharmony_ci			tree_page = get_ksm_page(stable_node_any,
169562306a36Sopenharmony_ci						 GET_KSM_PAGE_NOLOCK);
169662306a36Sopenharmony_ci		}
169762306a36Sopenharmony_ci		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
169862306a36Sopenharmony_ci		if (!tree_page) {
169962306a36Sopenharmony_ci			/*
170062306a36Sopenharmony_ci			 * If we walked over a stale stable_node,
170162306a36Sopenharmony_ci			 * get_ksm_page() will call rb_erase() and it
170262306a36Sopenharmony_ci			 * may rebalance the tree from under us. So
170362306a36Sopenharmony_ci			 * restart the search from scratch. Returning
170462306a36Sopenharmony_ci			 * NULL would be safe too, but we'd generate
170562306a36Sopenharmony_ci			 * false negative insertions just because some
170662306a36Sopenharmony_ci			 * stable_node was stale.
170762306a36Sopenharmony_ci			 */
170862306a36Sopenharmony_ci			goto again;
170962306a36Sopenharmony_ci		}
171062306a36Sopenharmony_ci
171162306a36Sopenharmony_ci		ret = memcmp_pages(page, tree_page);
171262306a36Sopenharmony_ci		put_page(tree_page);
171362306a36Sopenharmony_ci
171462306a36Sopenharmony_ci		parent = *new;
171562306a36Sopenharmony_ci		if (ret < 0)
171662306a36Sopenharmony_ci			new = &parent->rb_left;
171762306a36Sopenharmony_ci		else if (ret > 0)
171862306a36Sopenharmony_ci			new = &parent->rb_right;
171962306a36Sopenharmony_ci		else {
172062306a36Sopenharmony_ci			if (page_node) {
172162306a36Sopenharmony_ci				VM_BUG_ON(page_node->head != &migrate_nodes);
172262306a36Sopenharmony_ci				/*
172362306a36Sopenharmony_ci				 * Test if the migrated page should be merged
172462306a36Sopenharmony_ci				 * into a stable node dup. If the mapcount is
172562306a36Sopenharmony_ci				 * 1 we can migrate it with another KSM page
172662306a36Sopenharmony_ci				 * without adding it to the chain.
172762306a36Sopenharmony_ci				 */
172862306a36Sopenharmony_ci				if (page_mapcount(page) > 1)
172962306a36Sopenharmony_ci					goto chain_append;
173062306a36Sopenharmony_ci			}
173162306a36Sopenharmony_ci
173262306a36Sopenharmony_ci			if (!stable_node_dup) {
173362306a36Sopenharmony_ci				/*
173462306a36Sopenharmony_ci				 * If the stable_node is a chain and
173562306a36Sopenharmony_ci				 * we got a payload match in memcmp
173662306a36Sopenharmony_ci				 * but we cannot merge the scanned
173762306a36Sopenharmony_ci				 * page in any of the existing
173862306a36Sopenharmony_ci				 * stable_node dups because they're
173962306a36Sopenharmony_ci				 * all full, we need to wait the
174062306a36Sopenharmony_ci				 * scanned page to find itself a match
174162306a36Sopenharmony_ci				 * in the unstable tree to create a
174262306a36Sopenharmony_ci				 * brand new KSM page to add later to
174362306a36Sopenharmony_ci				 * the dups of this stable_node.
174462306a36Sopenharmony_ci				 */
174562306a36Sopenharmony_ci				return NULL;
174662306a36Sopenharmony_ci			}
174762306a36Sopenharmony_ci
174862306a36Sopenharmony_ci			/*
174962306a36Sopenharmony_ci			 * Lock and unlock the stable_node's page (which
175062306a36Sopenharmony_ci			 * might already have been migrated) so that page
175162306a36Sopenharmony_ci			 * migration is sure to notice its raised count.
175262306a36Sopenharmony_ci			 * It would be more elegant to return stable_node
175362306a36Sopenharmony_ci			 * than kpage, but that involves more changes.
175462306a36Sopenharmony_ci			 */
175562306a36Sopenharmony_ci			tree_page = get_ksm_page(stable_node_dup,
175662306a36Sopenharmony_ci						 GET_KSM_PAGE_TRYLOCK);
175762306a36Sopenharmony_ci
175862306a36Sopenharmony_ci			if (PTR_ERR(tree_page) == -EBUSY)
175962306a36Sopenharmony_ci				return ERR_PTR(-EBUSY);
176062306a36Sopenharmony_ci
176162306a36Sopenharmony_ci			if (unlikely(!tree_page))
176262306a36Sopenharmony_ci				/*
176362306a36Sopenharmony_ci				 * The tree may have been rebalanced,
176462306a36Sopenharmony_ci				 * so re-evaluate parent and new.
176562306a36Sopenharmony_ci				 */
176662306a36Sopenharmony_ci				goto again;
176762306a36Sopenharmony_ci			unlock_page(tree_page);
176862306a36Sopenharmony_ci
176962306a36Sopenharmony_ci			if (get_kpfn_nid(stable_node_dup->kpfn) !=
177062306a36Sopenharmony_ci			    NUMA(stable_node_dup->nid)) {
177162306a36Sopenharmony_ci				put_page(tree_page);
177262306a36Sopenharmony_ci				goto replace;
177362306a36Sopenharmony_ci			}
177462306a36Sopenharmony_ci			return tree_page;
177562306a36Sopenharmony_ci		}
177662306a36Sopenharmony_ci	}
177762306a36Sopenharmony_ci
177862306a36Sopenharmony_ci	if (!page_node)
177962306a36Sopenharmony_ci		return NULL;
178062306a36Sopenharmony_ci
178162306a36Sopenharmony_ci	list_del(&page_node->list);
178262306a36Sopenharmony_ci	DO_NUMA(page_node->nid = nid);
178362306a36Sopenharmony_ci	rb_link_node(&page_node->node, parent, new);
178462306a36Sopenharmony_ci	rb_insert_color(&page_node->node, root);
178562306a36Sopenharmony_ciout:
178662306a36Sopenharmony_ci	if (is_page_sharing_candidate(page_node)) {
178762306a36Sopenharmony_ci		get_page(page);
178862306a36Sopenharmony_ci		return page;
178962306a36Sopenharmony_ci	} else
179062306a36Sopenharmony_ci		return NULL;
179162306a36Sopenharmony_ci
179262306a36Sopenharmony_cireplace:
179362306a36Sopenharmony_ci	/*
179462306a36Sopenharmony_ci	 * If stable_node was a chain and chain_prune collapsed it,
179562306a36Sopenharmony_ci	 * stable_node has been updated to be the new regular
179662306a36Sopenharmony_ci	 * stable_node. A collapse of the chain is indistinguishable
179762306a36Sopenharmony_ci	 * from the case there was no chain in the stable
179862306a36Sopenharmony_ci	 * rbtree. Otherwise stable_node is the chain and
179962306a36Sopenharmony_ci	 * stable_node_dup is the dup to replace.
180062306a36Sopenharmony_ci	 */
180162306a36Sopenharmony_ci	if (stable_node_dup == stable_node) {
180262306a36Sopenharmony_ci		VM_BUG_ON(is_stable_node_chain(stable_node_dup));
180362306a36Sopenharmony_ci		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
180462306a36Sopenharmony_ci		/* there is no chain */
180562306a36Sopenharmony_ci		if (page_node) {
180662306a36Sopenharmony_ci			VM_BUG_ON(page_node->head != &migrate_nodes);
180762306a36Sopenharmony_ci			list_del(&page_node->list);
180862306a36Sopenharmony_ci			DO_NUMA(page_node->nid = nid);
180962306a36Sopenharmony_ci			rb_replace_node(&stable_node_dup->node,
181062306a36Sopenharmony_ci					&page_node->node,
181162306a36Sopenharmony_ci					root);
181262306a36Sopenharmony_ci			if (is_page_sharing_candidate(page_node))
181362306a36Sopenharmony_ci				get_page(page);
181462306a36Sopenharmony_ci			else
181562306a36Sopenharmony_ci				page = NULL;
181662306a36Sopenharmony_ci		} else {
181762306a36Sopenharmony_ci			rb_erase(&stable_node_dup->node, root);
181862306a36Sopenharmony_ci			page = NULL;
181962306a36Sopenharmony_ci		}
182062306a36Sopenharmony_ci	} else {
182162306a36Sopenharmony_ci		VM_BUG_ON(!is_stable_node_chain(stable_node));
182262306a36Sopenharmony_ci		__stable_node_dup_del(stable_node_dup);
182362306a36Sopenharmony_ci		if (page_node) {
182462306a36Sopenharmony_ci			VM_BUG_ON(page_node->head != &migrate_nodes);
182562306a36Sopenharmony_ci			list_del(&page_node->list);
182662306a36Sopenharmony_ci			DO_NUMA(page_node->nid = nid);
182762306a36Sopenharmony_ci			stable_node_chain_add_dup(page_node, stable_node);
182862306a36Sopenharmony_ci			if (is_page_sharing_candidate(page_node))
182962306a36Sopenharmony_ci				get_page(page);
183062306a36Sopenharmony_ci			else
183162306a36Sopenharmony_ci				page = NULL;
183262306a36Sopenharmony_ci		} else {
183362306a36Sopenharmony_ci			page = NULL;
183462306a36Sopenharmony_ci		}
183562306a36Sopenharmony_ci	}
183662306a36Sopenharmony_ci	stable_node_dup->head = &migrate_nodes;
183762306a36Sopenharmony_ci	list_add(&stable_node_dup->list, stable_node_dup->head);
183862306a36Sopenharmony_ci	return page;
183962306a36Sopenharmony_ci
184062306a36Sopenharmony_cichain_append:
184162306a36Sopenharmony_ci	/* stable_node_dup could be null if it reached the limit */
184262306a36Sopenharmony_ci	if (!stable_node_dup)
184362306a36Sopenharmony_ci		stable_node_dup = stable_node_any;
184462306a36Sopenharmony_ci	/*
184562306a36Sopenharmony_ci	 * If stable_node was a chain and chain_prune collapsed it,
184662306a36Sopenharmony_ci	 * stable_node has been updated to be the new regular
184762306a36Sopenharmony_ci	 * stable_node. A collapse of the chain is indistinguishable
184862306a36Sopenharmony_ci	 * from the case there was no chain in the stable
184962306a36Sopenharmony_ci	 * rbtree. Otherwise stable_node is the chain and
185062306a36Sopenharmony_ci	 * stable_node_dup is the dup to replace.
185162306a36Sopenharmony_ci	 */
185262306a36Sopenharmony_ci	if (stable_node_dup == stable_node) {
185362306a36Sopenharmony_ci		VM_BUG_ON(is_stable_node_dup(stable_node_dup));
185462306a36Sopenharmony_ci		/* chain is missing so create it */
185562306a36Sopenharmony_ci		stable_node = alloc_stable_node_chain(stable_node_dup,
185662306a36Sopenharmony_ci						      root);
185762306a36Sopenharmony_ci		if (!stable_node)
185862306a36Sopenharmony_ci			return NULL;
185962306a36Sopenharmony_ci	}
186062306a36Sopenharmony_ci	/*
186162306a36Sopenharmony_ci	 * Add this stable_node dup that was
186262306a36Sopenharmony_ci	 * migrated to the stable_node chain
186362306a36Sopenharmony_ci	 * of the current nid for this page
186462306a36Sopenharmony_ci	 * content.
186562306a36Sopenharmony_ci	 */
186662306a36Sopenharmony_ci	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
186762306a36Sopenharmony_ci	VM_BUG_ON(page_node->head != &migrate_nodes);
186862306a36Sopenharmony_ci	list_del(&page_node->list);
186962306a36Sopenharmony_ci	DO_NUMA(page_node->nid = nid);
187062306a36Sopenharmony_ci	stable_node_chain_add_dup(page_node, stable_node);
187162306a36Sopenharmony_ci	goto out;
187262306a36Sopenharmony_ci}
187362306a36Sopenharmony_ci
187462306a36Sopenharmony_ci/*
187562306a36Sopenharmony_ci * stable_tree_insert - insert stable tree node pointing to new ksm page
187662306a36Sopenharmony_ci * into the stable tree.
187762306a36Sopenharmony_ci *
187862306a36Sopenharmony_ci * This function returns the stable tree node just allocated on success,
187962306a36Sopenharmony_ci * NULL otherwise.
188062306a36Sopenharmony_ci */
188162306a36Sopenharmony_cistatic struct ksm_stable_node *stable_tree_insert(struct page *kpage)
188262306a36Sopenharmony_ci{
188362306a36Sopenharmony_ci	int nid;
188462306a36Sopenharmony_ci	unsigned long kpfn;
188562306a36Sopenharmony_ci	struct rb_root *root;
188662306a36Sopenharmony_ci	struct rb_node **new;
188762306a36Sopenharmony_ci	struct rb_node *parent;
188862306a36Sopenharmony_ci	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
188962306a36Sopenharmony_ci	bool need_chain = false;
189062306a36Sopenharmony_ci
189162306a36Sopenharmony_ci	kpfn = page_to_pfn(kpage);
189262306a36Sopenharmony_ci	nid = get_kpfn_nid(kpfn);
189362306a36Sopenharmony_ci	root = root_stable_tree + nid;
189462306a36Sopenharmony_ciagain:
189562306a36Sopenharmony_ci	parent = NULL;
189662306a36Sopenharmony_ci	new = &root->rb_node;
189762306a36Sopenharmony_ci
189862306a36Sopenharmony_ci	while (*new) {
189962306a36Sopenharmony_ci		struct page *tree_page;
190062306a36Sopenharmony_ci		int ret;
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci		cond_resched();
190362306a36Sopenharmony_ci		stable_node = rb_entry(*new, struct ksm_stable_node, node);
190462306a36Sopenharmony_ci		stable_node_any = NULL;
190562306a36Sopenharmony_ci		tree_page = chain(&stable_node_dup, stable_node, root);
190662306a36Sopenharmony_ci		if (!stable_node_dup) {
190762306a36Sopenharmony_ci			/*
190862306a36Sopenharmony_ci			 * Either all stable_node dups were full in
190962306a36Sopenharmony_ci			 * this stable_node chain, or this chain was
191062306a36Sopenharmony_ci			 * empty and should be rb_erased.
191162306a36Sopenharmony_ci			 */
191262306a36Sopenharmony_ci			stable_node_any = stable_node_dup_any(stable_node,
191362306a36Sopenharmony_ci							      root);
191462306a36Sopenharmony_ci			if (!stable_node_any) {
191562306a36Sopenharmony_ci				/* rb_erase just run */
191662306a36Sopenharmony_ci				goto again;
191762306a36Sopenharmony_ci			}
191862306a36Sopenharmony_ci			/*
191962306a36Sopenharmony_ci			 * Take any of the stable_node dups page of
192062306a36Sopenharmony_ci			 * this stable_node chain to let the tree walk
192162306a36Sopenharmony_ci			 * continue. All KSM pages belonging to the
192262306a36Sopenharmony_ci			 * stable_node dups in a stable_node chain
192362306a36Sopenharmony_ci			 * have the same content and they're
192462306a36Sopenharmony_ci			 * write protected at all times. Any will work
192562306a36Sopenharmony_ci			 * fine to continue the walk.
192662306a36Sopenharmony_ci			 */
192762306a36Sopenharmony_ci			tree_page = get_ksm_page(stable_node_any,
192862306a36Sopenharmony_ci						 GET_KSM_PAGE_NOLOCK);
192962306a36Sopenharmony_ci		}
193062306a36Sopenharmony_ci		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
193162306a36Sopenharmony_ci		if (!tree_page) {
193262306a36Sopenharmony_ci			/*
193362306a36Sopenharmony_ci			 * If we walked over a stale stable_node,
193462306a36Sopenharmony_ci			 * get_ksm_page() will call rb_erase() and it
193562306a36Sopenharmony_ci			 * may rebalance the tree from under us. So
193662306a36Sopenharmony_ci			 * restart the search from scratch. Returning
193762306a36Sopenharmony_ci			 * NULL would be safe too, but we'd generate
193862306a36Sopenharmony_ci			 * false negative insertions just because some
193962306a36Sopenharmony_ci			 * stable_node was stale.
194062306a36Sopenharmony_ci			 */
194162306a36Sopenharmony_ci			goto again;
194262306a36Sopenharmony_ci		}
194362306a36Sopenharmony_ci
194462306a36Sopenharmony_ci		ret = memcmp_pages(kpage, tree_page);
194562306a36Sopenharmony_ci		put_page(tree_page);
194662306a36Sopenharmony_ci
194762306a36Sopenharmony_ci		parent = *new;
194862306a36Sopenharmony_ci		if (ret < 0)
194962306a36Sopenharmony_ci			new = &parent->rb_left;
195062306a36Sopenharmony_ci		else if (ret > 0)
195162306a36Sopenharmony_ci			new = &parent->rb_right;
195262306a36Sopenharmony_ci		else {
195362306a36Sopenharmony_ci			need_chain = true;
195462306a36Sopenharmony_ci			break;
195562306a36Sopenharmony_ci		}
195662306a36Sopenharmony_ci	}
195762306a36Sopenharmony_ci
195862306a36Sopenharmony_ci	stable_node_dup = alloc_stable_node();
195962306a36Sopenharmony_ci	if (!stable_node_dup)
196062306a36Sopenharmony_ci		return NULL;
196162306a36Sopenharmony_ci
196262306a36Sopenharmony_ci	INIT_HLIST_HEAD(&stable_node_dup->hlist);
196362306a36Sopenharmony_ci	stable_node_dup->kpfn = kpfn;
196462306a36Sopenharmony_ci	set_page_stable_node(kpage, stable_node_dup);
196562306a36Sopenharmony_ci	stable_node_dup->rmap_hlist_len = 0;
196662306a36Sopenharmony_ci	DO_NUMA(stable_node_dup->nid = nid);
196762306a36Sopenharmony_ci	if (!need_chain) {
196862306a36Sopenharmony_ci		rb_link_node(&stable_node_dup->node, parent, new);
196962306a36Sopenharmony_ci		rb_insert_color(&stable_node_dup->node, root);
197062306a36Sopenharmony_ci	} else {
197162306a36Sopenharmony_ci		if (!is_stable_node_chain(stable_node)) {
197262306a36Sopenharmony_ci			struct ksm_stable_node *orig = stable_node;
197362306a36Sopenharmony_ci			/* chain is missing so create it */
197462306a36Sopenharmony_ci			stable_node = alloc_stable_node_chain(orig, root);
197562306a36Sopenharmony_ci			if (!stable_node) {
197662306a36Sopenharmony_ci				free_stable_node(stable_node_dup);
197762306a36Sopenharmony_ci				return NULL;
197862306a36Sopenharmony_ci			}
197962306a36Sopenharmony_ci		}
198062306a36Sopenharmony_ci		stable_node_chain_add_dup(stable_node_dup, stable_node);
198162306a36Sopenharmony_ci	}
198262306a36Sopenharmony_ci
198362306a36Sopenharmony_ci	return stable_node_dup;
198462306a36Sopenharmony_ci}
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_ci/*
198762306a36Sopenharmony_ci * unstable_tree_search_insert - search for identical page,
198862306a36Sopenharmony_ci * else insert rmap_item into the unstable tree.
198962306a36Sopenharmony_ci *
199062306a36Sopenharmony_ci * This function searches for a page in the unstable tree identical to the
199162306a36Sopenharmony_ci * page currently being scanned; and if no identical page is found in the
199262306a36Sopenharmony_ci * tree, we insert rmap_item as a new object into the unstable tree.
199362306a36Sopenharmony_ci *
199462306a36Sopenharmony_ci * This function returns pointer to rmap_item found to be identical
199562306a36Sopenharmony_ci * to the currently scanned page, NULL otherwise.
199662306a36Sopenharmony_ci *
199762306a36Sopenharmony_ci * This function does both searching and inserting, because they share
199862306a36Sopenharmony_ci * the same walking algorithm in an rbtree.
199962306a36Sopenharmony_ci */
200062306a36Sopenharmony_cistatic
200162306a36Sopenharmony_cistruct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
200262306a36Sopenharmony_ci					      struct page *page,
200362306a36Sopenharmony_ci					      struct page **tree_pagep)
200462306a36Sopenharmony_ci{
200562306a36Sopenharmony_ci	struct rb_node **new;
200662306a36Sopenharmony_ci	struct rb_root *root;
200762306a36Sopenharmony_ci	struct rb_node *parent = NULL;
200862306a36Sopenharmony_ci	int nid;
200962306a36Sopenharmony_ci
201062306a36Sopenharmony_ci	nid = get_kpfn_nid(page_to_pfn(page));
201162306a36Sopenharmony_ci	root = root_unstable_tree + nid;
201262306a36Sopenharmony_ci	new = &root->rb_node;
201362306a36Sopenharmony_ci
201462306a36Sopenharmony_ci	while (*new) {
201562306a36Sopenharmony_ci		struct ksm_rmap_item *tree_rmap_item;
201662306a36Sopenharmony_ci		struct page *tree_page;
201762306a36Sopenharmony_ci		int ret;
201862306a36Sopenharmony_ci
201962306a36Sopenharmony_ci		cond_resched();
202062306a36Sopenharmony_ci		tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
202162306a36Sopenharmony_ci		tree_page = get_mergeable_page(tree_rmap_item);
202262306a36Sopenharmony_ci		if (!tree_page)
202362306a36Sopenharmony_ci			return NULL;
202462306a36Sopenharmony_ci
202562306a36Sopenharmony_ci		/*
202662306a36Sopenharmony_ci		 * Don't substitute a ksm page for a forked page.
202762306a36Sopenharmony_ci		 */
202862306a36Sopenharmony_ci		if (page == tree_page) {
202962306a36Sopenharmony_ci			put_page(tree_page);
203062306a36Sopenharmony_ci			return NULL;
203162306a36Sopenharmony_ci		}
203262306a36Sopenharmony_ci
203362306a36Sopenharmony_ci		ret = memcmp_pages(page, tree_page);
203462306a36Sopenharmony_ci
203562306a36Sopenharmony_ci		parent = *new;
203662306a36Sopenharmony_ci		if (ret < 0) {
203762306a36Sopenharmony_ci			put_page(tree_page);
203862306a36Sopenharmony_ci			new = &parent->rb_left;
203962306a36Sopenharmony_ci		} else if (ret > 0) {
204062306a36Sopenharmony_ci			put_page(tree_page);
204162306a36Sopenharmony_ci			new = &parent->rb_right;
204262306a36Sopenharmony_ci		} else if (!ksm_merge_across_nodes &&
204362306a36Sopenharmony_ci			   page_to_nid(tree_page) != nid) {
204462306a36Sopenharmony_ci			/*
204562306a36Sopenharmony_ci			 * If tree_page has been migrated to another NUMA node,
204662306a36Sopenharmony_ci			 * it will be flushed out and put in the right unstable
204762306a36Sopenharmony_ci			 * tree next time: only merge with it when across_nodes.
204862306a36Sopenharmony_ci			 */
204962306a36Sopenharmony_ci			put_page(tree_page);
205062306a36Sopenharmony_ci			return NULL;
205162306a36Sopenharmony_ci		} else {
205262306a36Sopenharmony_ci			*tree_pagep = tree_page;
205362306a36Sopenharmony_ci			return tree_rmap_item;
205462306a36Sopenharmony_ci		}
205562306a36Sopenharmony_ci	}
205662306a36Sopenharmony_ci
205762306a36Sopenharmony_ci	rmap_item->address |= UNSTABLE_FLAG;
205862306a36Sopenharmony_ci	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
205962306a36Sopenharmony_ci	DO_NUMA(rmap_item->nid = nid);
206062306a36Sopenharmony_ci	rb_link_node(&rmap_item->node, parent, new);
206162306a36Sopenharmony_ci	rb_insert_color(&rmap_item->node, root);
206262306a36Sopenharmony_ci
206362306a36Sopenharmony_ci	ksm_pages_unshared++;
206462306a36Sopenharmony_ci	return NULL;
206562306a36Sopenharmony_ci}
206662306a36Sopenharmony_ci
206762306a36Sopenharmony_ci/*
206862306a36Sopenharmony_ci * stable_tree_append - add another rmap_item to the linked list of
206962306a36Sopenharmony_ci * rmap_items hanging off a given node of the stable tree, all sharing
207062306a36Sopenharmony_ci * the same ksm page.
207162306a36Sopenharmony_ci */
207262306a36Sopenharmony_cistatic void stable_tree_append(struct ksm_rmap_item *rmap_item,
207362306a36Sopenharmony_ci			       struct ksm_stable_node *stable_node,
207462306a36Sopenharmony_ci			       bool max_page_sharing_bypass)
207562306a36Sopenharmony_ci{
207662306a36Sopenharmony_ci	/*
207762306a36Sopenharmony_ci	 * rmap won't find this mapping if we don't insert the
207862306a36Sopenharmony_ci	 * rmap_item in the right stable_node
207962306a36Sopenharmony_ci	 * duplicate. page_migration could break later if rmap breaks,
208062306a36Sopenharmony_ci	 * so we can as well crash here. We really need to check for
208162306a36Sopenharmony_ci	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
208262306a36Sopenharmony_ci	 * for other negative values as an underflow if detected here
208362306a36Sopenharmony_ci	 * for the first time (and not when decreasing rmap_hlist_len)
208462306a36Sopenharmony_ci	 * would be sign of memory corruption in the stable_node.
208562306a36Sopenharmony_ci	 */
208662306a36Sopenharmony_ci	BUG_ON(stable_node->rmap_hlist_len < 0);
208762306a36Sopenharmony_ci
208862306a36Sopenharmony_ci	stable_node->rmap_hlist_len++;
208962306a36Sopenharmony_ci	if (!max_page_sharing_bypass)
209062306a36Sopenharmony_ci		/* possibly non fatal but unexpected overflow, only warn */
209162306a36Sopenharmony_ci		WARN_ON_ONCE(stable_node->rmap_hlist_len >
209262306a36Sopenharmony_ci			     ksm_max_page_sharing);
209362306a36Sopenharmony_ci
209462306a36Sopenharmony_ci	rmap_item->head = stable_node;
209562306a36Sopenharmony_ci	rmap_item->address |= STABLE_FLAG;
209662306a36Sopenharmony_ci	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
209762306a36Sopenharmony_ci
209862306a36Sopenharmony_ci	if (rmap_item->hlist.next)
209962306a36Sopenharmony_ci		ksm_pages_sharing++;
210062306a36Sopenharmony_ci	else
210162306a36Sopenharmony_ci		ksm_pages_shared++;
210262306a36Sopenharmony_ci
210362306a36Sopenharmony_ci	rmap_item->mm->ksm_merging_pages++;
210462306a36Sopenharmony_ci}
210562306a36Sopenharmony_ci
210662306a36Sopenharmony_ci/*
210762306a36Sopenharmony_ci * cmp_and_merge_page - first see if page can be merged into the stable tree;
210862306a36Sopenharmony_ci * if not, compare checksum to previous and if it's the same, see if page can
210962306a36Sopenharmony_ci * be inserted into the unstable tree, or merged with a page already there and
211062306a36Sopenharmony_ci * both transferred to the stable tree.
211162306a36Sopenharmony_ci *
211262306a36Sopenharmony_ci * @page: the page that we are searching identical page to.
211362306a36Sopenharmony_ci * @rmap_item: the reverse mapping into the virtual address of this page
211462306a36Sopenharmony_ci */
211562306a36Sopenharmony_cistatic void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
211662306a36Sopenharmony_ci{
211762306a36Sopenharmony_ci	struct mm_struct *mm = rmap_item->mm;
211862306a36Sopenharmony_ci	struct ksm_rmap_item *tree_rmap_item;
211962306a36Sopenharmony_ci	struct page *tree_page = NULL;
212062306a36Sopenharmony_ci	struct ksm_stable_node *stable_node;
212162306a36Sopenharmony_ci	struct page *kpage;
212262306a36Sopenharmony_ci	unsigned int checksum;
212362306a36Sopenharmony_ci	int err;
212462306a36Sopenharmony_ci	bool max_page_sharing_bypass = false;
212562306a36Sopenharmony_ci
212662306a36Sopenharmony_ci	stable_node = page_stable_node(page);
212762306a36Sopenharmony_ci	if (stable_node) {
212862306a36Sopenharmony_ci		if (stable_node->head != &migrate_nodes &&
212962306a36Sopenharmony_ci		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
213062306a36Sopenharmony_ci		    NUMA(stable_node->nid)) {
213162306a36Sopenharmony_ci			stable_node_dup_del(stable_node);
213262306a36Sopenharmony_ci			stable_node->head = &migrate_nodes;
213362306a36Sopenharmony_ci			list_add(&stable_node->list, stable_node->head);
213462306a36Sopenharmony_ci		}
213562306a36Sopenharmony_ci		if (stable_node->head != &migrate_nodes &&
213662306a36Sopenharmony_ci		    rmap_item->head == stable_node)
213762306a36Sopenharmony_ci			return;
213862306a36Sopenharmony_ci		/*
213962306a36Sopenharmony_ci		 * If it's a KSM fork, allow it to go over the sharing limit
214062306a36Sopenharmony_ci		 * without warnings.
214162306a36Sopenharmony_ci		 */
214262306a36Sopenharmony_ci		if (!is_page_sharing_candidate(stable_node))
214362306a36Sopenharmony_ci			max_page_sharing_bypass = true;
214462306a36Sopenharmony_ci	}
214562306a36Sopenharmony_ci
214662306a36Sopenharmony_ci	/* We first start with searching the page inside the stable tree */
214762306a36Sopenharmony_ci	kpage = stable_tree_search(page);
214862306a36Sopenharmony_ci	if (kpage == page && rmap_item->head == stable_node) {
214962306a36Sopenharmony_ci		put_page(kpage);
215062306a36Sopenharmony_ci		return;
215162306a36Sopenharmony_ci	}
215262306a36Sopenharmony_ci
215362306a36Sopenharmony_ci	remove_rmap_item_from_tree(rmap_item);
215462306a36Sopenharmony_ci
215562306a36Sopenharmony_ci	if (kpage) {
215662306a36Sopenharmony_ci		if (PTR_ERR(kpage) == -EBUSY)
215762306a36Sopenharmony_ci			return;
215862306a36Sopenharmony_ci
215962306a36Sopenharmony_ci		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
216062306a36Sopenharmony_ci		if (!err) {
216162306a36Sopenharmony_ci			/*
216262306a36Sopenharmony_ci			 * The page was successfully merged:
216362306a36Sopenharmony_ci			 * add its rmap_item to the stable tree.
216462306a36Sopenharmony_ci			 */
216562306a36Sopenharmony_ci			lock_page(kpage);
216662306a36Sopenharmony_ci			stable_tree_append(rmap_item, page_stable_node(kpage),
216762306a36Sopenharmony_ci					   max_page_sharing_bypass);
216862306a36Sopenharmony_ci			unlock_page(kpage);
216962306a36Sopenharmony_ci		}
217062306a36Sopenharmony_ci		put_page(kpage);
217162306a36Sopenharmony_ci		return;
217262306a36Sopenharmony_ci	}
217362306a36Sopenharmony_ci
217462306a36Sopenharmony_ci	/*
217562306a36Sopenharmony_ci	 * If the hash value of the page has changed from the last time
217662306a36Sopenharmony_ci	 * we calculated it, this page is changing frequently: therefore we
217762306a36Sopenharmony_ci	 * don't want to insert it in the unstable tree, and we don't want
217862306a36Sopenharmony_ci	 * to waste our time searching for something identical to it there.
217962306a36Sopenharmony_ci	 */
218062306a36Sopenharmony_ci	checksum = calc_checksum(page);
218162306a36Sopenharmony_ci	if (rmap_item->oldchecksum != checksum) {
218262306a36Sopenharmony_ci		rmap_item->oldchecksum = checksum;
218362306a36Sopenharmony_ci		return;
218462306a36Sopenharmony_ci	}
218562306a36Sopenharmony_ci
218662306a36Sopenharmony_ci	/*
218762306a36Sopenharmony_ci	 * Same checksum as an empty page. We attempt to merge it with the
218862306a36Sopenharmony_ci	 * appropriate zero page if the user enabled this via sysfs.
218962306a36Sopenharmony_ci	 */
219062306a36Sopenharmony_ci	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
219162306a36Sopenharmony_ci		struct vm_area_struct *vma;
219262306a36Sopenharmony_ci
219362306a36Sopenharmony_ci		mmap_read_lock(mm);
219462306a36Sopenharmony_ci		vma = find_mergeable_vma(mm, rmap_item->address);
219562306a36Sopenharmony_ci		if (vma) {
219662306a36Sopenharmony_ci			err = try_to_merge_one_page(vma, page,
219762306a36Sopenharmony_ci					ZERO_PAGE(rmap_item->address));
219862306a36Sopenharmony_ci			trace_ksm_merge_one_page(
219962306a36Sopenharmony_ci				page_to_pfn(ZERO_PAGE(rmap_item->address)),
220062306a36Sopenharmony_ci				rmap_item, mm, err);
220162306a36Sopenharmony_ci		} else {
220262306a36Sopenharmony_ci			/*
220362306a36Sopenharmony_ci			 * If the vma is out of date, we do not need to
220462306a36Sopenharmony_ci			 * continue.
220562306a36Sopenharmony_ci			 */
220662306a36Sopenharmony_ci			err = 0;
220762306a36Sopenharmony_ci		}
220862306a36Sopenharmony_ci		mmap_read_unlock(mm);
220962306a36Sopenharmony_ci		/*
221062306a36Sopenharmony_ci		 * In case of failure, the page was not really empty, so we
221162306a36Sopenharmony_ci		 * need to continue. Otherwise we're done.
221262306a36Sopenharmony_ci		 */
221362306a36Sopenharmony_ci		if (!err)
221462306a36Sopenharmony_ci			return;
221562306a36Sopenharmony_ci	}
221662306a36Sopenharmony_ci	tree_rmap_item =
221762306a36Sopenharmony_ci		unstable_tree_search_insert(rmap_item, page, &tree_page);
221862306a36Sopenharmony_ci	if (tree_rmap_item) {
221962306a36Sopenharmony_ci		bool split;
222062306a36Sopenharmony_ci
222162306a36Sopenharmony_ci		kpage = try_to_merge_two_pages(rmap_item, page,
222262306a36Sopenharmony_ci						tree_rmap_item, tree_page);
222362306a36Sopenharmony_ci		/*
222462306a36Sopenharmony_ci		 * If both pages we tried to merge belong to the same compound
222562306a36Sopenharmony_ci		 * page, then we actually ended up increasing the reference
222662306a36Sopenharmony_ci		 * count of the same compound page twice, and split_huge_page
222762306a36Sopenharmony_ci		 * failed.
222862306a36Sopenharmony_ci		 * Here we set a flag if that happened, and we use it later to
222962306a36Sopenharmony_ci		 * try split_huge_page again. Since we call put_page right
223062306a36Sopenharmony_ci		 * afterwards, the reference count will be correct and
223162306a36Sopenharmony_ci		 * split_huge_page should succeed.
223262306a36Sopenharmony_ci		 */
223362306a36Sopenharmony_ci		split = PageTransCompound(page)
223462306a36Sopenharmony_ci			&& compound_head(page) == compound_head(tree_page);
223562306a36Sopenharmony_ci		put_page(tree_page);
223662306a36Sopenharmony_ci		if (kpage) {
223762306a36Sopenharmony_ci			/*
223862306a36Sopenharmony_ci			 * The pages were successfully merged: insert new
223962306a36Sopenharmony_ci			 * node in the stable tree and add both rmap_items.
224062306a36Sopenharmony_ci			 */
224162306a36Sopenharmony_ci			lock_page(kpage);
224262306a36Sopenharmony_ci			stable_node = stable_tree_insert(kpage);
224362306a36Sopenharmony_ci			if (stable_node) {
224462306a36Sopenharmony_ci				stable_tree_append(tree_rmap_item, stable_node,
224562306a36Sopenharmony_ci						   false);
224662306a36Sopenharmony_ci				stable_tree_append(rmap_item, stable_node,
224762306a36Sopenharmony_ci						   false);
224862306a36Sopenharmony_ci			}
224962306a36Sopenharmony_ci			unlock_page(kpage);
225062306a36Sopenharmony_ci
225162306a36Sopenharmony_ci			/*
225262306a36Sopenharmony_ci			 * If we fail to insert the page into the stable tree,
225362306a36Sopenharmony_ci			 * we will have 2 virtual addresses that are pointing
225462306a36Sopenharmony_ci			 * to a ksm page left outside the stable tree,
225562306a36Sopenharmony_ci			 * in which case we need to break_cow on both.
225662306a36Sopenharmony_ci			 */
225762306a36Sopenharmony_ci			if (!stable_node) {
225862306a36Sopenharmony_ci				break_cow(tree_rmap_item);
225962306a36Sopenharmony_ci				break_cow(rmap_item);
226062306a36Sopenharmony_ci			}
226162306a36Sopenharmony_ci		} else if (split) {
226262306a36Sopenharmony_ci			/*
226362306a36Sopenharmony_ci			 * We are here if we tried to merge two pages and
226462306a36Sopenharmony_ci			 * failed because they both belonged to the same
226562306a36Sopenharmony_ci			 * compound page. We will split the page now, but no
226662306a36Sopenharmony_ci			 * merging will take place.
226762306a36Sopenharmony_ci			 * We do not want to add the cost of a full lock; if
226862306a36Sopenharmony_ci			 * the page is locked, it is better to skip it and
226962306a36Sopenharmony_ci			 * perhaps try again later.
227062306a36Sopenharmony_ci			 */
227162306a36Sopenharmony_ci			if (!trylock_page(page))
227262306a36Sopenharmony_ci				return;
227362306a36Sopenharmony_ci			split_huge_page(page);
227462306a36Sopenharmony_ci			unlock_page(page);
227562306a36Sopenharmony_ci		}
227662306a36Sopenharmony_ci	}
227762306a36Sopenharmony_ci}
227862306a36Sopenharmony_ci
227962306a36Sopenharmony_cistatic struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
228062306a36Sopenharmony_ci					    struct ksm_rmap_item **rmap_list,
228162306a36Sopenharmony_ci					    unsigned long addr)
228262306a36Sopenharmony_ci{
228362306a36Sopenharmony_ci	struct ksm_rmap_item *rmap_item;
228462306a36Sopenharmony_ci
228562306a36Sopenharmony_ci	while (*rmap_list) {
228662306a36Sopenharmony_ci		rmap_item = *rmap_list;
228762306a36Sopenharmony_ci		if ((rmap_item->address & PAGE_MASK) == addr)
228862306a36Sopenharmony_ci			return rmap_item;
228962306a36Sopenharmony_ci		if (rmap_item->address > addr)
229062306a36Sopenharmony_ci			break;
229162306a36Sopenharmony_ci		*rmap_list = rmap_item->rmap_list;
229262306a36Sopenharmony_ci		remove_rmap_item_from_tree(rmap_item);
229362306a36Sopenharmony_ci		free_rmap_item(rmap_item);
229462306a36Sopenharmony_ci	}
229562306a36Sopenharmony_ci
229662306a36Sopenharmony_ci	rmap_item = alloc_rmap_item();
229762306a36Sopenharmony_ci	if (rmap_item) {
229862306a36Sopenharmony_ci		/* It has already been zeroed */
229962306a36Sopenharmony_ci		rmap_item->mm = mm_slot->slot.mm;
230062306a36Sopenharmony_ci		rmap_item->mm->ksm_rmap_items++;
230162306a36Sopenharmony_ci		rmap_item->address = addr;
230262306a36Sopenharmony_ci		rmap_item->rmap_list = *rmap_list;
230362306a36Sopenharmony_ci		*rmap_list = rmap_item;
230462306a36Sopenharmony_ci	}
230562306a36Sopenharmony_ci	return rmap_item;
230662306a36Sopenharmony_ci}
230762306a36Sopenharmony_ci
/*
 * scan_get_next_rmap_item - advance the global KSM scan cursor by one page.
 * @page: output; on success holds a reference (taken via FOLL_GET) to the
 *        anon page found at the cursor, which the caller must put_page().
 *
 * Walks ksm_scan (mm_slot / address / rmap_list) across every VM_MERGEABLE
 * vma of every mm on the KSM list.  Returns the rmap_item for the next anon
 * page to examine, or NULL once a full pass over the list completes (in
 * which case ksm_scan.seqnr is incremented).
 */
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct ksm_rmap_item *rmap_item;
	struct vma_iterator vmi;
	int nid;

	if (list_empty(&ksm_mm_head.slot.mm_node))
		return NULL;

	mm_slot = ksm_scan.mm_slot;
	if (mm_slot == &ksm_mm_head) {
		/* Cursor is at the list head: this is the start of a new pass */
		trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);

		/*
		 * A number of pages can hang around indefinitely in per-cpu
		 * LRU cache, raised page count preventing write_protect_page
		 * from merging them.  Though it doesn't really matter much,
		 * it is puzzling to see some stuck in pages_volatile until
		 * other activity jostles them out, and they also prevented
		 * LTP's KSM test from succeeding deterministically; so drain
		 * them here (here rather than on entry to ksm_do_scan(),
		 * so we don't IPI too often when pages_to_scan is set low).
		 */
		lru_add_drain_all();

		/*
		 * Whereas stale stable_nodes on the stable_tree itself
		 * get pruned in the regular course of stable_tree_search(),
		 * those moved out to the migrate_nodes list can accumulate:
		 * so prune them once before each full scan.
		 */
		if (!ksm_merge_across_nodes) {
			struct ksm_stable_node *stable_node, *next;
			struct page *page;

			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				/* get_ksm_page() prunes a stale node itself */
				page = get_ksm_page(stable_node,
						    GET_KSM_PAGE_NOLOCK);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		/* The unstable trees are rebuilt from scratch on every pass */
		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(mm_slot->slot.mm_node.next,
				  struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
		ksm_scan.mm_slot = mm_slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (mm_slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &mm_slot->rmap_list;
	}

	slot = &mm_slot->slot;
	mm = slot->mm;
	vma_iter_init(&vmi, mm, ksm_scan.address);

	mmap_read_lock(mm);
	if (ksm_test_exit(mm))
		goto no_vmas;

	for_each_vma(vmi, vma) {
		/* Only vmas opted into KSM are scanned */
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			/* No anon pages instantiated yet: skip the whole vma */
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				/* Nothing mapped here (or error): next page */
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (is_zone_device_page(*page))
				goto next_page;
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(mm_slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					/* Allocation failed: drop page ref */
					put_page(*page);
				/* Page reference (if any) passes to caller */
				mmap_read_unlock(mm);
				return rmap_item;
			}
next_page:
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
no_vmas:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &mm_slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	slot = list_entry(mm_slot->slot.mm_node.next,
			  struct mm_slot, mm_node);
	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_lock
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_lock then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&mm_slot->slot.hash);
		list_del(&mm_slot->slot.mm_node);
		spin_unlock(&ksm_mmlist_lock);

		mm_slot_free(mm_slot_cache, mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
		mmap_read_unlock(mm);
		mmdrop(mm);
	} else {
		mmap_read_unlock(mm);
		/*
		 * mmap_read_unlock(mm) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
		 */
		spin_unlock(&ksm_mmlist_lock);
	}

	/* Repeat until we've completed scanning the whole list */
	mm_slot = ksm_scan.mm_slot;
	if (mm_slot != &ksm_mm_head)
		goto next_mm;

	/* Back at the list head: the pass is complete */
	trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
	ksm_scan.seqnr++;
	return NULL;
}
248062306a36Sopenharmony_ci
248162306a36Sopenharmony_ci/**
248262306a36Sopenharmony_ci * ksm_do_scan  - the ksm scanner main worker function.
248362306a36Sopenharmony_ci * @scan_npages:  number of pages we want to scan before we return.
248462306a36Sopenharmony_ci */
248562306a36Sopenharmony_cistatic void ksm_do_scan(unsigned int scan_npages)
248662306a36Sopenharmony_ci{
248762306a36Sopenharmony_ci	struct ksm_rmap_item *rmap_item;
248862306a36Sopenharmony_ci	struct page *page;
248962306a36Sopenharmony_ci	unsigned int npages = scan_npages;
249062306a36Sopenharmony_ci
249162306a36Sopenharmony_ci	while (npages-- && likely(!freezing(current))) {
249262306a36Sopenharmony_ci		cond_resched();
249362306a36Sopenharmony_ci		rmap_item = scan_get_next_rmap_item(&page);
249462306a36Sopenharmony_ci		if (!rmap_item)
249562306a36Sopenharmony_ci			return;
249662306a36Sopenharmony_ci		cmp_and_merge_page(page, rmap_item);
249762306a36Sopenharmony_ci		put_page(page);
249862306a36Sopenharmony_ci	}
249962306a36Sopenharmony_ci
250062306a36Sopenharmony_ci	ksm_pages_scanned += scan_npages - npages;
250162306a36Sopenharmony_ci}
250262306a36Sopenharmony_ci
250362306a36Sopenharmony_cistatic int ksmd_should_run(void)
250462306a36Sopenharmony_ci{
250562306a36Sopenharmony_ci	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
250662306a36Sopenharmony_ci}
250762306a36Sopenharmony_ci
/*
 * ksm_scan_thread - main loop of the ksmd kernel thread.
 *
 * Each iteration scans up to ksm_thread_pages_to_scan pages under
 * ksm_thread_mutex (which also serializes against memory hotplug via
 * wait_while_offlining()), then sleeps: for sleep_millisecs when there is
 * more work, or until woken on ksm_thread_wait when there is none.
 */
static int ksm_scan_thread(void *nothing)
{
	unsigned int sleep_ms;

	set_freezable();
	set_user_nice(current, 5);	/* run at slightly reduced priority */

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			/*
			 * Sleep for the configured interval, but wake early
			 * (via ksm_iter_wait) if sleep_millisecs is changed
			 * through sysfs while we wait.
			 */
			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
			wait_event_interruptible_timeout(ksm_iter_wait,
				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
				msecs_to_jiffies(sleep_ms));
		} else {
			/* Nothing to scan: sleep until work arrives or stop */
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}
253662306a36Sopenharmony_ci
253762306a36Sopenharmony_cistatic void __ksm_add_vma(struct vm_area_struct *vma)
253862306a36Sopenharmony_ci{
253962306a36Sopenharmony_ci	unsigned long vm_flags = vma->vm_flags;
254062306a36Sopenharmony_ci
254162306a36Sopenharmony_ci	if (vm_flags & VM_MERGEABLE)
254262306a36Sopenharmony_ci		return;
254362306a36Sopenharmony_ci
254462306a36Sopenharmony_ci	if (vma_ksm_compatible(vma))
254562306a36Sopenharmony_ci		vm_flags_set(vma, VM_MERGEABLE);
254662306a36Sopenharmony_ci}
254762306a36Sopenharmony_ci
254862306a36Sopenharmony_cistatic int __ksm_del_vma(struct vm_area_struct *vma)
254962306a36Sopenharmony_ci{
255062306a36Sopenharmony_ci	int err;
255162306a36Sopenharmony_ci
255262306a36Sopenharmony_ci	if (!(vma->vm_flags & VM_MERGEABLE))
255362306a36Sopenharmony_ci		return 0;
255462306a36Sopenharmony_ci
255562306a36Sopenharmony_ci	if (vma->anon_vma) {
255662306a36Sopenharmony_ci		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
255762306a36Sopenharmony_ci		if (err)
255862306a36Sopenharmony_ci			return err;
255962306a36Sopenharmony_ci	}
256062306a36Sopenharmony_ci
256162306a36Sopenharmony_ci	vm_flags_clear(vma, VM_MERGEABLE);
256262306a36Sopenharmony_ci	return 0;
256362306a36Sopenharmony_ci}
256462306a36Sopenharmony_ci/**
256562306a36Sopenharmony_ci * ksm_add_vma - Mark vma as mergeable if compatible
256662306a36Sopenharmony_ci *
256762306a36Sopenharmony_ci * @vma:  Pointer to vma
256862306a36Sopenharmony_ci */
256962306a36Sopenharmony_civoid ksm_add_vma(struct vm_area_struct *vma)
257062306a36Sopenharmony_ci{
257162306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
257262306a36Sopenharmony_ci
257362306a36Sopenharmony_ci	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
257462306a36Sopenharmony_ci		__ksm_add_vma(vma);
257562306a36Sopenharmony_ci}
257662306a36Sopenharmony_ci
257762306a36Sopenharmony_cistatic void ksm_add_vmas(struct mm_struct *mm)
257862306a36Sopenharmony_ci{
257962306a36Sopenharmony_ci	struct vm_area_struct *vma;
258062306a36Sopenharmony_ci
258162306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, 0);
258262306a36Sopenharmony_ci	for_each_vma(vmi, vma)
258362306a36Sopenharmony_ci		__ksm_add_vma(vma);
258462306a36Sopenharmony_ci}
258562306a36Sopenharmony_ci
258662306a36Sopenharmony_cistatic int ksm_del_vmas(struct mm_struct *mm)
258762306a36Sopenharmony_ci{
258862306a36Sopenharmony_ci	struct vm_area_struct *vma;
258962306a36Sopenharmony_ci	int err;
259062306a36Sopenharmony_ci
259162306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, 0);
259262306a36Sopenharmony_ci	for_each_vma(vmi, vma) {
259362306a36Sopenharmony_ci		err = __ksm_del_vma(vma);
259462306a36Sopenharmony_ci		if (err)
259562306a36Sopenharmony_ci			return err;
259662306a36Sopenharmony_ci	}
259762306a36Sopenharmony_ci	return 0;
259862306a36Sopenharmony_ci}
259962306a36Sopenharmony_ci
260062306a36Sopenharmony_ci/**
260162306a36Sopenharmony_ci * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
260262306a36Sopenharmony_ci *                        compatible VMA's
260362306a36Sopenharmony_ci *
260462306a36Sopenharmony_ci * @mm:  Pointer to mm
260562306a36Sopenharmony_ci *
260662306a36Sopenharmony_ci * Returns 0 on success, otherwise error code
260762306a36Sopenharmony_ci */
260862306a36Sopenharmony_ciint ksm_enable_merge_any(struct mm_struct *mm)
260962306a36Sopenharmony_ci{
261062306a36Sopenharmony_ci	int err;
261162306a36Sopenharmony_ci
261262306a36Sopenharmony_ci	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
261362306a36Sopenharmony_ci		return 0;
261462306a36Sopenharmony_ci
261562306a36Sopenharmony_ci	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
261662306a36Sopenharmony_ci		err = __ksm_enter(mm);
261762306a36Sopenharmony_ci		if (err)
261862306a36Sopenharmony_ci			return err;
261962306a36Sopenharmony_ci	}
262062306a36Sopenharmony_ci
262162306a36Sopenharmony_ci	set_bit(MMF_VM_MERGE_ANY, &mm->flags);
262262306a36Sopenharmony_ci	ksm_add_vmas(mm);
262362306a36Sopenharmony_ci
262462306a36Sopenharmony_ci	return 0;
262562306a36Sopenharmony_ci}
262662306a36Sopenharmony_ci
262762306a36Sopenharmony_ci/**
262862306a36Sopenharmony_ci * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
262962306a36Sopenharmony_ci *			   previously enabled via ksm_enable_merge_any().
263062306a36Sopenharmony_ci *
263162306a36Sopenharmony_ci * Disabling merging implies unmerging any merged pages, like setting
263262306a36Sopenharmony_ci * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
263362306a36Sopenharmony_ci * merging on all compatible VMA's remains enabled.
263462306a36Sopenharmony_ci *
263562306a36Sopenharmony_ci * @mm: Pointer to mm
263662306a36Sopenharmony_ci *
263762306a36Sopenharmony_ci * Returns 0 on success, otherwise error code
263862306a36Sopenharmony_ci */
263962306a36Sopenharmony_ciint ksm_disable_merge_any(struct mm_struct *mm)
264062306a36Sopenharmony_ci{
264162306a36Sopenharmony_ci	int err;
264262306a36Sopenharmony_ci
264362306a36Sopenharmony_ci	if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
264462306a36Sopenharmony_ci		return 0;
264562306a36Sopenharmony_ci
264662306a36Sopenharmony_ci	err = ksm_del_vmas(mm);
264762306a36Sopenharmony_ci	if (err) {
264862306a36Sopenharmony_ci		ksm_add_vmas(mm);
264962306a36Sopenharmony_ci		return err;
265062306a36Sopenharmony_ci	}
265162306a36Sopenharmony_ci
265262306a36Sopenharmony_ci	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
265362306a36Sopenharmony_ci	return 0;
265462306a36Sopenharmony_ci}
265562306a36Sopenharmony_ci
265662306a36Sopenharmony_ciint ksm_disable(struct mm_struct *mm)
265762306a36Sopenharmony_ci{
265862306a36Sopenharmony_ci	mmap_assert_write_locked(mm);
265962306a36Sopenharmony_ci
266062306a36Sopenharmony_ci	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
266162306a36Sopenharmony_ci		return 0;
266262306a36Sopenharmony_ci	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
266362306a36Sopenharmony_ci		return ksm_disable_merge_any(mm);
266462306a36Sopenharmony_ci	return ksm_del_vmas(mm);
266562306a36Sopenharmony_ci}
266662306a36Sopenharmony_ci
/*
 * ksm_madvise - handle MADV_MERGEABLE / MADV_UNMERGEABLE for one vma.
 * @vma:      the vma being advised
 * @start:    start of the advised range (used only for unmerge)
 * @end:      end of the advised range (used only for unmerge)
 * @advice:   MADV_MERGEABLE or MADV_UNMERGEABLE; anything else is ignored
 * @vm_flags: caller's working copy of the vma flags, updated in place
 *
 * Returns 0 on success (including "advice silently ignored"), or a negative
 * errno from __ksm_enter() / unmerge_ksm_pages().
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/* Already mergeable, or incompatible vma: quietly accept */
		if (vma->vm_flags & VM_MERGEABLE)
			return 0;
		if (!vma_ksm_compatible(vma))
			return 0;

		/* First mergeable vma of this mm: enroll the mm with ksmd */
		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		/* Unmerge first; on failure the vma stays VM_MERGEABLE */
		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end, true);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);
270662306a36Sopenharmony_ci
/*
 * __ksm_enter - enroll @mm with the KSM daemon.
 *
 * Allocates an mm_slot, hashes it, links it into the scan list, sets
 * MMF_VM_MERGEABLE and takes an mm reference (dropped when the slot is
 * freed).  Wakes ksmd if the list was previously empty.
 *
 * Returns 0 on success, -ENOMEM if the slot cannot be allocated.
 */
int __ksm_enter(struct mm_struct *mm)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	int needs_wakeup;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return -ENOMEM;

	slot = &mm_slot->slot;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);

	spin_lock(&ksm_mmlist_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
	else
		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	mmgrab(mm);	/* slot holds a reference until ksmd/__ksm_exit free it */

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	trace_ksm_enter(mm);
	return 0;
}
274962306a36Sopenharmony_ci
/*
 * __ksm_exit - detach an exiting @mm from KSM.
 *
 * Frees the mm_slot immediately in the easy case; otherwise leaves it for
 * ksmd and uses a write-lock/unlock of mmap_lock as a barrier against
 * concurrent break_cows before the pagetables go away.
 */
void __ksm_exit(struct mm_struct *mm)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_lock to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			/* No rmap_items and not at the cursor: free it here */
			hash_del(&slot->hash);
			list_del(&slot->mm_node);
			easy_to_free = 1;
		} else {
			/*
			 * Move the slot just past the cursor so ksmd visits
			 * it next and cleans it up (ksm_scan.address == 0
			 * path in scan_get_next_rmap_item()).
			 */
			list_move(&slot->mm_node,
				  &ksm_scan.mm_slot->slot.mm_node);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		mm_slot_free(mm_slot_cache, mm_slot);
		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);	/* drop the reference taken in __ksm_enter() */
	} else if (mm_slot) {
		/* Barrier: wait out any break_cow holding mmap_read_lock */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}

	trace_ksm_exit(mm);
}
279262306a36Sopenharmony_ci
/*
 * ksm_might_need_to_copy - decide whether a swapped-in page must be copied
 * before being mapped into @vma at @address.
 *
 * Returns:
 *  - @page itself when it can be reused directly (a stable KSM page while
 *    not unmerging, a page with no anon_vma, or a page that already matches
 *    this vma's index and anon_vma root, or a !Uptodate page left for
 *    do_swap_page to report);
 *  - a newly allocated, locked, dirty, uptodate copy otherwise;
 *  - NULL if allocation or memcg charging fails;
 *  - ERR_PTR(-EHWPOISON) if the source page is (or copies as) hwpoisoned.
 */
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct folio *folio = page_folio(page);
	struct anon_vma *anon_vma = folio_anon_vma(folio);
	struct page *new_page;

	if (PageKsm(page)) {
		/* Stable KSM page can stay shared unless we are unmerging */
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (page->index == linear_page_index(vma, address) &&
			anon_vma->root == vma->anon_vma->root) {
		return page;		/* still no need to copy it */
	}
	if (PageHWPoison(page))
		return ERR_PTR(-EHWPOISON);
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page &&
	    mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
		put_page(new_page);
		new_page = NULL;
	}
	if (new_page) {
		/* Machine-check-safe copy: poison may fire during the copy */
		if (copy_mc_user_highpage(new_page, page, address, vma)) {
			put_page(new_page);
			memory_failure_queue(page_to_pfn(page), 0);
			return ERR_PTR(-EHWPOISON);
		}
		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);	/* caller expects a locked page */
#ifdef CONFIG_SWAP
		count_vm_event(KSM_SWPIN_COPY);
#endif
	}

	return new_page;
}
283762306a36Sopenharmony_ci
/*
 * rmap_walk_ksm - rmap walk for a KSM folio.
 *
 * Visits every (vma, address) mapping of the folio by iterating the
 * rmap_items hanging off its stable node, calling rwc->rmap_one for each.
 * The walk is done twice: first only the vma owning each rmap_item, then
 * (if not terminated) covering vmas in other mms forked since ksmd scanned.
 */
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
	struct ksm_stable_node *stable_node;
	struct ksm_rmap_item *rmap_item;
	int search_new_forks = 0;

	VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	stable_node = folio_stable_node(folio);
	if (!stable_node)
		return;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		if (!anon_vma_trylock_read(anon_vma)) {
			/* Caller may prefer bailing out over blocking */
			if (rwc->try_lock) {
				rwc->contended = true;
				return;
			}
			anon_vma_lock_read(anon_vma);
		}
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			unsigned long addr;

			cond_resched();
			vma = vmac->vma;

			/* Ignore the stable/unstable/sqnr flags */
			addr = rmap_item->address & PAGE_MASK;

			if (addr < vma->vm_start || addr >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
			if (rwc->done && rwc->done(folio)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	/* Second pass: look at forked mms we skipped the first time */
	if (!search_new_forks++)
		goto again;
}
290762306a36Sopenharmony_ci
290862306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE
290962306a36Sopenharmony_ci/*
291062306a36Sopenharmony_ci * Collect processes when the error hit an ksm page.
291162306a36Sopenharmony_ci */
291262306a36Sopenharmony_civoid collect_procs_ksm(struct page *page, struct list_head *to_kill,
291362306a36Sopenharmony_ci		       int force_early)
291462306a36Sopenharmony_ci{
291562306a36Sopenharmony_ci	struct ksm_stable_node *stable_node;
291662306a36Sopenharmony_ci	struct ksm_rmap_item *rmap_item;
291762306a36Sopenharmony_ci	struct folio *folio = page_folio(page);
291862306a36Sopenharmony_ci	struct vm_area_struct *vma;
291962306a36Sopenharmony_ci	struct task_struct *tsk;
292062306a36Sopenharmony_ci
292162306a36Sopenharmony_ci	stable_node = folio_stable_node(folio);
292262306a36Sopenharmony_ci	if (!stable_node)
292362306a36Sopenharmony_ci		return;
292462306a36Sopenharmony_ci	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
292562306a36Sopenharmony_ci		struct anon_vma *av = rmap_item->anon_vma;
292662306a36Sopenharmony_ci
292762306a36Sopenharmony_ci		anon_vma_lock_read(av);
292862306a36Sopenharmony_ci		rcu_read_lock();
292962306a36Sopenharmony_ci		for_each_process(tsk) {
293062306a36Sopenharmony_ci			struct anon_vma_chain *vmac;
293162306a36Sopenharmony_ci			unsigned long addr;
293262306a36Sopenharmony_ci			struct task_struct *t =
293362306a36Sopenharmony_ci				task_early_kill(tsk, force_early);
293462306a36Sopenharmony_ci			if (!t)
293562306a36Sopenharmony_ci				continue;
293662306a36Sopenharmony_ci			anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
293762306a36Sopenharmony_ci						       ULONG_MAX)
293862306a36Sopenharmony_ci			{
293962306a36Sopenharmony_ci				vma = vmac->vma;
294062306a36Sopenharmony_ci				if (vma->vm_mm == t->mm) {
294162306a36Sopenharmony_ci					addr = rmap_item->address & PAGE_MASK;
294262306a36Sopenharmony_ci					add_to_kill_ksm(t, page, vma, to_kill,
294362306a36Sopenharmony_ci							addr);
294462306a36Sopenharmony_ci				}
294562306a36Sopenharmony_ci			}
294662306a36Sopenharmony_ci		}
294762306a36Sopenharmony_ci		rcu_read_unlock();
294862306a36Sopenharmony_ci		anon_vma_unlock_read(av);
294962306a36Sopenharmony_ci	}
295062306a36Sopenharmony_ci}
295162306a36Sopenharmony_ci#endif
295262306a36Sopenharmony_ci
295362306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION
295462306a36Sopenharmony_civoid folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
295562306a36Sopenharmony_ci{
295662306a36Sopenharmony_ci	struct ksm_stable_node *stable_node;
295762306a36Sopenharmony_ci
295862306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
295962306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
296062306a36Sopenharmony_ci	VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
296162306a36Sopenharmony_ci
296262306a36Sopenharmony_ci	stable_node = folio_stable_node(folio);
296362306a36Sopenharmony_ci	if (stable_node) {
296462306a36Sopenharmony_ci		VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
296562306a36Sopenharmony_ci		stable_node->kpfn = folio_pfn(newfolio);
296662306a36Sopenharmony_ci		/*
296762306a36Sopenharmony_ci		 * newfolio->mapping was set in advance; now we need smp_wmb()
296862306a36Sopenharmony_ci		 * to make sure that the new stable_node->kpfn is visible
296962306a36Sopenharmony_ci		 * to get_ksm_page() before it can see that folio->mapping
297062306a36Sopenharmony_ci		 * has gone stale (or that folio_test_swapcache has been cleared).
297162306a36Sopenharmony_ci		 */
297262306a36Sopenharmony_ci		smp_wmb();
297362306a36Sopenharmony_ci		set_page_stable_node(&folio->page, NULL);
297462306a36Sopenharmony_ci	}
297562306a36Sopenharmony_ci}
297662306a36Sopenharmony_ci#endif /* CONFIG_MIGRATION */
297762306a36Sopenharmony_ci
297862306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTREMOVE
297962306a36Sopenharmony_cistatic void wait_while_offlining(void)
298062306a36Sopenharmony_ci{
298162306a36Sopenharmony_ci	while (ksm_run & KSM_RUN_OFFLINE) {
298262306a36Sopenharmony_ci		mutex_unlock(&ksm_thread_mutex);
298362306a36Sopenharmony_ci		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
298462306a36Sopenharmony_ci			    TASK_UNINTERRUPTIBLE);
298562306a36Sopenharmony_ci		mutex_lock(&ksm_thread_mutex);
298662306a36Sopenharmony_ci	}
298762306a36Sopenharmony_ci}
298862306a36Sopenharmony_ci
298962306a36Sopenharmony_cistatic bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
299062306a36Sopenharmony_ci					 unsigned long start_pfn,
299162306a36Sopenharmony_ci					 unsigned long end_pfn)
299262306a36Sopenharmony_ci{
299362306a36Sopenharmony_ci	if (stable_node->kpfn >= start_pfn &&
299462306a36Sopenharmony_ci	    stable_node->kpfn < end_pfn) {
299562306a36Sopenharmony_ci		/*
299662306a36Sopenharmony_ci		 * Don't get_ksm_page, page has already gone:
299762306a36Sopenharmony_ci		 * which is why we keep kpfn instead of page*
299862306a36Sopenharmony_ci		 */
299962306a36Sopenharmony_ci		remove_node_from_stable_tree(stable_node);
300062306a36Sopenharmony_ci		return true;
300162306a36Sopenharmony_ci	}
300262306a36Sopenharmony_ci	return false;
300362306a36Sopenharmony_ci}
300462306a36Sopenharmony_ci
300562306a36Sopenharmony_cistatic bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
300662306a36Sopenharmony_ci					   unsigned long start_pfn,
300762306a36Sopenharmony_ci					   unsigned long end_pfn,
300862306a36Sopenharmony_ci					   struct rb_root *root)
300962306a36Sopenharmony_ci{
301062306a36Sopenharmony_ci	struct ksm_stable_node *dup;
301162306a36Sopenharmony_ci	struct hlist_node *hlist_safe;
301262306a36Sopenharmony_ci
301362306a36Sopenharmony_ci	if (!is_stable_node_chain(stable_node)) {
301462306a36Sopenharmony_ci		VM_BUG_ON(is_stable_node_dup(stable_node));
301562306a36Sopenharmony_ci		return stable_node_dup_remove_range(stable_node, start_pfn,
301662306a36Sopenharmony_ci						    end_pfn);
301762306a36Sopenharmony_ci	}
301862306a36Sopenharmony_ci
301962306a36Sopenharmony_ci	hlist_for_each_entry_safe(dup, hlist_safe,
302062306a36Sopenharmony_ci				  &stable_node->hlist, hlist_dup) {
302162306a36Sopenharmony_ci		VM_BUG_ON(!is_stable_node_dup(dup));
302262306a36Sopenharmony_ci		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
302362306a36Sopenharmony_ci	}
302462306a36Sopenharmony_ci	if (hlist_empty(&stable_node->hlist)) {
302562306a36Sopenharmony_ci		free_stable_node_chain(stable_node, root);
302662306a36Sopenharmony_ci		return true; /* notify caller that tree was rebalanced */
302762306a36Sopenharmony_ci	} else
302862306a36Sopenharmony_ci		return false;
302962306a36Sopenharmony_ci}
303062306a36Sopenharmony_ci
303162306a36Sopenharmony_cistatic void ksm_check_stable_tree(unsigned long start_pfn,
303262306a36Sopenharmony_ci				  unsigned long end_pfn)
303362306a36Sopenharmony_ci{
303462306a36Sopenharmony_ci	struct ksm_stable_node *stable_node, *next;
303562306a36Sopenharmony_ci	struct rb_node *node;
303662306a36Sopenharmony_ci	int nid;
303762306a36Sopenharmony_ci
303862306a36Sopenharmony_ci	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
303962306a36Sopenharmony_ci		node = rb_first(root_stable_tree + nid);
304062306a36Sopenharmony_ci		while (node) {
304162306a36Sopenharmony_ci			stable_node = rb_entry(node, struct ksm_stable_node, node);
304262306a36Sopenharmony_ci			if (stable_node_chain_remove_range(stable_node,
304362306a36Sopenharmony_ci							   start_pfn, end_pfn,
304462306a36Sopenharmony_ci							   root_stable_tree +
304562306a36Sopenharmony_ci							   nid))
304662306a36Sopenharmony_ci				node = rb_first(root_stable_tree + nid);
304762306a36Sopenharmony_ci			else
304862306a36Sopenharmony_ci				node = rb_next(node);
304962306a36Sopenharmony_ci			cond_resched();
305062306a36Sopenharmony_ci		}
305162306a36Sopenharmony_ci	}
305262306a36Sopenharmony_ci	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
305362306a36Sopenharmony_ci		if (stable_node->kpfn >= start_pfn &&
305462306a36Sopenharmony_ci		    stable_node->kpfn < end_pfn)
305562306a36Sopenharmony_ci			remove_node_from_stable_tree(stable_node);
305662306a36Sopenharmony_ci		cond_resched();
305762306a36Sopenharmony_ci	}
305862306a36Sopenharmony_ci}
305962306a36Sopenharmony_ci
306062306a36Sopenharmony_cistatic int ksm_memory_callback(struct notifier_block *self,
306162306a36Sopenharmony_ci			       unsigned long action, void *arg)
306262306a36Sopenharmony_ci{
306362306a36Sopenharmony_ci	struct memory_notify *mn = arg;
306462306a36Sopenharmony_ci
306562306a36Sopenharmony_ci	switch (action) {
306662306a36Sopenharmony_ci	case MEM_GOING_OFFLINE:
306762306a36Sopenharmony_ci		/*
306862306a36Sopenharmony_ci		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
306962306a36Sopenharmony_ci		 * and remove_all_stable_nodes() while memory is going offline:
307062306a36Sopenharmony_ci		 * it is unsafe for them to touch the stable tree at this time.
307162306a36Sopenharmony_ci		 * But unmerge_ksm_pages(), rmap lookups and other entry points
307262306a36Sopenharmony_ci		 * which do not need the ksm_thread_mutex are all safe.
307362306a36Sopenharmony_ci		 */
307462306a36Sopenharmony_ci		mutex_lock(&ksm_thread_mutex);
307562306a36Sopenharmony_ci		ksm_run |= KSM_RUN_OFFLINE;
307662306a36Sopenharmony_ci		mutex_unlock(&ksm_thread_mutex);
307762306a36Sopenharmony_ci		break;
307862306a36Sopenharmony_ci
307962306a36Sopenharmony_ci	case MEM_OFFLINE:
308062306a36Sopenharmony_ci		/*
308162306a36Sopenharmony_ci		 * Most of the work is done by page migration; but there might
308262306a36Sopenharmony_ci		 * be a few stable_nodes left over, still pointing to struct
308362306a36Sopenharmony_ci		 * pages which have been offlined: prune those from the tree,
308462306a36Sopenharmony_ci		 * otherwise get_ksm_page() might later try to access a
308562306a36Sopenharmony_ci		 * non-existent struct page.
308662306a36Sopenharmony_ci		 */
308762306a36Sopenharmony_ci		ksm_check_stable_tree(mn->start_pfn,
308862306a36Sopenharmony_ci				      mn->start_pfn + mn->nr_pages);
308962306a36Sopenharmony_ci		fallthrough;
309062306a36Sopenharmony_ci	case MEM_CANCEL_OFFLINE:
309162306a36Sopenharmony_ci		mutex_lock(&ksm_thread_mutex);
309262306a36Sopenharmony_ci		ksm_run &= ~KSM_RUN_OFFLINE;
309362306a36Sopenharmony_ci		mutex_unlock(&ksm_thread_mutex);
309462306a36Sopenharmony_ci
309562306a36Sopenharmony_ci		smp_mb();	/* wake_up_bit advises this */
309662306a36Sopenharmony_ci		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
309762306a36Sopenharmony_ci		break;
309862306a36Sopenharmony_ci	}
309962306a36Sopenharmony_ci	return NOTIFY_OK;
310062306a36Sopenharmony_ci}
310162306a36Sopenharmony_ci#else
310262306a36Sopenharmony_cistatic void wait_while_offlining(void)
310362306a36Sopenharmony_ci{
310462306a36Sopenharmony_ci}
310562306a36Sopenharmony_ci#endif /* CONFIG_MEMORY_HOTREMOVE */
310662306a36Sopenharmony_ci
310762306a36Sopenharmony_ci#ifdef CONFIG_PROC_FS
310862306a36Sopenharmony_cilong ksm_process_profit(struct mm_struct *mm)
310962306a36Sopenharmony_ci{
311062306a36Sopenharmony_ci	return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
311162306a36Sopenharmony_ci		mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
311262306a36Sopenharmony_ci}
311362306a36Sopenharmony_ci#endif /* CONFIG_PROC_FS */
311462306a36Sopenharmony_ci
311562306a36Sopenharmony_ci#ifdef CONFIG_SYSFS
311662306a36Sopenharmony_ci/*
311762306a36Sopenharmony_ci * This all compiles without CONFIG_SYSFS, but is a waste of space.
311862306a36Sopenharmony_ci */
311962306a36Sopenharmony_ci
312062306a36Sopenharmony_ci#define KSM_ATTR_RO(_name) \
312162306a36Sopenharmony_ci	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
312262306a36Sopenharmony_ci#define KSM_ATTR(_name) \
312362306a36Sopenharmony_ci	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
312462306a36Sopenharmony_ci
312562306a36Sopenharmony_cistatic ssize_t sleep_millisecs_show(struct kobject *kobj,
312662306a36Sopenharmony_ci				    struct kobj_attribute *attr, char *buf)
312762306a36Sopenharmony_ci{
312862306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
312962306a36Sopenharmony_ci}
313062306a36Sopenharmony_ci
313162306a36Sopenharmony_cistatic ssize_t sleep_millisecs_store(struct kobject *kobj,
313262306a36Sopenharmony_ci				     struct kobj_attribute *attr,
313362306a36Sopenharmony_ci				     const char *buf, size_t count)
313462306a36Sopenharmony_ci{
313562306a36Sopenharmony_ci	unsigned int msecs;
313662306a36Sopenharmony_ci	int err;
313762306a36Sopenharmony_ci
313862306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &msecs);
313962306a36Sopenharmony_ci	if (err)
314062306a36Sopenharmony_ci		return -EINVAL;
314162306a36Sopenharmony_ci
314262306a36Sopenharmony_ci	ksm_thread_sleep_millisecs = msecs;
314362306a36Sopenharmony_ci	wake_up_interruptible(&ksm_iter_wait);
314462306a36Sopenharmony_ci
314562306a36Sopenharmony_ci	return count;
314662306a36Sopenharmony_ci}
314762306a36Sopenharmony_ciKSM_ATTR(sleep_millisecs);
314862306a36Sopenharmony_ci
314962306a36Sopenharmony_cistatic ssize_t pages_to_scan_show(struct kobject *kobj,
315062306a36Sopenharmony_ci				  struct kobj_attribute *attr, char *buf)
315162306a36Sopenharmony_ci{
315262306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
315362306a36Sopenharmony_ci}
315462306a36Sopenharmony_ci
315562306a36Sopenharmony_cistatic ssize_t pages_to_scan_store(struct kobject *kobj,
315662306a36Sopenharmony_ci				   struct kobj_attribute *attr,
315762306a36Sopenharmony_ci				   const char *buf, size_t count)
315862306a36Sopenharmony_ci{
315962306a36Sopenharmony_ci	unsigned int nr_pages;
316062306a36Sopenharmony_ci	int err;
316162306a36Sopenharmony_ci
316262306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &nr_pages);
316362306a36Sopenharmony_ci	if (err)
316462306a36Sopenharmony_ci		return -EINVAL;
316562306a36Sopenharmony_ci
316662306a36Sopenharmony_ci	ksm_thread_pages_to_scan = nr_pages;
316762306a36Sopenharmony_ci
316862306a36Sopenharmony_ci	return count;
316962306a36Sopenharmony_ci}
317062306a36Sopenharmony_ciKSM_ATTR(pages_to_scan);
317162306a36Sopenharmony_ci
317262306a36Sopenharmony_cistatic ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
317362306a36Sopenharmony_ci			char *buf)
317462306a36Sopenharmony_ci{
317562306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_run);
317662306a36Sopenharmony_ci}
317762306a36Sopenharmony_ci
317862306a36Sopenharmony_cistatic ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
317962306a36Sopenharmony_ci			 const char *buf, size_t count)
318062306a36Sopenharmony_ci{
318162306a36Sopenharmony_ci	unsigned int flags;
318262306a36Sopenharmony_ci	int err;
318362306a36Sopenharmony_ci
318462306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &flags);
318562306a36Sopenharmony_ci	if (err)
318662306a36Sopenharmony_ci		return -EINVAL;
318762306a36Sopenharmony_ci	if (flags > KSM_RUN_UNMERGE)
318862306a36Sopenharmony_ci		return -EINVAL;
318962306a36Sopenharmony_ci
319062306a36Sopenharmony_ci	/*
319162306a36Sopenharmony_ci	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
319262306a36Sopenharmony_ci	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
319362306a36Sopenharmony_ci	 * breaking COW to free the pages_shared (but leaves mm_slots
319462306a36Sopenharmony_ci	 * on the list for when ksmd may be set running again).
319562306a36Sopenharmony_ci	 */
319662306a36Sopenharmony_ci
319762306a36Sopenharmony_ci	mutex_lock(&ksm_thread_mutex);
319862306a36Sopenharmony_ci	wait_while_offlining();
319962306a36Sopenharmony_ci	if (ksm_run != flags) {
320062306a36Sopenharmony_ci		ksm_run = flags;
320162306a36Sopenharmony_ci		if (flags & KSM_RUN_UNMERGE) {
320262306a36Sopenharmony_ci			set_current_oom_origin();
320362306a36Sopenharmony_ci			err = unmerge_and_remove_all_rmap_items();
320462306a36Sopenharmony_ci			clear_current_oom_origin();
320562306a36Sopenharmony_ci			if (err) {
320662306a36Sopenharmony_ci				ksm_run = KSM_RUN_STOP;
320762306a36Sopenharmony_ci				count = err;
320862306a36Sopenharmony_ci			}
320962306a36Sopenharmony_ci		}
321062306a36Sopenharmony_ci	}
321162306a36Sopenharmony_ci	mutex_unlock(&ksm_thread_mutex);
321262306a36Sopenharmony_ci
321362306a36Sopenharmony_ci	if (flags & KSM_RUN_MERGE)
321462306a36Sopenharmony_ci		wake_up_interruptible(&ksm_thread_wait);
321562306a36Sopenharmony_ci
321662306a36Sopenharmony_ci	return count;
321762306a36Sopenharmony_ci}
321862306a36Sopenharmony_ciKSM_ATTR(run);
321962306a36Sopenharmony_ci
322062306a36Sopenharmony_ci#ifdef CONFIG_NUMA
322162306a36Sopenharmony_cistatic ssize_t merge_across_nodes_show(struct kobject *kobj,
322262306a36Sopenharmony_ci				       struct kobj_attribute *attr, char *buf)
322362306a36Sopenharmony_ci{
322462306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
322562306a36Sopenharmony_ci}
322662306a36Sopenharmony_ci
322762306a36Sopenharmony_cistatic ssize_t merge_across_nodes_store(struct kobject *kobj,
322862306a36Sopenharmony_ci				   struct kobj_attribute *attr,
322962306a36Sopenharmony_ci				   const char *buf, size_t count)
323062306a36Sopenharmony_ci{
323162306a36Sopenharmony_ci	int err;
323262306a36Sopenharmony_ci	unsigned long knob;
323362306a36Sopenharmony_ci
323462306a36Sopenharmony_ci	err = kstrtoul(buf, 10, &knob);
323562306a36Sopenharmony_ci	if (err)
323662306a36Sopenharmony_ci		return err;
323762306a36Sopenharmony_ci	if (knob > 1)
323862306a36Sopenharmony_ci		return -EINVAL;
323962306a36Sopenharmony_ci
324062306a36Sopenharmony_ci	mutex_lock(&ksm_thread_mutex);
324162306a36Sopenharmony_ci	wait_while_offlining();
324262306a36Sopenharmony_ci	if (ksm_merge_across_nodes != knob) {
324362306a36Sopenharmony_ci		if (ksm_pages_shared || remove_all_stable_nodes())
324462306a36Sopenharmony_ci			err = -EBUSY;
324562306a36Sopenharmony_ci		else if (root_stable_tree == one_stable_tree) {
324662306a36Sopenharmony_ci			struct rb_root *buf;
324762306a36Sopenharmony_ci			/*
324862306a36Sopenharmony_ci			 * This is the first time that we switch away from the
324962306a36Sopenharmony_ci			 * default of merging across nodes: must now allocate
325062306a36Sopenharmony_ci			 * a buffer to hold as many roots as may be needed.
325162306a36Sopenharmony_ci			 * Allocate stable and unstable together:
325262306a36Sopenharmony_ci			 * MAXSMP NODES_SHIFT 10 will use 16kB.
325362306a36Sopenharmony_ci			 */
325462306a36Sopenharmony_ci			buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
325562306a36Sopenharmony_ci				      GFP_KERNEL);
325662306a36Sopenharmony_ci			/* Let us assume that RB_ROOT is NULL is zero */
325762306a36Sopenharmony_ci			if (!buf)
325862306a36Sopenharmony_ci				err = -ENOMEM;
325962306a36Sopenharmony_ci			else {
326062306a36Sopenharmony_ci				root_stable_tree = buf;
326162306a36Sopenharmony_ci				root_unstable_tree = buf + nr_node_ids;
326262306a36Sopenharmony_ci				/* Stable tree is empty but not the unstable */
326362306a36Sopenharmony_ci				root_unstable_tree[0] = one_unstable_tree[0];
326462306a36Sopenharmony_ci			}
326562306a36Sopenharmony_ci		}
326662306a36Sopenharmony_ci		if (!err) {
326762306a36Sopenharmony_ci			ksm_merge_across_nodes = knob;
326862306a36Sopenharmony_ci			ksm_nr_node_ids = knob ? 1 : nr_node_ids;
326962306a36Sopenharmony_ci		}
327062306a36Sopenharmony_ci	}
327162306a36Sopenharmony_ci	mutex_unlock(&ksm_thread_mutex);
327262306a36Sopenharmony_ci
327362306a36Sopenharmony_ci	return err ? err : count;
327462306a36Sopenharmony_ci}
327562306a36Sopenharmony_ciKSM_ATTR(merge_across_nodes);
327662306a36Sopenharmony_ci#endif
327762306a36Sopenharmony_ci
327862306a36Sopenharmony_cistatic ssize_t use_zero_pages_show(struct kobject *kobj,
327962306a36Sopenharmony_ci				   struct kobj_attribute *attr, char *buf)
328062306a36Sopenharmony_ci{
328162306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
328262306a36Sopenharmony_ci}
328362306a36Sopenharmony_cistatic ssize_t use_zero_pages_store(struct kobject *kobj,
328462306a36Sopenharmony_ci				   struct kobj_attribute *attr,
328562306a36Sopenharmony_ci				   const char *buf, size_t count)
328662306a36Sopenharmony_ci{
328762306a36Sopenharmony_ci	int err;
328862306a36Sopenharmony_ci	bool value;
328962306a36Sopenharmony_ci
329062306a36Sopenharmony_ci	err = kstrtobool(buf, &value);
329162306a36Sopenharmony_ci	if (err)
329262306a36Sopenharmony_ci		return -EINVAL;
329362306a36Sopenharmony_ci
329462306a36Sopenharmony_ci	ksm_use_zero_pages = value;
329562306a36Sopenharmony_ci
329662306a36Sopenharmony_ci	return count;
329762306a36Sopenharmony_ci}
329862306a36Sopenharmony_ciKSM_ATTR(use_zero_pages);
329962306a36Sopenharmony_ci
330062306a36Sopenharmony_cistatic ssize_t max_page_sharing_show(struct kobject *kobj,
330162306a36Sopenharmony_ci				     struct kobj_attribute *attr, char *buf)
330262306a36Sopenharmony_ci{
330362306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
330462306a36Sopenharmony_ci}
330562306a36Sopenharmony_ci
330662306a36Sopenharmony_cistatic ssize_t max_page_sharing_store(struct kobject *kobj,
330762306a36Sopenharmony_ci				      struct kobj_attribute *attr,
330862306a36Sopenharmony_ci				      const char *buf, size_t count)
330962306a36Sopenharmony_ci{
331062306a36Sopenharmony_ci	int err;
331162306a36Sopenharmony_ci	int knob;
331262306a36Sopenharmony_ci
331362306a36Sopenharmony_ci	err = kstrtoint(buf, 10, &knob);
331462306a36Sopenharmony_ci	if (err)
331562306a36Sopenharmony_ci		return err;
331662306a36Sopenharmony_ci	/*
331762306a36Sopenharmony_ci	 * When a KSM page is created it is shared by 2 mappings. This
331862306a36Sopenharmony_ci	 * being a signed comparison, it implicitly verifies it's not
331962306a36Sopenharmony_ci	 * negative.
332062306a36Sopenharmony_ci	 */
332162306a36Sopenharmony_ci	if (knob < 2)
332262306a36Sopenharmony_ci		return -EINVAL;
332362306a36Sopenharmony_ci
332462306a36Sopenharmony_ci	if (READ_ONCE(ksm_max_page_sharing) == knob)
332562306a36Sopenharmony_ci		return count;
332662306a36Sopenharmony_ci
332762306a36Sopenharmony_ci	mutex_lock(&ksm_thread_mutex);
332862306a36Sopenharmony_ci	wait_while_offlining();
332962306a36Sopenharmony_ci	if (ksm_max_page_sharing != knob) {
333062306a36Sopenharmony_ci		if (ksm_pages_shared || remove_all_stable_nodes())
333162306a36Sopenharmony_ci			err = -EBUSY;
333262306a36Sopenharmony_ci		else
333362306a36Sopenharmony_ci			ksm_max_page_sharing = knob;
333462306a36Sopenharmony_ci	}
333562306a36Sopenharmony_ci	mutex_unlock(&ksm_thread_mutex);
333662306a36Sopenharmony_ci
333762306a36Sopenharmony_ci	return err ? err : count;
333862306a36Sopenharmony_ci}
333962306a36Sopenharmony_ciKSM_ATTR(max_page_sharing);
334062306a36Sopenharmony_ci
334162306a36Sopenharmony_cistatic ssize_t pages_scanned_show(struct kobject *kobj,
334262306a36Sopenharmony_ci				  struct kobj_attribute *attr, char *buf)
334362306a36Sopenharmony_ci{
334462306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
334562306a36Sopenharmony_ci}
334662306a36Sopenharmony_ciKSM_ATTR_RO(pages_scanned);
334762306a36Sopenharmony_ci
334862306a36Sopenharmony_cistatic ssize_t pages_shared_show(struct kobject *kobj,
334962306a36Sopenharmony_ci				 struct kobj_attribute *attr, char *buf)
335062306a36Sopenharmony_ci{
335162306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
335262306a36Sopenharmony_ci}
335362306a36Sopenharmony_ciKSM_ATTR_RO(pages_shared);
335462306a36Sopenharmony_ci
335562306a36Sopenharmony_cistatic ssize_t pages_sharing_show(struct kobject *kobj,
335662306a36Sopenharmony_ci				  struct kobj_attribute *attr, char *buf)
335762306a36Sopenharmony_ci{
335862306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
335962306a36Sopenharmony_ci}
336062306a36Sopenharmony_ciKSM_ATTR_RO(pages_sharing);
336162306a36Sopenharmony_ci
336262306a36Sopenharmony_cistatic ssize_t pages_unshared_show(struct kobject *kobj,
336362306a36Sopenharmony_ci				   struct kobj_attribute *attr, char *buf)
336462306a36Sopenharmony_ci{
336562306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
336662306a36Sopenharmony_ci}
336762306a36Sopenharmony_ciKSM_ATTR_RO(pages_unshared);
336862306a36Sopenharmony_ci
336962306a36Sopenharmony_cistatic ssize_t pages_volatile_show(struct kobject *kobj,
337062306a36Sopenharmony_ci				   struct kobj_attribute *attr, char *buf)
337162306a36Sopenharmony_ci{
337262306a36Sopenharmony_ci	long ksm_pages_volatile;
337362306a36Sopenharmony_ci
337462306a36Sopenharmony_ci	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
337562306a36Sopenharmony_ci				- ksm_pages_sharing - ksm_pages_unshared;
337662306a36Sopenharmony_ci	/*
337762306a36Sopenharmony_ci	 * It was not worth any locking to calculate that statistic,
337862306a36Sopenharmony_ci	 * but it might therefore sometimes be negative: conceal that.
337962306a36Sopenharmony_ci	 */
338062306a36Sopenharmony_ci	if (ksm_pages_volatile < 0)
338162306a36Sopenharmony_ci		ksm_pages_volatile = 0;
338262306a36Sopenharmony_ci	return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
338362306a36Sopenharmony_ci}
338462306a36Sopenharmony_ciKSM_ATTR_RO(pages_volatile);
338562306a36Sopenharmony_ci
338662306a36Sopenharmony_cistatic ssize_t ksm_zero_pages_show(struct kobject *kobj,
338762306a36Sopenharmony_ci				struct kobj_attribute *attr, char *buf)
338862306a36Sopenharmony_ci{
338962306a36Sopenharmony_ci	return sysfs_emit(buf, "%ld\n", ksm_zero_pages);
339062306a36Sopenharmony_ci}
339162306a36Sopenharmony_ciKSM_ATTR_RO(ksm_zero_pages);
339262306a36Sopenharmony_ci
339362306a36Sopenharmony_cistatic ssize_t general_profit_show(struct kobject *kobj,
339462306a36Sopenharmony_ci				   struct kobj_attribute *attr, char *buf)
339562306a36Sopenharmony_ci{
339662306a36Sopenharmony_ci	long general_profit;
339762306a36Sopenharmony_ci
339862306a36Sopenharmony_ci	general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE -
339962306a36Sopenharmony_ci				ksm_rmap_items * sizeof(struct ksm_rmap_item);
340062306a36Sopenharmony_ci
340162306a36Sopenharmony_ci	return sysfs_emit(buf, "%ld\n", general_profit);
340262306a36Sopenharmony_ci}
340362306a36Sopenharmony_ciKSM_ATTR_RO(general_profit);
340462306a36Sopenharmony_ci
340562306a36Sopenharmony_cistatic ssize_t stable_node_dups_show(struct kobject *kobj,
340662306a36Sopenharmony_ci				     struct kobj_attribute *attr, char *buf)
340762306a36Sopenharmony_ci{
340862306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
340962306a36Sopenharmony_ci}
341062306a36Sopenharmony_ciKSM_ATTR_RO(stable_node_dups);
341162306a36Sopenharmony_ci
341262306a36Sopenharmony_cistatic ssize_t stable_node_chains_show(struct kobject *kobj,
341362306a36Sopenharmony_ci				       struct kobj_attribute *attr, char *buf)
341462306a36Sopenharmony_ci{
341562306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
341662306a36Sopenharmony_ci}
341762306a36Sopenharmony_ciKSM_ATTR_RO(stable_node_chains);
341862306a36Sopenharmony_ci
341962306a36Sopenharmony_cistatic ssize_t
342062306a36Sopenharmony_cistable_node_chains_prune_millisecs_show(struct kobject *kobj,
342162306a36Sopenharmony_ci					struct kobj_attribute *attr,
342262306a36Sopenharmony_ci					char *buf)
342362306a36Sopenharmony_ci{
342462306a36Sopenharmony_ci	return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
342562306a36Sopenharmony_ci}
342662306a36Sopenharmony_ci
342762306a36Sopenharmony_cistatic ssize_t
342862306a36Sopenharmony_cistable_node_chains_prune_millisecs_store(struct kobject *kobj,
342962306a36Sopenharmony_ci					 struct kobj_attribute *attr,
343062306a36Sopenharmony_ci					 const char *buf, size_t count)
343162306a36Sopenharmony_ci{
343262306a36Sopenharmony_ci	unsigned int msecs;
343362306a36Sopenharmony_ci	int err;
343462306a36Sopenharmony_ci
343562306a36Sopenharmony_ci	err = kstrtouint(buf, 10, &msecs);
343662306a36Sopenharmony_ci	if (err)
343762306a36Sopenharmony_ci		return -EINVAL;
343862306a36Sopenharmony_ci
343962306a36Sopenharmony_ci	ksm_stable_node_chains_prune_millisecs = msecs;
344062306a36Sopenharmony_ci
344162306a36Sopenharmony_ci	return count;
344262306a36Sopenharmony_ci}
344362306a36Sopenharmony_ciKSM_ATTR(stable_node_chains_prune_millisecs);
344462306a36Sopenharmony_ci
344562306a36Sopenharmony_cistatic ssize_t full_scans_show(struct kobject *kobj,
344662306a36Sopenharmony_ci			       struct kobj_attribute *attr, char *buf)
344762306a36Sopenharmony_ci{
344862306a36Sopenharmony_ci	return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
344962306a36Sopenharmony_ci}
345062306a36Sopenharmony_ciKSM_ATTR_RO(full_scans);
345162306a36Sopenharmony_ci
345262306a36Sopenharmony_cistatic struct attribute *ksm_attrs[] = {
345362306a36Sopenharmony_ci	&sleep_millisecs_attr.attr,
345462306a36Sopenharmony_ci	&pages_to_scan_attr.attr,
345562306a36Sopenharmony_ci	&run_attr.attr,
345662306a36Sopenharmony_ci	&pages_scanned_attr.attr,
345762306a36Sopenharmony_ci	&pages_shared_attr.attr,
345862306a36Sopenharmony_ci	&pages_sharing_attr.attr,
345962306a36Sopenharmony_ci	&pages_unshared_attr.attr,
346062306a36Sopenharmony_ci	&pages_volatile_attr.attr,
346162306a36Sopenharmony_ci	&ksm_zero_pages_attr.attr,
346262306a36Sopenharmony_ci	&full_scans_attr.attr,
346362306a36Sopenharmony_ci#ifdef CONFIG_NUMA
346462306a36Sopenharmony_ci	&merge_across_nodes_attr.attr,
346562306a36Sopenharmony_ci#endif
346662306a36Sopenharmony_ci	&max_page_sharing_attr.attr,
346762306a36Sopenharmony_ci	&stable_node_chains_attr.attr,
346862306a36Sopenharmony_ci	&stable_node_dups_attr.attr,
346962306a36Sopenharmony_ci	&stable_node_chains_prune_millisecs_attr.attr,
347062306a36Sopenharmony_ci	&use_zero_pages_attr.attr,
347162306a36Sopenharmony_ci	&general_profit_attr.attr,
347262306a36Sopenharmony_ci	NULL,
347362306a36Sopenharmony_ci};
347462306a36Sopenharmony_ci
347562306a36Sopenharmony_cistatic const struct attribute_group ksm_attr_group = {
347662306a36Sopenharmony_ci	.attrs = ksm_attrs,
347762306a36Sopenharmony_ci	.name = "ksm",
347862306a36Sopenharmony_ci};
347962306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */
348062306a36Sopenharmony_ci
348162306a36Sopenharmony_cistatic int __init ksm_init(void)
348262306a36Sopenharmony_ci{
348362306a36Sopenharmony_ci	struct task_struct *ksm_thread;
348462306a36Sopenharmony_ci	int err;
348562306a36Sopenharmony_ci
348662306a36Sopenharmony_ci	/* The correct value depends on page size and endianness */
348762306a36Sopenharmony_ci	zero_checksum = calc_checksum(ZERO_PAGE(0));
348862306a36Sopenharmony_ci	/* Default to false for backwards compatibility */
348962306a36Sopenharmony_ci	ksm_use_zero_pages = false;
349062306a36Sopenharmony_ci
349162306a36Sopenharmony_ci	err = ksm_slab_init();
349262306a36Sopenharmony_ci	if (err)
349362306a36Sopenharmony_ci		goto out;
349462306a36Sopenharmony_ci
349562306a36Sopenharmony_ci	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
349662306a36Sopenharmony_ci	if (IS_ERR(ksm_thread)) {
349762306a36Sopenharmony_ci		pr_err("ksm: creating kthread failed\n");
349862306a36Sopenharmony_ci		err = PTR_ERR(ksm_thread);
349962306a36Sopenharmony_ci		goto out_free;
350062306a36Sopenharmony_ci	}
350162306a36Sopenharmony_ci
350262306a36Sopenharmony_ci#ifdef CONFIG_SYSFS
350362306a36Sopenharmony_ci	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
350462306a36Sopenharmony_ci	if (err) {
350562306a36Sopenharmony_ci		pr_err("ksm: register sysfs failed\n");
350662306a36Sopenharmony_ci		kthread_stop(ksm_thread);
350762306a36Sopenharmony_ci		goto out_free;
350862306a36Sopenharmony_ci	}
350962306a36Sopenharmony_ci#else
351062306a36Sopenharmony_ci	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
351162306a36Sopenharmony_ci
351262306a36Sopenharmony_ci#endif /* CONFIG_SYSFS */
351362306a36Sopenharmony_ci
351462306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTREMOVE
351562306a36Sopenharmony_ci	/* There is no significance to this priority 100 */
351662306a36Sopenharmony_ci	hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
351762306a36Sopenharmony_ci#endif
351862306a36Sopenharmony_ci	return 0;
351962306a36Sopenharmony_ci
352062306a36Sopenharmony_ciout_free:
352162306a36Sopenharmony_ci	ksm_slab_free();
352262306a36Sopenharmony_ciout:
352362306a36Sopenharmony_ci	return err;
352462306a36Sopenharmony_ci}
352562306a36Sopenharmony_cisubsys_initcall(ksm_init);
3526