162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Memory merging support. 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * This code enables dynamic sharing of identical pages found in different 662306a36Sopenharmony_ci * memory areas, even if they are not shared by fork() 762306a36Sopenharmony_ci * 862306a36Sopenharmony_ci * Copyright (C) 2008-2009 Red Hat, Inc. 962306a36Sopenharmony_ci * Authors: 1062306a36Sopenharmony_ci * Izik Eidus 1162306a36Sopenharmony_ci * Andrea Arcangeli 1262306a36Sopenharmony_ci * Chris Wright 1362306a36Sopenharmony_ci * Hugh Dickins 1462306a36Sopenharmony_ci */ 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci#include <linux/errno.h> 1762306a36Sopenharmony_ci#include <linux/mm.h> 1862306a36Sopenharmony_ci#include <linux/mm_inline.h> 1962306a36Sopenharmony_ci#include <linux/fs.h> 2062306a36Sopenharmony_ci#include <linux/mman.h> 2162306a36Sopenharmony_ci#include <linux/sched.h> 2262306a36Sopenharmony_ci#include <linux/sched/mm.h> 2362306a36Sopenharmony_ci#include <linux/sched/coredump.h> 2462306a36Sopenharmony_ci#include <linux/rwsem.h> 2562306a36Sopenharmony_ci#include <linux/pagemap.h> 2662306a36Sopenharmony_ci#include <linux/rmap.h> 2762306a36Sopenharmony_ci#include <linux/spinlock.h> 2862306a36Sopenharmony_ci#include <linux/xxhash.h> 2962306a36Sopenharmony_ci#include <linux/delay.h> 3062306a36Sopenharmony_ci#include <linux/kthread.h> 3162306a36Sopenharmony_ci#include <linux/wait.h> 3262306a36Sopenharmony_ci#include <linux/slab.h> 3362306a36Sopenharmony_ci#include <linux/rbtree.h> 3462306a36Sopenharmony_ci#include <linux/memory.h> 3562306a36Sopenharmony_ci#include <linux/mmu_notifier.h> 3662306a36Sopenharmony_ci#include <linux/swap.h> 3762306a36Sopenharmony_ci#include <linux/ksm.h> 3862306a36Sopenharmony_ci#include <linux/hashtable.h> 3962306a36Sopenharmony_ci#include <linux/freezer.h> 4062306a36Sopenharmony_ci#include <linux/oom.h> 
4162306a36Sopenharmony_ci#include <linux/numa.h> 4262306a36Sopenharmony_ci#include <linux/pagewalk.h> 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci#include <asm/tlbflush.h> 4562306a36Sopenharmony_ci#include "internal.h" 4662306a36Sopenharmony_ci#include "mm_slot.h" 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci#define CREATE_TRACE_POINTS 4962306a36Sopenharmony_ci#include <trace/events/ksm.h> 5062306a36Sopenharmony_ci 5162306a36Sopenharmony_ci#ifdef CONFIG_NUMA 5262306a36Sopenharmony_ci#define NUMA(x) (x) 5362306a36Sopenharmony_ci#define DO_NUMA(x) do { (x); } while (0) 5462306a36Sopenharmony_ci#else 5562306a36Sopenharmony_ci#define NUMA(x) (0) 5662306a36Sopenharmony_ci#define DO_NUMA(x) do { } while (0) 5762306a36Sopenharmony_ci#endif 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_ci/** 6062306a36Sopenharmony_ci * DOC: Overview 6162306a36Sopenharmony_ci * 6262306a36Sopenharmony_ci * A few notes about the KSM scanning process, 6362306a36Sopenharmony_ci * to make it easier to understand the data structures below: 6462306a36Sopenharmony_ci * 6562306a36Sopenharmony_ci * In order to reduce excessive scanning, KSM sorts the memory pages by their 6662306a36Sopenharmony_ci * contents into a data structure that holds pointers to the pages' locations. 6762306a36Sopenharmony_ci * 6862306a36Sopenharmony_ci * Since the contents of the pages may change at any moment, KSM cannot just 6962306a36Sopenharmony_ci * insert the pages into a normal sorted tree and expect it to find anything. 7062306a36Sopenharmony_ci * Therefore KSM uses two data structures - the stable and the unstable tree. 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * The stable tree holds pointers to all the merged pages (ksm pages), sorted 7362306a36Sopenharmony_ci * by their contents. 
Because each such page is write-protected, searching on 7462306a36Sopenharmony_ci * this tree is fully assured to be working (except when pages are unmapped), 7562306a36Sopenharmony_ci * and therefore this tree is called the stable tree. 7662306a36Sopenharmony_ci * 7762306a36Sopenharmony_ci * The stable tree node includes information required for reverse 7862306a36Sopenharmony_ci * mapping from a KSM page to virtual addresses that map this page. 7962306a36Sopenharmony_ci * 8062306a36Sopenharmony_ci * In order to avoid large latencies of the rmap walks on KSM pages, 8162306a36Sopenharmony_ci * KSM maintains two types of nodes in the stable tree: 8262306a36Sopenharmony_ci * 8362306a36Sopenharmony_ci * * the regular nodes that keep the reverse mapping structures in a 8462306a36Sopenharmony_ci * linked list 8562306a36Sopenharmony_ci * * the "chains" that link nodes ("dups") that represent the same 8662306a36Sopenharmony_ci * write protected memory content, but each "dup" corresponds to a 8762306a36Sopenharmony_ci * different KSM page copy of that content 8862306a36Sopenharmony_ci * 8962306a36Sopenharmony_ci * Internally, the regular nodes, "dups" and "chains" are represented 9062306a36Sopenharmony_ci * using the same struct ksm_stable_node structure. 9162306a36Sopenharmony_ci * 9262306a36Sopenharmony_ci * In addition to the stable tree, KSM uses a second data structure called the 9362306a36Sopenharmony_ci * unstable tree: this tree holds pointers to pages which have been found to 9462306a36Sopenharmony_ci * be "unchanged for a period of time". The unstable tree sorts these pages 9562306a36Sopenharmony_ci * by their contents, but since they are not write-protected, KSM cannot rely 9662306a36Sopenharmony_ci * upon the unstable tree to work correctly - the unstable tree is liable to 9762306a36Sopenharmony_ci * be corrupted as its contents are modified, and so it is called unstable. 
9862306a36Sopenharmony_ci * 9962306a36Sopenharmony_ci * KSM solves this problem by several techniques: 10062306a36Sopenharmony_ci * 10162306a36Sopenharmony_ci * 1) The unstable tree is flushed every time KSM completes scanning all 10262306a36Sopenharmony_ci * memory areas, and then the tree is rebuilt again from the beginning. 10362306a36Sopenharmony_ci * 2) KSM will only insert into the unstable tree, pages whose hash value 10462306a36Sopenharmony_ci * has not changed since the previous scan of all memory areas. 10562306a36Sopenharmony_ci * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the 10662306a36Sopenharmony_ci * colors of the nodes and not on their contents, assuring that even when 10762306a36Sopenharmony_ci * the tree gets "corrupted" it won't get out of balance, so scanning time 10862306a36Sopenharmony_ci * remains the same (also, searching and inserting nodes in an rbtree uses 10962306a36Sopenharmony_ci * the same algorithm, so we have no overhead when we flush and rebuild). 11062306a36Sopenharmony_ci * 4) KSM never flushes the stable tree, which means that even if it were to 11162306a36Sopenharmony_ci * take 10 attempts to find a page in the unstable tree, once it is found, 11262306a36Sopenharmony_ci * it is secured in the stable tree. (When we scan a new page, we first 11362306a36Sopenharmony_ci * compare it against the stable tree, and then against the unstable tree.) 11462306a36Sopenharmony_ci * 11562306a36Sopenharmony_ci * If the merge_across_nodes tunable is unset, then KSM maintains multiple 11662306a36Sopenharmony_ci * stable trees and multiple unstable trees: one of each for each NUMA node. 
 */

/**
 * struct ksm_mm_slot - ksm information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 */
struct ksm_mm_slot {
	struct mm_slot slot;
	struct ksm_rmap_item *rmap_list;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct ksm_mm_slot *mm_slot;
	unsigned long address;
	struct ksm_rmap_item **rmap_list;
	unsigned long seqnr;
};

/**
 * struct ksm_stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 */
struct ksm_stable_node {
	union {
		struct rb_node node;	/* when node of stable tree */
		struct {		/* when listed for migration */
			struct list_head *head;
			struct {
				struct hlist_node hlist_dup;
				struct list_head list;
			};
		};
	};
	struct hlist_head hlist;
	union {
		unsigned long kpfn;
		unsigned long chain_prune_time;
	};
	/*
	 * STABLE_NODE_CHAIN can be any negative number in
	 * rmap_hlist_len negative range, but better not -1 to be able
	 * to reliably detect underflows.
	 */
#define STABLE_NODE_CHAIN -1024
	int rmap_hlist_len;
#ifdef CONFIG_NUMA
	int nid;
#endif
};

/**
 * struct ksm_rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
struct ksm_rmap_item {
	struct ksm_rmap_item *rmap_list;
	union {
		struct anon_vma *anon_vma;	/* when stable */
#ifdef CONFIG_NUMA
		int nid;		/* when node of unstable tree */
#endif
	};
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	unsigned int oldchecksum;	/* when unstable */
	union {
		struct rb_node node;	/* when node of unstable tree */
		struct {		/* when listed from stable tree */
			struct ksm_stable_node *head;
			struct hlist_node hlist;
		};
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
#define STABLE_FLAG	0x200	/* is listed from the stable tree */

/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
/*
 * Sentinel "head" value marking a stable_node as a "dup" hanging off a
 * chain: &migrate_nodes.prev can never equal a real stable_node pointer,
 * so is_stable_node_dup() can distinguish dups by comparing against it.
 */
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* Dummy head of the circular list of mm_slots being scanned */
static struct ksm_mm_slot ksm_mm_head = {
	.slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of pages scanned */
static unsigned long ksm_pages_scanned;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

/* The number of zero pages which is placed by KSM */
unsigned long ksm_zero_pages;

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
static int ksm_nr_node_ids = 1;
#else
#define ksm_merge_across_nodes	1U
#define ksm_nr_node_ids		1
#endif

/* Run-state bits for ksm_run, controlled via sysfs and memory hotplug */
#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
#define KSM_RUN_OFFLINE	4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

/*
 * Create the three slab caches used by KSM; tears down any already-created
 * caches on failure.  Returns 0 on success, -ENOMEM otherwise.
 */
static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free2;

	return 0;

out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	/* NULL mm_slot_cache so later code can tell the caches are gone */
	mm_slot_cache = NULL;
}

/* A "chain" is marked by the reserved negative rmap_hlist_len value */
static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
{
	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}

/* A "dup" is marked by the sentinel head pointer STABLE_NODE_DUP_HEAD */
static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
{
	return dup->head == STABLE_NODE_DUP_HEAD;
}

/* Link @dup onto @chain's hlist, converting it into a "dup" node */
static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
					     struct ksm_stable_node *chain)
{
	VM_BUG_ON(is_stable_node_dup(dup));
	dup->head = STABLE_NODE_DUP_HEAD;
	VM_BUG_ON(!is_stable_node_chain(chain));
	hlist_add_head(&dup->hlist_dup, &chain->hlist);
	ksm_stable_node_dups++;
}

static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
	VM_BUG_ON(!is_stable_node_dup(dup));
	hlist_del(&dup->hlist_dup);
	ksm_stable_node_dups--;
}

/*
 * Unlink a stable node that is either a dup (from its chain's hlist) or a
 * regular node (from its per-nid stable rbtree).  Must not be a chain head.
 */
static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
	VM_BUG_ON(is_stable_node_chain(dup));
	if (is_stable_node_dup(dup))
		__stable_node_dup_del(dup);
	else
		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
#ifdef CONFIG_DEBUG_VM
	dup->head = NULL;
#endif
}

static inline struct ksm_rmap_item *alloc_rmap_item(void)
{
	struct ksm_rmap_item *rmap_item;

	/* __GFP_NORETRY | __GFP_NOWARN: fail quietly under memory pressure */
	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
						__GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm->ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct ksm_stable_node *alloc_stable_node(void)
{
	/*
	 * The allocation can take too long with GFP_KERNEL when memory is under
	 * pressure, which may lead to hung task warnings. Adding __GFP_HIGH
	 * grants access to memory reserves, helping to avoid this problem.
	 */
	return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}

static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
	/* only chain heads may be freed with rmap_items still counted */
	VM_BUG_ON(stable_node->rmap_hlist_len &&
		  !is_stable_node_chain(stable_node));
	kmem_cache_free(stable_node_cache, stable_node);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
42662306a36Sopenharmony_ci */ 42762306a36Sopenharmony_cistatic inline bool ksm_test_exit(struct mm_struct *mm) 42862306a36Sopenharmony_ci{ 42962306a36Sopenharmony_ci return atomic_read(&mm->mm_users) == 0; 43062306a36Sopenharmony_ci} 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_cistatic int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, 43362306a36Sopenharmony_ci struct mm_walk *walk) 43462306a36Sopenharmony_ci{ 43562306a36Sopenharmony_ci struct page *page = NULL; 43662306a36Sopenharmony_ci spinlock_t *ptl; 43762306a36Sopenharmony_ci pte_t *pte; 43862306a36Sopenharmony_ci pte_t ptent; 43962306a36Sopenharmony_ci int ret; 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 44262306a36Sopenharmony_ci if (!pte) 44362306a36Sopenharmony_ci return 0; 44462306a36Sopenharmony_ci ptent = ptep_get(pte); 44562306a36Sopenharmony_ci if (pte_present(ptent)) { 44662306a36Sopenharmony_ci page = vm_normal_page(walk->vma, addr, ptent); 44762306a36Sopenharmony_ci } else if (!pte_none(ptent)) { 44862306a36Sopenharmony_ci swp_entry_t entry = pte_to_swp_entry(ptent); 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci /* 45162306a36Sopenharmony_ci * As KSM pages remain KSM pages until freed, no need to wait 45262306a36Sopenharmony_ci * here for migration to end. 
45362306a36Sopenharmony_ci */ 45462306a36Sopenharmony_ci if (is_migration_entry(entry)) 45562306a36Sopenharmony_ci page = pfn_swap_entry_to_page(entry); 45662306a36Sopenharmony_ci } 45762306a36Sopenharmony_ci /* return 1 if the page is an normal ksm page or KSM-placed zero page */ 45862306a36Sopenharmony_ci ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); 45962306a36Sopenharmony_ci pte_unmap_unlock(pte, ptl); 46062306a36Sopenharmony_ci return ret; 46162306a36Sopenharmony_ci} 46262306a36Sopenharmony_ci 46362306a36Sopenharmony_cistatic const struct mm_walk_ops break_ksm_ops = { 46462306a36Sopenharmony_ci .pmd_entry = break_ksm_pmd_entry, 46562306a36Sopenharmony_ci .walk_lock = PGWALK_RDLOCK, 46662306a36Sopenharmony_ci}; 46762306a36Sopenharmony_ci 46862306a36Sopenharmony_cistatic const struct mm_walk_ops break_ksm_lock_vma_ops = { 46962306a36Sopenharmony_ci .pmd_entry = break_ksm_pmd_entry, 47062306a36Sopenharmony_ci .walk_lock = PGWALK_WRLOCK, 47162306a36Sopenharmony_ci}; 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci/* 47462306a36Sopenharmony_ci * We use break_ksm to break COW on a ksm page by triggering unsharing, 47562306a36Sopenharmony_ci * such that the ksm page will get replaced by an exclusive anonymous page. 47662306a36Sopenharmony_ci * 47762306a36Sopenharmony_ci * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, 47862306a36Sopenharmony_ci * in case the application has unmapped and remapped mm,addr meanwhile. 47962306a36Sopenharmony_ci * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 48062306a36Sopenharmony_ci * mmap of /dev/mem, where we would not want to touch it. 48162306a36Sopenharmony_ci * 48262306a36Sopenharmony_ci * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context 48362306a36Sopenharmony_ci * of the process that owns 'vma'. We also do not want to enforce 48462306a36Sopenharmony_ci * protection keys here anyway. 
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
	vm_fault_t ret = 0;
	const struct mm_walk_ops *ops = lock_vma ?
				&break_ksm_lock_vma_ops : &break_ksm_ops;

	do {
		int ksm_page;

		cond_resched();
		/* walk just the single page at addr */
		ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
		if (WARN_ON_ONCE(ksm_page < 0))
			return ksm_page;
		if (!ksm_page)
			return 0;
		ret = handle_mm_fault(vma, addr,
				      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
				      NULL);
	} while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
	/*
	 * We must loop until we no longer find a KSM page because
	 * handle_mm_fault() may back out if there's any difficulty e.g. if
	 * pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course. The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

/*
 * Can this vma take part in KSM merging?  Shared, special, DAX and
 * arch-incompatible mappings are rejected.
 */
static bool vma_ksm_compatible(struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_SHARED  | VM_MAYSHARE | VM_PFNMAP  |
			     VM_IO      | VM_DONTEXPAND | VM_HUGETLB |
			     VM_MIXEDMAP))
		return false;		/* just ignore the advice */

	if (vma_is_dax(vma))
		return false;

#ifdef VM_SAO
	if (vma->vm_flags & VM_SAO)
		return false;
#endif
#ifdef VM_SPARC_ADI
	if (vma->vm_flags & VM_SPARC_ADI)
		return false;
#endif

	return true;
}

/*
 * Look up the VM_MERGEABLE, anon-backed vma covering @addr in @mm,
 * or NULL if there is none (or if the mm is already exiting).
 */
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
		unsigned long addr)
{
	struct vm_area_struct *vma;
	if (ksm_test_exit(mm))
		return NULL;
	vma = vma_lookup(mm, addr);
	if (!vma || !(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return NULL;
	return vma;
}

/* Undo a merge for this rmap_item: break COW at its address */
static void break_cow(struct ksm_rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;

	/*
	 * It is not an accident that whenever we want to break COW
	 * to undo, we also need to drop a reference to the anon_vma.
	 */
	put_anon_vma(rmap_item->anon_vma);

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (vma)
		break_ksm(vma, addr, false);
	mmap_read_unlock(mm);
}

/*
 * Take a reference on the anonymous page at rmap_item's address and
 * return it, or NULL if it is gone, not anonymous, or a device page.
 */
static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, addr);
	if (!vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (IS_ERR_OR_NULL(page))
		goto out;
	if (is_zone_device_page(page))
		goto out_putpage;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
out_putpage:
		put_page(page);
out:
		page = NULL;
	}
	mmap_read_unlock(mm);
	return page;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}

/*
 * Turn the regular stable node @dup into the first dup of a new chain
 * node, which takes @dup's place in the stable tree @root.  Returns the
 * chain node, or NULL if allocation failed (in which case the tree is
 * left untouched).
 */
static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
						   struct rb_root *root)
{
	struct ksm_stable_node *chain = alloc_stable_node();
	VM_BUG_ON(is_stable_node_chain(dup));
	if (likely(chain)) {
		INIT_HLIST_HEAD(&chain->hlist);
		chain->chain_prune_time = jiffies;
		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
		chain->nid = NUMA_NO_NODE; /* debug */
#endif
		ksm_stable_node_chains++;

		/*
		 * Put the stable node chain in the first dimension of
		 * the stable tree and at the same time remove the old
		 * stable node.
		 */
		rb_replace_node(&dup->node, &chain->node, root);

		/*
		 * Move the old stable node to the second dimension
		 * queued in the hlist_dup. The invariant is that all
		 * dup stable_nodes in the chain->hlist point to pages
		 * that are write protected and have the exact same
		 * content.
		 */
		stable_node_chain_add_dup(dup, chain);
	}
	return chain;
}

/* Erase an (empty) chain node from the stable tree @root and free it. */
static inline void free_stable_node_chain(struct ksm_stable_node *chain,
					  struct rb_root *root)
{
	rb_erase(&chain->node, root);
	free_stable_node(chain);
	ksm_stable_node_chains--;
}

/*
 * Tear down @stable_node: detach every rmap_item still hanging off it
 * (fixing up the pages_sharing/pages_shared and per-mm counters and
 * dropping each item's anon_vma reference), unlink the node from the
 * stable tree or migrate_nodes list, and free it.
 */
static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
	struct ksm_rmap_item *rmap_item;

	/* check it's not STABLE_NODE_CHAIN or negative */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		if (rmap_item->hlist.next) {
			ksm_pages_sharing--;
			trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
		} else {
			ksm_pages_shared--;
		}

		rmap_item->mm->ksm_merging_pages--;

		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;
		put_anon_vma(rmap_item->anon_vma);
		rmap_item->address &= PAGE_MASK;
		cond_resched();
	}

	/*
	 * We need the second aligned pointer of the migrate_nodes
	 * list_head to stay clear from the rb_parent_color union
	 * (aligned and different than any node) and also different
	 * from &migrate_nodes. This will verify that future list.h changes
	 * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
	 */
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);

	trace_ksm_remove_ksm_page(stable_node->kpfn);
	if (stable_node->head == &migrate_nodes)
		list_del(&stable_node->list);
	else
		stable_node_dup_del(stable_node);
	free_stable_node(stable_node);
}

/* How (and whether) get_ksm_page() should lock the page it returns. */
enum get_ksm_page_flags {
	GET_KSM_PAGE_NOLOCK,
	GET_KSM_PAGE_LOCK,
	GET_KSM_PAGE_TRYLOCK
};

/*
 * get_ksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node.  This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 * is on its way to being freed; but it is an anomaly to bear in mind.
 */
static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
				 enum get_ksm_page_flags flags)
{
	struct page *page;
	void *expected_mapping;
	unsigned long kpfn;

	expected_mapping = (void *)((unsigned long)stable_node |
					PAGE_MAPPING_KSM);
again:
	kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
	page = pfn_to_page(kpfn);
	if (READ_ONCE(page->mapping) != expected_mapping)
		goto stale;

	/*
	 * We cannot do anything with the page while its refcount is 0.
	 * Usually 0 means free, or tail of a higher-order page: in which
	 * case this node is no longer referenced, and should be freed;
	 * however, it might mean that the page is under page_ref_freeze().
	 * The __remove_mapping() case is easy, again the node is now stale;
	 * the same is in reuse_ksm_page() case; but if page is swapcache
	 * in folio_migrate_mapping(), it might still be our page,
	 * in which case it's essential to keep the node.
	 */
	while (!get_page_unless_zero(page)) {
		/*
		 * Another check for page->mapping != expected_mapping would
		 * work here too.  We have chosen the !PageSwapCache test to
		 * optimize the common case, when the page is or is about to
		 * be freed: PageSwapCache is cleared (under spin_lock_irq)
		 * in the ref_freeze section of __remove_mapping(); but Anon
		 * page->mapping reset to NULL later, in free_pages_prepare().
		 */
		if (!PageSwapCache(page))
			goto stale;
		cpu_relax();
	}

	/* Re-check now that we hold a reference: it may have been zapped. */
	if (READ_ONCE(page->mapping) != expected_mapping) {
		put_page(page);
		goto stale;
	}

	if (flags == GET_KSM_PAGE_TRYLOCK) {
		if (!trylock_page(page)) {
			put_page(page);
			return ERR_PTR(-EBUSY);
		}
	} else if (flags == GET_KSM_PAGE_LOCK)
		lock_page(page);

	/* And once more under the page lock, if we took it. */
	if (flags != GET_KSM_PAGE_NOLOCK) {
		if (READ_ONCE(page->mapping) != expected_mapping) {
			unlock_page(page);
			put_page(page);
			goto stale;
		}
	}
	return page;

stale:
	/*
	 * We come here from above when page->mapping or !PageSwapCache
	 * suggests that the node is stale; but it might be under migration.
	 * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
	 * before checking whether node->kpfn has been changed.
	 */
	smp_rmb();
	if (READ_ONCE(stable_node->kpfn) != kpfn)
		goto again;
	remove_node_from_stable_tree(stable_node);
	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct ksm_stable_node *stable_node;
		struct page *page;

		stable_node = rmap_item->head;
		page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
		if (!page)
			goto out;

		hlist_del(&rmap_item->hlist);
		unlock_page(page);
		put_page(page);

		if (!hlist_empty(&stable_node->hlist))
			ksm_pages_sharing--;
		else
			ksm_pages_shared--;

		rmap_item->mm->ksm_merging_pages--;

		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
		stable_node->rmap_hlist_len--;

		put_anon_vma(rmap_item->anon_vma);
		rmap_item->head = NULL;
		rmap_item->address &= PAGE_MASK;

	} else if (rmap_item->address & UNSTABLE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node,
				 root_unstable_tree + NUMA(rmap_item->nid));
		ksm_pages_unshared--;
		rmap_item->address &= PAGE_MASK;
	}
out:
	cond_resched();		/* we're called from many long loops */
}

/*
 * Pop and free every rmap_item remaining on *rmap_list, removing each
 * from its tree first.  Used when a region (or a whole mm) is dropped.
 */
static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
{
	while (*rmap_list) {
		struct ksm_rmap_item *rmap_item = *rmap_list;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end, bool lock_vma)
{
	unsigned long addr;
	int err = 0;

	/* break_ksm() each page in [start, end); stop on exit or error */
	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr, lock_vma);
	}
	return err;
}

/* Return the stable node a KSM folio's mapping points at, else NULL. */
static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
{
	return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}

static inline struct ksm_stable_node *page_stable_node(struct page *page)
{
	return folio_stable_node(page_folio(page));
}

/*
 * Point @page's mapping at @stable_node (tagged PAGE_MAPPING_KSM), or
 * clear the association when @stable_node is NULL.
 */
static inline void set_page_stable_node(struct page *page,
					struct ksm_stable_node *stable_node)
{
	VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
	page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int remove_stable_node(struct ksm_stable_node *stable_node)
{
	struct page *page;
	int err;

	page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
	if (!page) {
		/*
		 * get_ksm_page did remove_node_from_stable_tree itself.
		 */
		return 0;
	}

	/*
	 * Page could be still mapped if this races with __mmput() running in
	 * between ksm_exit() and exit_mmap(). Just refuse to let
	 * merge_across_nodes/max_page_sharing be switched.
	 */
	err = -EBUSY;
	if (!page_mapped(page)) {
		/*
		 * The stable node did not yet appear stale to get_ksm_page(),
		 * since that allows for an unmapped ksm page to be recognized
		 * right up until it is freed; but the node is safe to remove.
		 * This page might be in an LRU cache waiting to be freed,
		 * or it might be PageSwapCache (perhaps under writeback),
		 * or it might have been removed from swapcache a moment ago.
		 */
		set_page_stable_node(page, NULL);
		remove_node_from_stable_tree(stable_node);
		err = 0;
	}

	unlock_page(page);
	put_page(page);
	return err;
}

/*
 * Remove a stable node, or - if it is a chain node - every dup on its
 * hlist and then the chain itself.  Returns true (as int) if any node
 * was still mapped and could not be removed, false on full success.
 */
static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
				    struct rb_root *root)
{
	struct ksm_stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		if (remove_stable_node(stable_node))
			return true;
		else
			return false;
	}

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		if (remove_stable_node(dup))
			return true;
	}
	BUG_ON(!hlist_empty(&stable_node->hlist));
	free_stable_node_chain(stable_node, root);
	return false;
}

/*
 * Empty every per-node stable tree and the migrate_nodes list.
 * Returns 0, or -EBUSY if some node was still in use and left behind.
 */
static int remove_all_stable_nodes(void)
{
	struct ksm_stable_node *stable_node, *next;
	int nid;
	int err = 0;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		while (root_stable_tree[nid].rb_node) {
			stable_node = rb_entry(root_stable_tree[nid].rb_node,
						struct ksm_stable_node, node);
			if (remove_stable_node_chain(stable_node,
						     root_stable_tree + nid)) {
				err = -EBUSY;
				break;	/* proceed to next nid */
			}
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (remove_stable_node(stable_node))
			err = -EBUSY;
		cond_resched();
	}
	return err;
}

/*
 * Walk every registered mm, unmerge all its mergeable vmas and drop all
 * its rmap_items; mms found exiting are unlinked and released.  On
 * success the stable trees are emptied too and the scan cursor/seqnr
 * reset.  On error the cursor is parked back at ksm_mm_head.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	slot = list_entry(ksm_mm_head.slot.mm_node.next,
			  struct mm_slot, mm_node);
	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
	     mm_slot = ksm_scan.mm_slot) {
		VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);

		mm = mm_slot->slot.mm;
		mmap_read_lock(mm);

		/*
		 * Exit right away if mm is exiting to avoid lockdep issue in
		 * the maple tree
		 */
		if (ksm_test_exit(mm))
			goto mm_exiting;

		for_each_vma(vmi, vma) {
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end, false);
			if (err)
				goto error;
		}

mm_exiting:
		remove_trailing_rmap_items(&mm_slot->rmap_list);
		mmap_read_unlock(mm);

		/* Advance the cursor before possibly freeing this slot. */
		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(mm_slot->slot.mm_node.next,
				  struct mm_slot, mm_node);
		ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
		if (ksm_test_exit(mm)) {
			hash_del(&mm_slot->slot.hash);
			list_del(&mm_slot->slot.mm_node);
			spin_unlock(&ksm_mmlist_lock);

			mm_slot_free(mm_slot_cache, mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
			mmdrop(mm);
		} else
			spin_unlock(&ksm_mmlist_lock);
	}

	/* Clean up stable nodes, but don't worry if some are still busy */
	remove_all_stable_nodes();
	ksm_scan.seqnr = 0;
	return 0;

error:
	mmap_read_unlock(mm);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

/* Hash a page's contents, used to detect pages that stopped changing. */
static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page);
	checksum = xxhash(addr, PAGE_SIZE, 0);
	kunmap_atomic(addr);
	return checksum;
}

/*
 * Write-protect @page's pte in @vma so its contents cannot change under
 * us, storing the resulting pte value in *orig_pte for a later
 * pte_same() check.  Returns 0 on success, -EFAULT if the page is not
 * (or no longer) stably mapped, e.g. concurrent O_DIRECT on the page.
 */
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
	int swapped;
	int err = -EFAULT;
	struct mmu_notifier_range range;
	bool anon_exclusive;
	pte_t entry;

	pvmw.address = page_address_in_vma(page, vma);
	if (pvmw.address == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
				pvmw.address + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	if (!page_vma_mapped_walk(&pvmw))
		goto out_mn;
	if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
		goto out_unlock;

	anon_exclusive = PageAnonExclusive(page);
	entry = ptep_get(pvmw.pte);
	if (pte_write(entry) || pte_dirty(entry) ||
	    anon_exclusive || mm_tlb_flush_pending(mm)) {
		swapped = PageSwapCache(page);
		flush_cache_page(vma, pvmw.address, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() run it doesn't
		 * take any lock, therefore the check that we are going to make
		 * with the pagecount against the mapcount is racy and
		 * O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check
		 * this assure us that no O_DIRECT can happen after the check
		 * or in the middle of the check.
		 *
		 * No need to notify as we are downgrading page table to read
		 * only not changing it to point to a new page.
		 *
		 * See Documentation/mm/mmu_notifier.rst
		 */
		entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on the
		 * page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			/* Extra references exist: restore the pte and bail. */
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}

		/* See page_try_share_anon_rmap(): clear PTE first. */
		if (anon_exclusive && page_try_share_anon_rmap(page)) {
			set_pte_at(mm, pvmw.address, pvmw.pte, entry);
			goto out_unlock;
		}

		/* Preserve dirty data before cleaning the pte. */
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(entry);

		if (pte_write(entry))
			entry = pte_wrprotect(entry);

		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
	}
	/* Record the pte we protected (or found already read-only). */
	*orig_pte = entry;
	err = 0;

out_unlock:
	page_vma_mapped_walk_done(&pvmw);
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma: vma that holds the pte pointing to page
 * @page: the page we are replacing by kpage
 * @kpage: the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	struct folio *folio;
	pmd_t *pmd;
	pmd_t pmde;
	pte_t *ptep;
	pte_t newpte;
	spinlock_t *ptl;
	unsigned long addr;
	int err = -EFAULT;
	struct mmu_notifier_range range;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pmd = mm_find_pmd(mm, addr);
	if (!pmd)
		goto out;
	/*
	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
	 * without holding anon_vma lock for write. So when looking for a
	 * genuine pmde (in which to find pte), test present and !THP together.
	 */
	pmde = pmdp_get_lockless(pmd);
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		goto out;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
				addr + PAGE_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!ptep)
		goto out_mn;
	/*
	 * The pte must still match the value sampled (and write-protected)
	 * earlier: if it changed under us, someone touched the page and we
	 * must not replace it.
	 */
	if (!pte_same(ptep_get(ptep), orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}
	VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
	VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);

	/*
	 * No need to check ksm_use_zero_pages here: we can only have a
	 * zero_page here if ksm_use_zero_pages was enabled already.
	 */
	if (!is_zero_pfn(page_to_pfn(kpage))) {
		/* Take a reference and an rmap mapping before mapping kpage. */
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
		newpte = mk_pte(kpage, vma->vm_page_prot);
	} else {
		/*
		 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
		 * we can easily track all KSM-placed zero pages by checking if
		 * the dirty bit in zero page's PTE is set.
		 */
		newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
		ksm_zero_pages++;
		mm->ksm_zero_pages++;
		/*
		 * We're replacing an anonymous page with a zero page, which is
		 * not anonymous. We need to do proper accounting otherwise we
		 * will get wrong values in /proc, and a BUG message in dmesg
		 * when tearing down the mm.
		 */
		dec_mm_counter(mm, MM_ANONPAGES);
	}

	flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
	/*
	 * No need to notify as we are replacing a read only page with another
	 * read only page with the same content.
	 *
	 * See Documentation/mm/mmu_notifier.rst
	 */
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, newpte);

	/* Drop the old page's mapping; free its swap slot if fully unmapped. */
	folio = page_folio(page);
	page_remove_rmap(page, vma, false);
	if (!folio_mapped(folio))
		folio_free_swap(folio);
	folio_put(folio);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(&range);
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *	   or NULL the first time when we want to use page as kpage.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	/* Only anonymous pages can be merged by KSM. */
	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page(). We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	/* KSM operates on base pages: split a transparent huge page first. */
	if (PageTransCompound(page)) {
		if (split_huge_page(page))
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected. If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected. But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
				      struct page *page, struct page *kpage)
{
	struct mm_struct *mm = rmap_item->mm;
	struct vm_area_struct *vma;
	int err = -EFAULT;

	/* The vma may have gone away or become unmergeable since scanning. */
	mmap_read_lock(mm);
	vma = find_mergeable_vma(mm, rmap_item->address);
	if (!vma)
		goto out;

	err = try_to_merge_one_page(vma, page, kpage);
	if (err)
		goto out;

	/* Unstable nid is in union with stable anon_vma: remove first */
	remove_rmap_item_from_tree(rmap_item);

	/* Must get reference to anon_vma while still holding mmap_lock */
	rmap_item->anon_vma = vma->anon_vma;
	get_anon_vma(vma->anon_vma);
out:
	mmap_read_unlock(mm);
	trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
				rmap_item, mm, err);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns the kpage if we successfully merged two identical
 * pages into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
					   struct page *page,
					   struct ksm_rmap_item *tree_rmap_item,
					   struct page *tree_page)
{
	int err;

	/* First upgrade page itself to a ksm page (kpage == NULL)... */
	err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
	if (!err) {
		/* ...then map tree_page's pte to the new ksm page. */
		err = try_to_merge_with_ksm_page(tree_rmap_item,
							tree_page, page);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(rmap_item);
	}
	return err ? NULL : page;
}

static __always_inline
bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
{
	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
	/*
	 * Check that at least one mapping still exists, otherwise
	 * there's not much point to merge and share with this
	 * stable_node, as the underlying tree_page of the other
	 * sharer is going to be freed soon.
	 *
	 * @offset accounts for mappings the caller is about to add,
	 * so the ksm_max_page_sharing limit is not exceeded.
	 */
	return stable_node->rmap_hlist_len &&
		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

/* Can this stable_node accept one more sharer right now? */
static __always_inline
bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
	return __is_page_sharing_candidate(stable_node, 0);
}

/*
 * Walk the dups hanging off a stable_node chain and pick the best one to
 * merge with: the sharing candidate with the highest rmap_hlist_len.
 * Returns its page (with a reference held) or NULL, and stores the chosen
 * dup in *_stable_node_dup.
 *
 * When prune_stale_stable_nodes is true (and the chain's prune interval
 * has elapsed) the full chain is walked so stale dups get dropped, and a
 * chain that collapsed to a single dup is replaced in the rbtree by that
 * dup (updating *_stable_node accordingly).
 */
static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
				    struct ksm_stable_node **_stable_node,
				    struct rb_root *root,
				    bool prune_stale_stable_nodes)
{
	struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
	struct hlist_node *hlist_safe;
	struct page *_tree_page, *tree_page = NULL;
	int nr = 0;
	int found_rmap_hlist_len;

	/* Rate-limit pruning to once per chain_prune interval. */
	if (!prune_stale_stable_nodes ||
	    time_before(jiffies, stable_node->chain_prune_time +
			msecs_to_jiffies(
				ksm_stable_node_chains_prune_millisecs)))
		prune_stale_stable_nodes = false;
	else
		stable_node->chain_prune_time = jiffies;

	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		cond_resched();
		/*
		 * We must walk all stable_node_dup to prune the stale
		 * stable nodes during lookup.
		 *
		 * get_ksm_page can drop the nodes from the
		 * stable_node->hlist if they point to freed pages
		 * (that's why we do a _safe walk). The "dup"
		 * stable_node parameter itself will be freed from
		 * under us if it returns NULL.
		 */
		_tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
		if (!_tree_page)
			continue;
		nr += 1;
		if (is_page_sharing_candidate(dup)) {
			/* Prefer the dup that already has the most sharers. */
			if (!found ||
			    dup->rmap_hlist_len > found_rmap_hlist_len) {
				if (found)
					put_page(tree_page);
				found = dup;
				found_rmap_hlist_len = found->rmap_hlist_len;
				tree_page = _tree_page;

				/* skip put_page for found dup */
				if (!prune_stale_stable_nodes)
					break;
				continue;
			}
		}
		put_page(_tree_page);
	}

	if (found) {
		/*
		 * nr is counting all dups in the chain only if
		 * prune_stale_stable_nodes is true, otherwise we may
		 * break the loop at nr == 1 even if there are
		 * multiple entries.
		 */
		if (prune_stale_stable_nodes && nr == 1) {
			/*
			 * If there's not just one entry it would
			 * corrupt memory, better BUG_ON. In KSM
			 * context with no lock held it's not even
			 * fatal.
			 */
			BUG_ON(stable_node->hlist.first->next);

			/*
			 * There's just one entry and it is below the
			 * deduplication limit so drop the chain.
			 */
			rb_replace_node(&stable_node->node, &found->node,
					root);
			free_stable_node(stable_node);
			ksm_stable_node_chains--;
			ksm_stable_node_dups--;
			/*
			 * NOTE: the caller depends on the stable_node
			 * to be equal to stable_node_dup if the chain
			 * was collapsed.
			 */
			*_stable_node = found;
			/*
			 * Just for robustness, as stable_node is
			 * otherwise left as a stale pointer, the
			 * compiler shall optimize it away at build
			 * time.
			 */
			stable_node = NULL;
		} else if (stable_node->hlist.first != &found->hlist_dup &&
			   __is_page_sharing_candidate(found, 1)) {
			/*
			 * If the found stable_node dup can accept one
			 * more future merge (in addition to the one
			 * that is underway) and is not at the head of
			 * the chain, put it there so next search will
			 * be quicker in the !prune_stale_stable_nodes
			 * case.
			 *
			 * NOTE: it would be inaccurate to use nr > 1
			 * instead of checking the hlist.first pointer
			 * directly, because in the
			 * prune_stale_stable_nodes case "nr" isn't
			 * the position of the found dup in the chain,
			 * but the total number of dups in the chain.
			 */
			hlist_del(&found->hlist_dup);
			hlist_add_head(&found->hlist_dup,
				       &stable_node->hlist);
		}
	}

	*_stable_node_dup = found;
	return tree_page;
}

/*
 * Return any dup of the chain (for a plain stable_node, itself), or NULL
 * after freeing an empty chain. Used when every dup is full but the tree
 * walk still needs a representative page to compare against.
 */
static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
					struct rb_root *root)
{
	if (!is_stable_node_chain(stable_node))
		return stable_node;
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return NULL;
	}
	return hlist_entry(stable_node->hlist.first,
			   typeof(*stable_node), hlist_dup);
}

/*
 * Like for get_ksm_page, this function can free the *_stable_node and
 * *_stable_node_dup if the returned tree_page is NULL.
 *
 * It can also free and overwrite *_stable_node with the found
 * stable_node_dup if the chain is collapsed (in which case
 * *_stable_node will be equal to *_stable_node_dup like if the chain
 * never existed). It's up to the caller to verify tree_page is not
 * NULL before dereferencing *_stable_node or *_stable_node_dup.
 *
 * *_stable_node_dup is really a second output parameter of this
 * function and will be overwritten in all cases, the caller doesn't
 * need to initialize it.
 */
static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
					struct ksm_stable_node **_stable_node,
					struct rb_root *root,
					bool prune_stale_stable_nodes)
{
	struct ksm_stable_node *stable_node = *_stable_node;
	/* A plain (non-chain) stable_node is its own candidate dup. */
	if (!is_stable_node_chain(stable_node)) {
		if (is_page_sharing_candidate(stable_node)) {
			*_stable_node_dup = stable_node;
			return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
		}
		/*
		 * _stable_node_dup set to NULL means the stable_node
		 * reached the ksm_max_page_sharing limit.
		 */
		*_stable_node_dup = NULL;
		return NULL;
	}
	/* It's a chain: pick the best dup, optionally pruning stale ones. */
	return stable_node_dup(_stable_node_dup, _stable_node, root,
			       prune_stale_stable_nodes);
}

/* Chain walk with stale-dup pruning enabled; *s_n may be updated/freed. */
static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
						struct ksm_stable_node **s_n,
						struct rb_root *root)
{
	return __stable_node_chain(s_n_d, s_n, root, true);
}

/* Chain walk without pruning: s_n is guaranteed to stay unchanged. */
static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
					  struct ksm_stable_node *s_n,
					  struct rb_root *root)
{
	struct ksm_stable_node *old_stable_node = s_n;
	struct page *tree_page;

	tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
	/* not pruning dups so s_n cannot have changed */
	VM_BUG_ON(s_n != old_stable_node);
	return tree_page;
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns the stable tree node of identical content if found,
 * NULL otherwise.
163062306a36Sopenharmony_ci */ 163162306a36Sopenharmony_cistatic struct page *stable_tree_search(struct page *page) 163262306a36Sopenharmony_ci{ 163362306a36Sopenharmony_ci int nid; 163462306a36Sopenharmony_ci struct rb_root *root; 163562306a36Sopenharmony_ci struct rb_node **new; 163662306a36Sopenharmony_ci struct rb_node *parent; 163762306a36Sopenharmony_ci struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; 163862306a36Sopenharmony_ci struct ksm_stable_node *page_node; 163962306a36Sopenharmony_ci 164062306a36Sopenharmony_ci page_node = page_stable_node(page); 164162306a36Sopenharmony_ci if (page_node && page_node->head != &migrate_nodes) { 164262306a36Sopenharmony_ci /* ksm page forked */ 164362306a36Sopenharmony_ci get_page(page); 164462306a36Sopenharmony_ci return page; 164562306a36Sopenharmony_ci } 164662306a36Sopenharmony_ci 164762306a36Sopenharmony_ci nid = get_kpfn_nid(page_to_pfn(page)); 164862306a36Sopenharmony_ci root = root_stable_tree + nid; 164962306a36Sopenharmony_ciagain: 165062306a36Sopenharmony_ci new = &root->rb_node; 165162306a36Sopenharmony_ci parent = NULL; 165262306a36Sopenharmony_ci 165362306a36Sopenharmony_ci while (*new) { 165462306a36Sopenharmony_ci struct page *tree_page; 165562306a36Sopenharmony_ci int ret; 165662306a36Sopenharmony_ci 165762306a36Sopenharmony_ci cond_resched(); 165862306a36Sopenharmony_ci stable_node = rb_entry(*new, struct ksm_stable_node, node); 165962306a36Sopenharmony_ci stable_node_any = NULL; 166062306a36Sopenharmony_ci tree_page = chain_prune(&stable_node_dup, &stable_node, root); 166162306a36Sopenharmony_ci /* 166262306a36Sopenharmony_ci * NOTE: stable_node may have been freed by 166362306a36Sopenharmony_ci * chain_prune() if the returned stable_node_dup is 166462306a36Sopenharmony_ci * not NULL. 
stable_node_dup may have been inserted in 166562306a36Sopenharmony_ci * the rbtree instead as a regular stable_node (in 166662306a36Sopenharmony_ci * order to collapse the stable_node chain if a single 166762306a36Sopenharmony_ci * stable_node dup was found in it). In such case the 166862306a36Sopenharmony_ci * stable_node is overwritten by the callee to point 166962306a36Sopenharmony_ci * to the stable_node_dup that was collapsed in the 167062306a36Sopenharmony_ci * stable rbtree and stable_node will be equal to 167162306a36Sopenharmony_ci * stable_node_dup like if the chain never existed. 167262306a36Sopenharmony_ci */ 167362306a36Sopenharmony_ci if (!stable_node_dup) { 167462306a36Sopenharmony_ci /* 167562306a36Sopenharmony_ci * Either all stable_node dups were full in 167662306a36Sopenharmony_ci * this stable_node chain, or this chain was 167762306a36Sopenharmony_ci * empty and should be rb_erased. 167862306a36Sopenharmony_ci */ 167962306a36Sopenharmony_ci stable_node_any = stable_node_dup_any(stable_node, 168062306a36Sopenharmony_ci root); 168162306a36Sopenharmony_ci if (!stable_node_any) { 168262306a36Sopenharmony_ci /* rb_erase just run */ 168362306a36Sopenharmony_ci goto again; 168462306a36Sopenharmony_ci } 168562306a36Sopenharmony_ci /* 168662306a36Sopenharmony_ci * Take any of the stable_node dups page of 168762306a36Sopenharmony_ci * this stable_node chain to let the tree walk 168862306a36Sopenharmony_ci * continue. All KSM pages belonging to the 168962306a36Sopenharmony_ci * stable_node dups in a stable_node chain 169062306a36Sopenharmony_ci * have the same content and they're 169162306a36Sopenharmony_ci * write protected at all times. Any will work 169262306a36Sopenharmony_ci * fine to continue the walk. 
169362306a36Sopenharmony_ci */ 169462306a36Sopenharmony_ci tree_page = get_ksm_page(stable_node_any, 169562306a36Sopenharmony_ci GET_KSM_PAGE_NOLOCK); 169662306a36Sopenharmony_ci } 169762306a36Sopenharmony_ci VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); 169862306a36Sopenharmony_ci if (!tree_page) { 169962306a36Sopenharmony_ci /* 170062306a36Sopenharmony_ci * If we walked over a stale stable_node, 170162306a36Sopenharmony_ci * get_ksm_page() will call rb_erase() and it 170262306a36Sopenharmony_ci * may rebalance the tree from under us. So 170362306a36Sopenharmony_ci * restart the search from scratch. Returning 170462306a36Sopenharmony_ci * NULL would be safe too, but we'd generate 170562306a36Sopenharmony_ci * false negative insertions just because some 170662306a36Sopenharmony_ci * stable_node was stale. 170762306a36Sopenharmony_ci */ 170862306a36Sopenharmony_ci goto again; 170962306a36Sopenharmony_ci } 171062306a36Sopenharmony_ci 171162306a36Sopenharmony_ci ret = memcmp_pages(page, tree_page); 171262306a36Sopenharmony_ci put_page(tree_page); 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci parent = *new; 171562306a36Sopenharmony_ci if (ret < 0) 171662306a36Sopenharmony_ci new = &parent->rb_left; 171762306a36Sopenharmony_ci else if (ret > 0) 171862306a36Sopenharmony_ci new = &parent->rb_right; 171962306a36Sopenharmony_ci else { 172062306a36Sopenharmony_ci if (page_node) { 172162306a36Sopenharmony_ci VM_BUG_ON(page_node->head != &migrate_nodes); 172262306a36Sopenharmony_ci /* 172362306a36Sopenharmony_ci * Test if the migrated page should be merged 172462306a36Sopenharmony_ci * into a stable node dup. If the mapcount is 172562306a36Sopenharmony_ci * 1 we can migrate it with another KSM page 172662306a36Sopenharmony_ci * without adding it to the chain. 
172762306a36Sopenharmony_ci */ 172862306a36Sopenharmony_ci if (page_mapcount(page) > 1) 172962306a36Sopenharmony_ci goto chain_append; 173062306a36Sopenharmony_ci } 173162306a36Sopenharmony_ci 173262306a36Sopenharmony_ci if (!stable_node_dup) { 173362306a36Sopenharmony_ci /* 173462306a36Sopenharmony_ci * If the stable_node is a chain and 173562306a36Sopenharmony_ci * we got a payload match in memcmp 173662306a36Sopenharmony_ci * but we cannot merge the scanned 173762306a36Sopenharmony_ci * page in any of the existing 173862306a36Sopenharmony_ci * stable_node dups because they're 173962306a36Sopenharmony_ci * all full, we need to wait the 174062306a36Sopenharmony_ci * scanned page to find itself a match 174162306a36Sopenharmony_ci * in the unstable tree to create a 174262306a36Sopenharmony_ci * brand new KSM page to add later to 174362306a36Sopenharmony_ci * the dups of this stable_node. 174462306a36Sopenharmony_ci */ 174562306a36Sopenharmony_ci return NULL; 174662306a36Sopenharmony_ci } 174762306a36Sopenharmony_ci 174862306a36Sopenharmony_ci /* 174962306a36Sopenharmony_ci * Lock and unlock the stable_node's page (which 175062306a36Sopenharmony_ci * might already have been migrated) so that page 175162306a36Sopenharmony_ci * migration is sure to notice its raised count. 175262306a36Sopenharmony_ci * It would be more elegant to return stable_node 175362306a36Sopenharmony_ci * than kpage, but that involves more changes. 175462306a36Sopenharmony_ci */ 175562306a36Sopenharmony_ci tree_page = get_ksm_page(stable_node_dup, 175662306a36Sopenharmony_ci GET_KSM_PAGE_TRYLOCK); 175762306a36Sopenharmony_ci 175862306a36Sopenharmony_ci if (PTR_ERR(tree_page) == -EBUSY) 175962306a36Sopenharmony_ci return ERR_PTR(-EBUSY); 176062306a36Sopenharmony_ci 176162306a36Sopenharmony_ci if (unlikely(!tree_page)) 176262306a36Sopenharmony_ci /* 176362306a36Sopenharmony_ci * The tree may have been rebalanced, 176462306a36Sopenharmony_ci * so re-evaluate parent and new. 
176562306a36Sopenharmony_ci */ 176662306a36Sopenharmony_ci goto again; 176762306a36Sopenharmony_ci unlock_page(tree_page); 176862306a36Sopenharmony_ci 176962306a36Sopenharmony_ci if (get_kpfn_nid(stable_node_dup->kpfn) != 177062306a36Sopenharmony_ci NUMA(stable_node_dup->nid)) { 177162306a36Sopenharmony_ci put_page(tree_page); 177262306a36Sopenharmony_ci goto replace; 177362306a36Sopenharmony_ci } 177462306a36Sopenharmony_ci return tree_page; 177562306a36Sopenharmony_ci } 177662306a36Sopenharmony_ci } 177762306a36Sopenharmony_ci 177862306a36Sopenharmony_ci if (!page_node) 177962306a36Sopenharmony_ci return NULL; 178062306a36Sopenharmony_ci 178162306a36Sopenharmony_ci list_del(&page_node->list); 178262306a36Sopenharmony_ci DO_NUMA(page_node->nid = nid); 178362306a36Sopenharmony_ci rb_link_node(&page_node->node, parent, new); 178462306a36Sopenharmony_ci rb_insert_color(&page_node->node, root); 178562306a36Sopenharmony_ciout: 178662306a36Sopenharmony_ci if (is_page_sharing_candidate(page_node)) { 178762306a36Sopenharmony_ci get_page(page); 178862306a36Sopenharmony_ci return page; 178962306a36Sopenharmony_ci } else 179062306a36Sopenharmony_ci return NULL; 179162306a36Sopenharmony_ci 179262306a36Sopenharmony_cireplace: 179362306a36Sopenharmony_ci /* 179462306a36Sopenharmony_ci * If stable_node was a chain and chain_prune collapsed it, 179562306a36Sopenharmony_ci * stable_node has been updated to be the new regular 179662306a36Sopenharmony_ci * stable_node. A collapse of the chain is indistinguishable 179762306a36Sopenharmony_ci * from the case there was no chain in the stable 179862306a36Sopenharmony_ci * rbtree. Otherwise stable_node is the chain and 179962306a36Sopenharmony_ci * stable_node_dup is the dup to replace. 
180062306a36Sopenharmony_ci */ 180162306a36Sopenharmony_ci if (stable_node_dup == stable_node) { 180262306a36Sopenharmony_ci VM_BUG_ON(is_stable_node_chain(stable_node_dup)); 180362306a36Sopenharmony_ci VM_BUG_ON(is_stable_node_dup(stable_node_dup)); 180462306a36Sopenharmony_ci /* there is no chain */ 180562306a36Sopenharmony_ci if (page_node) { 180662306a36Sopenharmony_ci VM_BUG_ON(page_node->head != &migrate_nodes); 180762306a36Sopenharmony_ci list_del(&page_node->list); 180862306a36Sopenharmony_ci DO_NUMA(page_node->nid = nid); 180962306a36Sopenharmony_ci rb_replace_node(&stable_node_dup->node, 181062306a36Sopenharmony_ci &page_node->node, 181162306a36Sopenharmony_ci root); 181262306a36Sopenharmony_ci if (is_page_sharing_candidate(page_node)) 181362306a36Sopenharmony_ci get_page(page); 181462306a36Sopenharmony_ci else 181562306a36Sopenharmony_ci page = NULL; 181662306a36Sopenharmony_ci } else { 181762306a36Sopenharmony_ci rb_erase(&stable_node_dup->node, root); 181862306a36Sopenharmony_ci page = NULL; 181962306a36Sopenharmony_ci } 182062306a36Sopenharmony_ci } else { 182162306a36Sopenharmony_ci VM_BUG_ON(!is_stable_node_chain(stable_node)); 182262306a36Sopenharmony_ci __stable_node_dup_del(stable_node_dup); 182362306a36Sopenharmony_ci if (page_node) { 182462306a36Sopenharmony_ci VM_BUG_ON(page_node->head != &migrate_nodes); 182562306a36Sopenharmony_ci list_del(&page_node->list); 182662306a36Sopenharmony_ci DO_NUMA(page_node->nid = nid); 182762306a36Sopenharmony_ci stable_node_chain_add_dup(page_node, stable_node); 182862306a36Sopenharmony_ci if (is_page_sharing_candidate(page_node)) 182962306a36Sopenharmony_ci get_page(page); 183062306a36Sopenharmony_ci else 183162306a36Sopenharmony_ci page = NULL; 183262306a36Sopenharmony_ci } else { 183362306a36Sopenharmony_ci page = NULL; 183462306a36Sopenharmony_ci } 183562306a36Sopenharmony_ci } 183662306a36Sopenharmony_ci stable_node_dup->head = &migrate_nodes; 183762306a36Sopenharmony_ci list_add(&stable_node_dup->list, 
stable_node_dup->head); 183862306a36Sopenharmony_ci return page; 183962306a36Sopenharmony_ci 184062306a36Sopenharmony_cichain_append: 184162306a36Sopenharmony_ci /* stable_node_dup could be null if it reached the limit */ 184262306a36Sopenharmony_ci if (!stable_node_dup) 184362306a36Sopenharmony_ci stable_node_dup = stable_node_any; 184462306a36Sopenharmony_ci /* 184562306a36Sopenharmony_ci * If stable_node was a chain and chain_prune collapsed it, 184662306a36Sopenharmony_ci * stable_node has been updated to be the new regular 184762306a36Sopenharmony_ci * stable_node. A collapse of the chain is indistinguishable 184862306a36Sopenharmony_ci * from the case there was no chain in the stable 184962306a36Sopenharmony_ci * rbtree. Otherwise stable_node is the chain and 185062306a36Sopenharmony_ci * stable_node_dup is the dup to replace. 185162306a36Sopenharmony_ci */ 185262306a36Sopenharmony_ci if (stable_node_dup == stable_node) { 185362306a36Sopenharmony_ci VM_BUG_ON(is_stable_node_dup(stable_node_dup)); 185462306a36Sopenharmony_ci /* chain is missing so create it */ 185562306a36Sopenharmony_ci stable_node = alloc_stable_node_chain(stable_node_dup, 185662306a36Sopenharmony_ci root); 185762306a36Sopenharmony_ci if (!stable_node) 185862306a36Sopenharmony_ci return NULL; 185962306a36Sopenharmony_ci } 186062306a36Sopenharmony_ci /* 186162306a36Sopenharmony_ci * Add this stable_node dup that was 186262306a36Sopenharmony_ci * migrated to the stable_node chain 186362306a36Sopenharmony_ci * of the current nid for this page 186462306a36Sopenharmony_ci * content. 
 */
	VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
	VM_BUG_ON(page_node->head != &migrate_nodes);
	list_del(&page_node->list);
	DO_NUMA(page_node->nid = nid);
	stable_node_chain_add_dup(page_node, stable_node);
	goto out;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * @kpage: the newly created KSM page to index; callers lock it around
 *	   this call (see cmp_and_merge_page()).
 *
 * The tree to insert into is chosen by the page's NUMA node id, so with
 * !CONFIG_NUMA (or merge_across_nodes) everything lands in tree 0.
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise.
 */
static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
{
	int nid;
	unsigned long kpfn;
	struct rb_root *root;
	struct rb_node **new;
	struct rb_node *parent;
	struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
	/*
	 * Set when an existing node has identical content, so the new
	 * dup must join (or first create) that node's chain instead of
	 * being linked into the rbtree itself.
	 */
	bool need_chain = false;

	kpfn = page_to_pfn(kpage);
	nid = get_kpfn_nid(kpfn);
	root = root_stable_tree + nid;
again:
	parent = NULL;
	new = &root->rb_node;

	while (*new) {
		struct page *tree_page;
		int ret;

		cond_resched();
		stable_node = rb_entry(*new, struct ksm_stable_node, node);
		stable_node_any = NULL;
		tree_page = chain(&stable_node_dup, stable_node, root);
		if (!stable_node_dup) {
			/*
			 * Either all stable_node dups were full in
			 * this stable_node chain, or this chain was
			 * empty and should be rb_erased.
			 */
			stable_node_any = stable_node_dup_any(stable_node,
							      root);
			if (!stable_node_any) {
				/* rb_erase just run */
				goto again;
			}
			/*
			 * Take any of the stable_node dups page of
			 * this stable_node chain to let the tree walk
			 * continue. All KSM pages belonging to the
			 * stable_node dups in a stable_node chain
			 * have the same content and they're
			 * write protected at all times. Any will work
			 * fine to continue the walk.
			 */
			tree_page = get_ksm_page(stable_node_any,
						 GET_KSM_PAGE_NOLOCK);
		}
		/* Exactly one of stable_node_dup / stable_node_any is set. */
		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
		if (!tree_page) {
			/*
			 * If we walked over a stale stable_node,
			 * get_ksm_page() will call rb_erase() and it
			 * may rebalance the tree from under us. So
			 * restart the search from scratch. Returning
			 * NULL would be safe too, but we'd generate
			 * false negative insertions just because some
			 * stable_node was stale.
			 */
			goto again;
		}

		ret = memcmp_pages(kpage, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			need_chain = true;
			break;
		}
	}

	stable_node_dup = alloc_stable_node();
	if (!stable_node_dup)
		return NULL;

	INIT_HLIST_HEAD(&stable_node_dup->hlist);
	stable_node_dup->kpfn = kpfn;
	set_page_stable_node(kpage, stable_node_dup);
	stable_node_dup->rmap_hlist_len = 0;
	DO_NUMA(stable_node_dup->nid = nid);
	if (!need_chain) {
		/* No identical content found: link the dup into the rbtree. */
		rb_link_node(&stable_node_dup->node, parent, new);
		rb_insert_color(&stable_node_dup->node, root);
	} else {
		if (!is_stable_node_chain(stable_node)) {
			struct ksm_stable_node *orig = stable_node;
			/* chain is missing so create it */
			stable_node = alloc_stable_node_chain(orig, root);
			if (!stable_node) {
				free_stable_node(stable_node_dup);
				return NULL;
			}
		}
		stable_node_chain_add_dup(stable_node_dup, stable_node);
	}

	return stable_node_dup;
}

/*
 * unstable_tree_search_insert - search for identical page,
 * else insert rmap_item into the unstable tree.
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns pointer to rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static
struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
					      struct page *page,
					      struct page **tree_pagep)
{
	struct rb_node **new;
	struct rb_root *root;
	struct rb_node *parent = NULL;
	int nid;

	/* Like the stable tree, the unstable tree is selected per NUMA nid. */
	nid = get_kpfn_nid(page_to_pfn(page));
	root = root_unstable_tree + nid;
	new = &root->rb_node;

	while (*new) {
		struct ksm_rmap_item *tree_rmap_item;
		struct page *tree_page;
		int ret;

		cond_resched();
		tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
		tree_page = get_mergeable_page(tree_rmap_item);
		if (!tree_page)
			return NULL;

		/*
		 * Don't substitute a ksm page for a forked page.
		 */
		if (page == tree_page) {
			put_page(tree_page);
			return NULL;
		}

		ret = memcmp_pages(page, tree_page);

		parent = *new;
		if (ret < 0) {
			put_page(tree_page);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(tree_page);
			new = &parent->rb_right;
		} else if (!ksm_merge_across_nodes &&
			   page_to_nid(tree_page) != nid) {
			/*
			 * If tree_page has been migrated to another NUMA node,
			 * it will be flushed out and put in the right unstable
			 * tree next time: only merge with it when across_nodes.
			 */
			put_page(tree_page);
			return NULL;
		} else {
			/*
			 * Identical content found: hand back the matching
			 * rmap_item and (still referenced) page; the caller
			 * is responsible for the put_page().
			 */
			*tree_pagep = tree_page;
			return tree_rmap_item;
		}
	}

	/* No match: insert the scanned page at the spot we walked to. */
	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	DO_NUMA(rmap_item->nid = nid);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, root);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct ksm_rmap_item *rmap_item,
			       struct ksm_stable_node *stable_node,
			       bool max_page_sharing_bypass)
{
	/*
	 * rmap won't find this mapping if we don't insert the
	 * rmap_item in the right stable_node
	 * duplicate. page_migration could break later if rmap breaks,
	 * so we can as well crash here. We really need to check for
	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
	 * for other negative values as an underflow if detected here
	 * for the first time (and not when decreasing rmap_hlist_len)
	 * would be sign of memory corruption in the stable_node.
	 */
	BUG_ON(stable_node->rmap_hlist_len < 0);

	stable_node->rmap_hlist_len++;
	if (!max_page_sharing_bypass)
		/* possibly non fatal but unexpected overflow, only warn */
		WARN_ON_ONCE(stable_node->rmap_hlist_len >
			     ksm_max_page_sharing);

	rmap_item->head = stable_node;
	rmap_item->address |= STABLE_FLAG;
	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

	/*
	 * First rmap_item on the hlist counts as a shared page;
	 * every further one counts as an extra sharer.
	 */
	if (rmap_item->hlist.next)
		ksm_pages_sharing++;
	else
		ksm_pages_shared++;

	rmap_item->mm->ksm_merging_pages++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	struct ksm_rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct ksm_stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;
	bool max_page_sharing_bypass = false;

	stable_node = page_stable_node(page);
	if (stable_node) {
		/*
		 * If the KSM page migrated to another NUMA node, move its
		 * stable_node to the migrate_nodes list so it gets re-homed
		 * into the right per-nid tree on a later pass.
		 */
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
		    NUMA(stable_node->nid)) {
			stable_node_dup_del(stable_node);
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		/* Already merged and still attached to this node: nothing to do. */
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
		/*
		 * If it's a KSM fork, allow it to go over the sharing limit
		 * without warnings.
		 */
		if (!is_page_sharing_candidate(stable_node))
			max_page_sharing_bypass = true;
	}

	/* We first start with searching the page inside the stable tree */
	kpage = stable_tree_search(page);
	if (kpage == page && rmap_item->head == stable_node) {
		put_page(kpage);
		return;
	}

	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		/* stable_tree_search() may return ERR_PTR(-EBUSY): skip for now. */
		if (PTR_ERR(kpage) == -EBUSY)
			return;

		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			lock_page(kpage);
			stable_tree_append(rmap_item, page_stable_node(kpage),
					   max_page_sharing_bypass);
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/*
	 * If the hash value of the page has changed from the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it in the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		mmap_read_lock(mm);
		vma = find_mergeable_vma(mm, rmap_item->address);
		if (vma) {
			err = try_to_merge_one_page(vma, page,
					ZERO_PAGE(rmap_item->address));
			trace_ksm_merge_one_page(
				page_to_pfn(ZERO_PAGE(rmap_item->address)),
				rmap_item, mm, err);
		} else {
			/*
			 * If the vma is out of date, we do not need to
			 * continue.
			 */
			err = 0;
		}
		mmap_read_unlock(mm);
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		bool split;

		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		/*
		 * If both pages we tried to merge belong to the same compound
		 * page, then we actually ended up increasing the reference
		 * count of the same compound page twice, and split_huge_page
		 * failed.
		 * Here we set a flag if that happened, and we use it later to
		 * try split_huge_page again. Since we call put_page right
		 * afterwards, the reference count will be correct and
		 * split_huge_page should succeed.
		 */
		split = PageTransCompound(page)
			&& compound_head(page) == compound_head(tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * The pages were successfully merged: insert new
			 * node in the stable tree and add both rmap_items.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node,
						   false);
				stable_tree_append(rmap_item, stable_node,
						   false);
			}
			unlock_page(kpage);

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		} else if (split) {
			/*
			 * We are here if we tried to merge two pages and
			 * failed because they both belonged to the same
			 * compound page. We will split the page now, but no
			 * merging will take place.
			 * We do not want to add the cost of a full lock; if
			 * the page is locked, it is better to skip it and
			 * perhaps try again later.
			 */
			if (!trylock_page(page))
				return;
			split_huge_page(page);
			unlock_page(page);
		}
	}
}

/*
 * get_next_rmap_item - find or create the rmap_item tracking @addr.
 *
 * Walks the mm_slot's address-ordered rmap_list: items below @addr are
 * stale (their addresses were skipped this pass) and get removed and
 * freed; an exact match is returned; otherwise a fresh, zeroed item is
 * allocated and spliced in before the insertion point.  Returns NULL
 * only if allocation fails.
 */
static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
					    struct ksm_rmap_item **rmap_list,
					    unsigned long addr)
{
	struct ksm_rmap_item *rmap_item;

	while (*rmap_list) {
		rmap_item = *rmap_list;
		if ((rmap_item->address & PAGE_MASK) == addr)
			return rmap_item;
		if (rmap_item->address > addr)
			break;
		*rmap_list = rmap_item->rmap_list;
		remove_rmap_item_from_tree(rmap_item);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->slot.mm;
		rmap_item->mm->ksm_rmap_items++;
		rmap_item->address = addr;
		rmap_item->rmap_list = *rmap_list;
		*rmap_list = rmap_item;
	}
	return rmap_item;
}

static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
231162306a36Sopenharmony_ci struct ksm_mm_slot *mm_slot; 231262306a36Sopenharmony_ci struct mm_slot *slot; 231362306a36Sopenharmony_ci struct vm_area_struct *vma; 231462306a36Sopenharmony_ci struct ksm_rmap_item *rmap_item; 231562306a36Sopenharmony_ci struct vma_iterator vmi; 231662306a36Sopenharmony_ci int nid; 231762306a36Sopenharmony_ci 231862306a36Sopenharmony_ci if (list_empty(&ksm_mm_head.slot.mm_node)) 231962306a36Sopenharmony_ci return NULL; 232062306a36Sopenharmony_ci 232162306a36Sopenharmony_ci mm_slot = ksm_scan.mm_slot; 232262306a36Sopenharmony_ci if (mm_slot == &ksm_mm_head) { 232362306a36Sopenharmony_ci trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items); 232462306a36Sopenharmony_ci 232562306a36Sopenharmony_ci /* 232662306a36Sopenharmony_ci * A number of pages can hang around indefinitely in per-cpu 232762306a36Sopenharmony_ci * LRU cache, raised page count preventing write_protect_page 232862306a36Sopenharmony_ci * from merging them. Though it doesn't really matter much, 232962306a36Sopenharmony_ci * it is puzzling to see some stuck in pages_volatile until 233062306a36Sopenharmony_ci * other activity jostles them out, and they also prevented 233162306a36Sopenharmony_ci * LTP's KSM test from succeeding deterministically; so drain 233262306a36Sopenharmony_ci * them here (here rather than on entry to ksm_do_scan(), 233362306a36Sopenharmony_ci * so we don't IPI too often when pages_to_scan is set low). 233462306a36Sopenharmony_ci */ 233562306a36Sopenharmony_ci lru_add_drain_all(); 233662306a36Sopenharmony_ci 233762306a36Sopenharmony_ci /* 233862306a36Sopenharmony_ci * Whereas stale stable_nodes on the stable_tree itself 233962306a36Sopenharmony_ci * get pruned in the regular course of stable_tree_search(), 234062306a36Sopenharmony_ci * those moved out to the migrate_nodes list can accumulate: 234162306a36Sopenharmony_ci * so prune them once before each full scan. 
	 */
		if (!ksm_merge_across_nodes) {
			struct ksm_stable_node *stable_node, *next;
			struct page *page;

			/*
			 * NOTE(review): get_ksm_page() appears to be called
			 * here only for its validation side effect on each
			 * migrate_nodes entry; the reference it takes is
			 * dropped immediately — confirm against its contract.
			 */
			list_for_each_entry_safe(stable_node, next,
						 &migrate_nodes, list) {
				page = get_ksm_page(stable_node,
						    GET_KSM_PAGE_NOLOCK);
				if (page)
					put_page(page);
				cond_resched();
			}
		}

		/* Start every scan cycle with empty per-node unstable trees. */
		for (nid = 0; nid < ksm_nr_node_ids; nid++)
			root_unstable_tree[nid] = RB_ROOT;

		/* Advance the scan cursor to the first real mm on the list. */
		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(mm_slot->slot.mm_node.next,
				  struct mm_slot, mm_node);
		mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
		ksm_scan.mm_slot = mm_slot;
		spin_unlock(&ksm_mmlist_lock);
		/*
		 * Although we tested list_empty() above, a racing __ksm_exit
		 * of the last mm on the list may have removed it since then.
		 */
		if (mm_slot == &ksm_mm_head)
			return NULL;
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &mm_slot->rmap_list;
	}

	slot = &mm_slot->slot;
	mm = slot->mm;
	vma_iter_init(&vmi, mm, ksm_scan.address);

	mmap_read_lock(mm);
	if (ksm_test_exit(mm))
		goto no_vmas;

	for_each_vma(vmi, vma) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			/* No anon pages to merge: skip the whole vma. */
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (IS_ERR_OR_NULL(*page)) {
				ksm_scan.address += PAGE_SIZE;
				cond_resched();
				continue;
			}
			if (is_zone_device_page(*page))
				goto next_page;
			if (PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(mm_slot,
					ksm_scan.rmap_list, ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_list =
							&rmap_item->rmap_list;
					ksm_scan.address += PAGE_SIZE;
				} else
					/* No rmap_item: drop the page pin. */
					put_page(*page);
				/* Caller inherits the *page reference. */
				mmap_read_unlock(mm);
				return rmap_item;
			}
next_page:
			put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
no_vmas:
		ksm_scan.address = 0;
		ksm_scan.rmap_list = &mm_slot->rmap_list;
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(ksm_scan.rmap_list);

	spin_lock(&ksm_mmlist_lock);
	slot = list_entry(mm_slot->slot.mm_node.next,
			  struct mm_slot, mm_node);
	ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_lock
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_lock then protects against race with MADV_MERGEABLE).
		 */
		hash_del(&mm_slot->slot.hash);
		list_del(&mm_slot->slot.mm_node);
		spin_unlock(&ksm_mmlist_lock);

		mm_slot_free(mm_slot_cache, mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
		mmap_read_unlock(mm);
		mmdrop(mm);
	} else {
		mmap_read_unlock(mm);
		/*
		 * mmap_read_unlock(mm) first because after
		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
		 * already have been freed under us by __ksm_exit()
		 * because the "mm_slot" is still hashed and
		 * ksm_scan.mm_slot doesn't point to it anymore.
246762306a36Sopenharmony_ci */ 246862306a36Sopenharmony_ci spin_unlock(&ksm_mmlist_lock); 246962306a36Sopenharmony_ci } 247062306a36Sopenharmony_ci 247162306a36Sopenharmony_ci /* Repeat until we've completed scanning the whole list */ 247262306a36Sopenharmony_ci mm_slot = ksm_scan.mm_slot; 247362306a36Sopenharmony_ci if (mm_slot != &ksm_mm_head) 247462306a36Sopenharmony_ci goto next_mm; 247562306a36Sopenharmony_ci 247662306a36Sopenharmony_ci trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items); 247762306a36Sopenharmony_ci ksm_scan.seqnr++; 247862306a36Sopenharmony_ci return NULL; 247962306a36Sopenharmony_ci} 248062306a36Sopenharmony_ci 248162306a36Sopenharmony_ci/** 248262306a36Sopenharmony_ci * ksm_do_scan - the ksm scanner main worker function. 248362306a36Sopenharmony_ci * @scan_npages: number of pages we want to scan before we return. 248462306a36Sopenharmony_ci */ 248562306a36Sopenharmony_cistatic void ksm_do_scan(unsigned int scan_npages) 248662306a36Sopenharmony_ci{ 248762306a36Sopenharmony_ci struct ksm_rmap_item *rmap_item; 248862306a36Sopenharmony_ci struct page *page; 248962306a36Sopenharmony_ci unsigned int npages = scan_npages; 249062306a36Sopenharmony_ci 249162306a36Sopenharmony_ci while (npages-- && likely(!freezing(current))) { 249262306a36Sopenharmony_ci cond_resched(); 249362306a36Sopenharmony_ci rmap_item = scan_get_next_rmap_item(&page); 249462306a36Sopenharmony_ci if (!rmap_item) 249562306a36Sopenharmony_ci return; 249662306a36Sopenharmony_ci cmp_and_merge_page(page, rmap_item); 249762306a36Sopenharmony_ci put_page(page); 249862306a36Sopenharmony_ci } 249962306a36Sopenharmony_ci 250062306a36Sopenharmony_ci ksm_pages_scanned += scan_npages - npages; 250162306a36Sopenharmony_ci} 250262306a36Sopenharmony_ci 250362306a36Sopenharmony_cistatic int ksmd_should_run(void) 250462306a36Sopenharmony_ci{ 250562306a36Sopenharmony_ci return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); 250662306a36Sopenharmony_ci} 

/*
 * ksmd main loop: under ksm_thread_mutex, run one batch of scanning when
 * enabled, then either sleep for the configured interval (waking early if
 * the interval is retuned via sysfs) or block until there is work.
 */
static int ksm_scan_thread(void *nothing)
{
	unsigned int sleep_ms;

	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
			/* Wake early if sleep_millisecs is changed meanwhile. */
			wait_event_interruptible_timeout(ksm_iter_wait,
				sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
				msecs_to_jiffies(sleep_ms));
		} else {
			/* Nothing to do: sleep until enabled or stopped. */
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

/* Set VM_MERGEABLE on @vma if KSM-compatible; no-op when already set. */
static void __ksm_add_vma(struct vm_area_struct *vma)
{
	unsigned long vm_flags = vma->vm_flags;

	if (vm_flags & VM_MERGEABLE)
		return;

	if (vma_ksm_compatible(vma))
		vm_flags_set(vma, VM_MERGEABLE);
}

static int
__ksm_del_vma(struct vm_area_struct *vma)
{
	int err;

	if (!(vma->vm_flags & VM_MERGEABLE))
		return 0;

	/*
	 * Unmerge any KSM pages first; on failure VM_MERGEABLE is left set
	 * so the vma's state stays consistent with its contents.
	 */
	if (vma->anon_vma) {
		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
		if (err)
			return err;
	}

	vm_flags_clear(vma, VM_MERGEABLE);
	return 0;
}
/**
 * ksm_add_vma - Mark vma as mergeable if compatible
 *
 * @vma:  Pointer to vma
 */
void ksm_add_vma(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;

	/* Only acts when the whole mm opted in via MMF_VM_MERGE_ANY. */
	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		__ksm_add_vma(vma);
}

/* Apply __ksm_add_vma() to every vma of @mm. */
static void ksm_add_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	VMA_ITERATOR(vmi, mm, 0);
	for_each_vma(vmi, vma)
		__ksm_add_vma(vma);
}

/* Unmerge and clear VM_MERGEABLE on every vma; stops at the first error. */
static int ksm_del_vmas(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	int err;

	VMA_ITERATOR(vmi, mm, 0);
	for_each_vma(vmi, vma) {
		err = __ksm_del_vma(vma);
		if (err)
			return err;
	}
	return 0;
}

/**
 * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
 *                        compatible VMA's
 *
 * @mm:  Pointer to mm
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_enable_merge_any(struct mm_struct *mm)
{
	int err;

	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		return 0;

	/* Register the mm with ksmd first if it was never entered. */
	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
		err = __ksm_enter(mm);
		if (err)
			return err;
	}

	set_bit(MMF_VM_MERGE_ANY, &mm->flags);
	ksm_add_vmas(mm);

	return 0;
}

/**
 * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
 *			   previously enabled via ksm_enable_merge_any().
 *
 * Disabling merging implies unmerging any merged pages, like setting
 * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
 * merging on all compatible VMA's remains enabled.
 *
 * @mm: Pointer to mm
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_disable_merge_any(struct mm_struct *mm)
{
	int err;

	if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		return 0;

	err = ksm_del_vmas(mm);
	if (err) {
		/* Unmerge failed part-way: re-mark compatible vmas. */
		ksm_add_vmas(mm);
		return err;
	}

	clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
	return 0;
}

/* Fully disable KSM for @mm; caller must hold the mmap write lock. */
int ksm_disable(struct mm_struct *mm)
{
	mmap_assert_write_locked(mm);

	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
		return 0;
	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		return ksm_disable_merge_any(mm);
	return ksm_del_vmas(mm);
}

/*
 * madvise(MADV_MERGEABLE / MADV_UNMERGEABLE) entry point: toggles
 * VM_MERGEABLE via *vm_flags and registers the mm with ksmd on first use.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		if (vma->vm_flags & VM_MERGEABLE)
			return 0;
		if (!vma_ksm_compatible(vma))
			return 0;

		/* First mergeable vma of this mm: register with ksmd. */
		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end, true);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);

/*
 * Register @mm with ksmd: allocate its mm_slot, hash it, and queue it on
 * the scan list.  Holds a grab on the mm until ksmd or __ksm_exit frees it.
 */
int __ksm_enter(struct mm_struct *mm)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	int needs_wakeup;

	mm_slot = mm_slot_alloc(mm_slot_cache);
	if (!mm_slot)
		return -ENOMEM;

	slot = &mm_slot->slot;

	/* Check ksm_run too?
 Would need tighter locking */
	/* ksmd sleeps while the list is empty: remember whether to wake it. */
	needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);

	spin_lock(&ksm_mmlist_lock);
	mm_slot_insert(mm_slots_hash, mm, slot);
	/*
	 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
	 * insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 *
	 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
	 * scanning cursor, otherwise KSM pages in newly forked mms will be
	 * missed: then we might as well insert at the end of the list.
	 */
	if (ksm_run & KSM_RUN_UNMERGE)
		list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
	else
		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	/* Paired with the mmdrop() done when the mm_slot is freed. */
	mmgrab(mm);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	trace_ksm_enter(mm);
	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct ksm_mm_slot *mm_slot;
	struct mm_slot *slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_lock to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	slot = mm_slot_lookup(mm_slots_hash, mm);
	mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (!mm_slot->rmap_list) {
			/* No rmap_items: safe to unhash and free right away. */
			hash_del(&slot->hash);
			list_del(&slot->mm_node);
			easy_to_free = 1;
		} else {
			/* Hand the slot to ksmd, just behind its cursor. */
			list_move(&slot->mm_node,
				  &ksm_scan.mm_slot->slot.mm_node);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		mm_slot_free(mm_slot_cache, mm_slot);
		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * Empty lock/unlock pair: only waits out any concurrent
		 * mmap_lock holders before the pagetables are torn down.
		 */
		mmap_write_lock(mm);
		mmap_write_unlock(mm);
	}

	trace_ksm_exit(mm);
}

/*
 * Return @page itself when it can be reused as the anon page at @address,
 * otherwise allocate, charge and fill a private copy.  Returns the original
 * page, a new page, NULL (allocation/charge failure), or ERR_PTR(-EHWPOISON).
 */
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct folio *folio = page_folio(page);
	struct anon_vma *anon_vma = folio_anon_vma(folio);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page) &&
		    !(ksm_run & KSM_RUN_UNMERGE))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (page->index == linear_page_index(vma, address) &&
			anon_vma->root == vma->anon_vma->root) {
		return page;		/* still no need to copy it */
	}
	if (PageHWPoison(page))
		return ERR_PTR(-EHWPOISON);
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page &&
	    mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
		put_page(new_page);
		new_page = NULL;
	}
	if (new_page) {
		/* Machine-check aware copy: poison hit means give up. */
		if (copy_mc_user_highpage(new_page, page, address, vma)) {
			put_page(new_page);
			memory_failure_queue(page_to_pfn(page),
					     0);
			return ERR_PTR(-EHWPOISON);
		}
		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
#ifdef CONFIG_SWAP
		count_vm_event(KSM_SWPIN_COPY);
#endif
	}

	return new_page;
}

/*
 * Reverse-map walk over a KSM folio: visit every vma mapping it via the
 * rmap_items hanging off its stable node, honouring the rwc callbacks.
 */
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
	struct ksm_stable_node *stable_node;
	struct ksm_rmap_item *rmap_item;
	int search_new_forks = 0;

	VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);

	/*
	 * Rely on the page lock to protect against concurrent modifications
	 * to that page's node of the stable tree.
	 */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	stable_node = folio_stable_node(folio);
	if (!stable_node)
		return;
again:
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *anon_vma = rmap_item->anon_vma;
		struct anon_vma_chain *vmac;
		struct vm_area_struct *vma;

		cond_resched();
		/* try_lock callers bail out on contention instead of blocking. */
		if (!anon_vma_trylock_read(anon_vma)) {
			if (rwc->try_lock) {
				rwc->contended = true;
				return;
			}
			anon_vma_lock_read(anon_vma);
		}
		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
					       0, ULONG_MAX) {
			unsigned long addr;

			cond_resched();
			vma = vmac->vma;

			/* Ignore the stable/unstable/sqnr flags */
			addr = rmap_item->address & PAGE_MASK;

			if (addr < vma->vm_start || addr >= vma->vm_end)
				continue;
			/*
			 * Initially we examine only the vma which covers this
			 * rmap_item; but later, if there is still work to do,
			 * we examine covering vmas in other mms: in case they
			 * were forked from the original since ksmd passed.
			 */
			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
				continue;

			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
				continue;

			if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
			if (rwc->done && rwc->done(folio)) {
				anon_vma_unlock_read(anon_vma);
				return;
			}
		}
		anon_vma_unlock_read(anon_vma);
	}
	/* Second pass picks up the forked mms skipped on the first. */
	if (!search_new_forks++)
		goto again;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Collect processes when the error hit an ksm page.
 */
void collect_procs_ksm(struct page *page, struct list_head *to_kill,
		       int force_early)
{
	struct ksm_stable_node *stable_node;
	struct ksm_rmap_item *rmap_item;
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma;
	struct task_struct *tsk;

	stable_node = folio_stable_node(folio);
	if (!stable_node)
		return;
	/* Walk every rmap_item sharing this KSM page's stable node. */
	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
		struct anon_vma *av = rmap_item->anon_vma;

		anon_vma_lock_read(av);
		rcu_read_lock();
		for_each_process(tsk) {
			struct anon_vma_chain *vmac;
			unsigned long addr;
			struct task_struct *t =
				task_early_kill(tsk, force_early);
			if (!t)
				continue;
			anon_vma_interval_tree_foreach(vmac, &av->rb_root, 0,
						       ULONG_MAX)
			{
				vma = vmac->vma;
				/* Queue tasks whose mm maps the faulty address. */
				if (vma->vm_mm == t->mm) {
					addr = rmap_item->address & PAGE_MASK;
					add_to_kill_ksm(t, page, vma, to_kill,
							addr);
				}
			}
		}
		rcu_read_unlock();
		anon_vma_unlock_read(av);
	}
}
#endif

#ifdef CONFIG_MIGRATION
/* Transfer a migrating KSM folio's stable-node linkage to its replacement. */
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
	struct ksm_stable_node *stable_node;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
	VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);

	stable_node = folio_stable_node(folio);
	if (stable_node) {
		VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
		stable_node->kpfn = folio_pfn(newfolio);
		/*
		 * newfolio->mapping was set in advance; now we need smp_wmb()
		 * to make sure that the new stable_node->kpfn is visible
		 * to get_ksm_page() before it can see that folio->mapping
		 * has gone stale (or that folio_test_swapcache has been cleared).
		 */
		smp_wmb();
		set_page_stable_node(&folio->page, NULL);
	}
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Drop ksm_thread_mutex and block while a hot-remove is in flight, so the
 * offliner can take the mutex; retaken before returning to the scan loop.
 */
static void wait_while_offlining(void)
{
	while (ksm_run & KSM_RUN_OFFLINE) {
		mutex_unlock(&ksm_thread_mutex);
		wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
			    TASK_UNINTERRUPTIBLE);
		mutex_lock(&ksm_thread_mutex);
	}
}

/* Remove @stable_node if its kpfn lies in [start_pfn, end_pfn); report it. */
static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
					 unsigned long start_pfn,
					 unsigned long end_pfn)
{
	if (stable_node->kpfn >= start_pfn &&
	    stable_node->kpfn < end_pfn) {
		/*
		 * Don't get_ksm_page, page has already gone:
		 * which is why we keep kpfn instead of page*
		 */
		remove_node_from_stable_tree(stable_node);
		return true;
	}
	return false;
}

static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
					   unsigned long start_pfn,
					   unsigned long end_pfn,
					   struct rb_root *root)
{
	struct ksm_stable_node *dup;
	struct hlist_node *hlist_safe;

	if (!is_stable_node_chain(stable_node)) {
		VM_BUG_ON(is_stable_node_dup(stable_node));
		return stable_node_dup_remove_range(stable_node, start_pfn,
						    end_pfn);
	}

	/* A chain: prune each dup individually, then the chain if emptied. */
	hlist_for_each_entry_safe(dup, hlist_safe,
				  &stable_node->hlist, hlist_dup) {
		VM_BUG_ON(!is_stable_node_dup(dup));
		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
	}
	if (hlist_empty(&stable_node->hlist)) {
		free_stable_node_chain(stable_node, root);
		return true;	/* notify caller that tree was rebalanced */
	} else
		return false;
}

/* Purge every stable node whose page fell inside the offlined pfn range. */
static void ksm_check_stable_tree(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	struct ksm_stable_node *stable_node, *next;
	struct rb_node *node;
	int nid;

	for (nid = 0; nid < ksm_nr_node_ids; nid++) {
		node = rb_first(root_stable_tree + nid);
		while (node) {
			stable_node = rb_entry(node, struct ksm_stable_node, node);
			/* Restart from the first node after any rebalance. */
			if (stable_node_chain_remove_range(stable_node,
							   start_pfn, end_pfn,
							   root_stable_tree +
							   nid))
				node = rb_first(root_stable_tree + nid);
			else
				node = rb_next(node);
			cond_resched();
		}
	}
	list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
		if (stable_node->kpfn >= start_pfn &&
		    stable_node->kpfn < end_pfn)
			remove_node_from_stable_tree(stable_node);
		cond_resched();
	}
}

/* Memory-hotplug notifier: quiesce ksmd around offlining and prune after. */
static int ksm_memory_callback(struct notifier_block *self,
			       unsigned long action, void *arg)
{
	struct memory_notify *mn = arg;

	switch (action) {
	case MEM_GOING_OFFLINE:
		/*
		 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
		 * and remove_all_stable_nodes() while memory is going offline:
		 * it is unsafe for them to touch the stable tree at this time.
		 * But unmerge_ksm_pages(), rmap lookups and other entry points
		 * which do not need the ksm_thread_mutex are all safe.
		 */
		mutex_lock(&ksm_thread_mutex);
		ksm_run |= KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);
		break;

	case MEM_OFFLINE:
		/*
		 * Most of the work is done by page migration; but there might
		 * be a few stable_nodes left over, still pointing to struct
		 * pages which have been offlined: prune those from the tree,
		 * otherwise get_ksm_page() might later try to access a
		 * non-existent struct page.
		 */
		ksm_check_stable_tree(mn->start_pfn,
				      mn->start_pfn + mn->nr_pages);
		fallthrough;
	case MEM_CANCEL_OFFLINE:
		mutex_lock(&ksm_thread_mutex);
		ksm_run &= ~KSM_RUN_OFFLINE;
		mutex_unlock(&ksm_thread_mutex);

		smp_mb();	/* wake_up_bit advises this */
		wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
		break;
	}
	return NOTIFY_OK;
}
#else
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_PROC_FS
long ksm_process_profit(struct mm_struct *mm)
{
	return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE -
311162306a36Sopenharmony_ci mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); 311262306a36Sopenharmony_ci} 311362306a36Sopenharmony_ci#endif /* CONFIG_PROC_FS */ 311462306a36Sopenharmony_ci 311562306a36Sopenharmony_ci#ifdef CONFIG_SYSFS 311662306a36Sopenharmony_ci/* 311762306a36Sopenharmony_ci * This all compiles without CONFIG_SYSFS, but is a waste of space. 311862306a36Sopenharmony_ci */ 311962306a36Sopenharmony_ci 312062306a36Sopenharmony_ci#define KSM_ATTR_RO(_name) \ 312162306a36Sopenharmony_ci static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 312262306a36Sopenharmony_ci#define KSM_ATTR(_name) \ 312362306a36Sopenharmony_ci static struct kobj_attribute _name##_attr = __ATTR_RW(_name) 312462306a36Sopenharmony_ci 312562306a36Sopenharmony_cistatic ssize_t sleep_millisecs_show(struct kobject *kobj, 312662306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 312762306a36Sopenharmony_ci{ 312862306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); 312962306a36Sopenharmony_ci} 313062306a36Sopenharmony_ci 313162306a36Sopenharmony_cistatic ssize_t sleep_millisecs_store(struct kobject *kobj, 313262306a36Sopenharmony_ci struct kobj_attribute *attr, 313362306a36Sopenharmony_ci const char *buf, size_t count) 313462306a36Sopenharmony_ci{ 313562306a36Sopenharmony_ci unsigned int msecs; 313662306a36Sopenharmony_ci int err; 313762306a36Sopenharmony_ci 313862306a36Sopenharmony_ci err = kstrtouint(buf, 10, &msecs); 313962306a36Sopenharmony_ci if (err) 314062306a36Sopenharmony_ci return -EINVAL; 314162306a36Sopenharmony_ci 314262306a36Sopenharmony_ci ksm_thread_sleep_millisecs = msecs; 314362306a36Sopenharmony_ci wake_up_interruptible(&ksm_iter_wait); 314462306a36Sopenharmony_ci 314562306a36Sopenharmony_ci return count; 314662306a36Sopenharmony_ci} 314762306a36Sopenharmony_ciKSM_ATTR(sleep_millisecs); 314862306a36Sopenharmony_ci 314962306a36Sopenharmony_cistatic ssize_t pages_to_scan_show(struct kobject *kobj, 
315062306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 315162306a36Sopenharmony_ci{ 315262306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); 315362306a36Sopenharmony_ci} 315462306a36Sopenharmony_ci 315562306a36Sopenharmony_cistatic ssize_t pages_to_scan_store(struct kobject *kobj, 315662306a36Sopenharmony_ci struct kobj_attribute *attr, 315762306a36Sopenharmony_ci const char *buf, size_t count) 315862306a36Sopenharmony_ci{ 315962306a36Sopenharmony_ci unsigned int nr_pages; 316062306a36Sopenharmony_ci int err; 316162306a36Sopenharmony_ci 316262306a36Sopenharmony_ci err = kstrtouint(buf, 10, &nr_pages); 316362306a36Sopenharmony_ci if (err) 316462306a36Sopenharmony_ci return -EINVAL; 316562306a36Sopenharmony_ci 316662306a36Sopenharmony_ci ksm_thread_pages_to_scan = nr_pages; 316762306a36Sopenharmony_ci 316862306a36Sopenharmony_ci return count; 316962306a36Sopenharmony_ci} 317062306a36Sopenharmony_ciKSM_ATTR(pages_to_scan); 317162306a36Sopenharmony_ci 317262306a36Sopenharmony_cistatic ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 317362306a36Sopenharmony_ci char *buf) 317462306a36Sopenharmony_ci{ 317562306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_run); 317662306a36Sopenharmony_ci} 317762306a36Sopenharmony_ci 317862306a36Sopenharmony_cistatic ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 317962306a36Sopenharmony_ci const char *buf, size_t count) 318062306a36Sopenharmony_ci{ 318162306a36Sopenharmony_ci unsigned int flags; 318262306a36Sopenharmony_ci int err; 318362306a36Sopenharmony_ci 318462306a36Sopenharmony_ci err = kstrtouint(buf, 10, &flags); 318562306a36Sopenharmony_ci if (err) 318662306a36Sopenharmony_ci return -EINVAL; 318762306a36Sopenharmony_ci if (flags > KSM_RUN_UNMERGE) 318862306a36Sopenharmony_ci return -EINVAL; 318962306a36Sopenharmony_ci 319062306a36Sopenharmony_ci /* 319162306a36Sopenharmony_ci * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. 
319262306a36Sopenharmony_ci * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, 319362306a36Sopenharmony_ci * breaking COW to free the pages_shared (but leaves mm_slots 319462306a36Sopenharmony_ci * on the list for when ksmd may be set running again). 319562306a36Sopenharmony_ci */ 319662306a36Sopenharmony_ci 319762306a36Sopenharmony_ci mutex_lock(&ksm_thread_mutex); 319862306a36Sopenharmony_ci wait_while_offlining(); 319962306a36Sopenharmony_ci if (ksm_run != flags) { 320062306a36Sopenharmony_ci ksm_run = flags; 320162306a36Sopenharmony_ci if (flags & KSM_RUN_UNMERGE) { 320262306a36Sopenharmony_ci set_current_oom_origin(); 320362306a36Sopenharmony_ci err = unmerge_and_remove_all_rmap_items(); 320462306a36Sopenharmony_ci clear_current_oom_origin(); 320562306a36Sopenharmony_ci if (err) { 320662306a36Sopenharmony_ci ksm_run = KSM_RUN_STOP; 320762306a36Sopenharmony_ci count = err; 320862306a36Sopenharmony_ci } 320962306a36Sopenharmony_ci } 321062306a36Sopenharmony_ci } 321162306a36Sopenharmony_ci mutex_unlock(&ksm_thread_mutex); 321262306a36Sopenharmony_ci 321362306a36Sopenharmony_ci if (flags & KSM_RUN_MERGE) 321462306a36Sopenharmony_ci wake_up_interruptible(&ksm_thread_wait); 321562306a36Sopenharmony_ci 321662306a36Sopenharmony_ci return count; 321762306a36Sopenharmony_ci} 321862306a36Sopenharmony_ciKSM_ATTR(run); 321962306a36Sopenharmony_ci 322062306a36Sopenharmony_ci#ifdef CONFIG_NUMA 322162306a36Sopenharmony_cistatic ssize_t merge_across_nodes_show(struct kobject *kobj, 322262306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 322362306a36Sopenharmony_ci{ 322462306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); 322562306a36Sopenharmony_ci} 322662306a36Sopenharmony_ci 322762306a36Sopenharmony_cistatic ssize_t merge_across_nodes_store(struct kobject *kobj, 322862306a36Sopenharmony_ci struct kobj_attribute *attr, 322962306a36Sopenharmony_ci const char *buf, size_t count) 323062306a36Sopenharmony_ci{ 
323162306a36Sopenharmony_ci int err; 323262306a36Sopenharmony_ci unsigned long knob; 323362306a36Sopenharmony_ci 323462306a36Sopenharmony_ci err = kstrtoul(buf, 10, &knob); 323562306a36Sopenharmony_ci if (err) 323662306a36Sopenharmony_ci return err; 323762306a36Sopenharmony_ci if (knob > 1) 323862306a36Sopenharmony_ci return -EINVAL; 323962306a36Sopenharmony_ci 324062306a36Sopenharmony_ci mutex_lock(&ksm_thread_mutex); 324162306a36Sopenharmony_ci wait_while_offlining(); 324262306a36Sopenharmony_ci if (ksm_merge_across_nodes != knob) { 324362306a36Sopenharmony_ci if (ksm_pages_shared || remove_all_stable_nodes()) 324462306a36Sopenharmony_ci err = -EBUSY; 324562306a36Sopenharmony_ci else if (root_stable_tree == one_stable_tree) { 324662306a36Sopenharmony_ci struct rb_root *buf; 324762306a36Sopenharmony_ci /* 324862306a36Sopenharmony_ci * This is the first time that we switch away from the 324962306a36Sopenharmony_ci * default of merging across nodes: must now allocate 325062306a36Sopenharmony_ci * a buffer to hold as many roots as may be needed. 325162306a36Sopenharmony_ci * Allocate stable and unstable together: 325262306a36Sopenharmony_ci * MAXSMP NODES_SHIFT 10 will use 16kB. 
325362306a36Sopenharmony_ci */ 325462306a36Sopenharmony_ci buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), 325562306a36Sopenharmony_ci GFP_KERNEL); 325662306a36Sopenharmony_ci /* Let us assume that RB_ROOT is NULL is zero */ 325762306a36Sopenharmony_ci if (!buf) 325862306a36Sopenharmony_ci err = -ENOMEM; 325962306a36Sopenharmony_ci else { 326062306a36Sopenharmony_ci root_stable_tree = buf; 326162306a36Sopenharmony_ci root_unstable_tree = buf + nr_node_ids; 326262306a36Sopenharmony_ci /* Stable tree is empty but not the unstable */ 326362306a36Sopenharmony_ci root_unstable_tree[0] = one_unstable_tree[0]; 326462306a36Sopenharmony_ci } 326562306a36Sopenharmony_ci } 326662306a36Sopenharmony_ci if (!err) { 326762306a36Sopenharmony_ci ksm_merge_across_nodes = knob; 326862306a36Sopenharmony_ci ksm_nr_node_ids = knob ? 1 : nr_node_ids; 326962306a36Sopenharmony_ci } 327062306a36Sopenharmony_ci } 327162306a36Sopenharmony_ci mutex_unlock(&ksm_thread_mutex); 327262306a36Sopenharmony_ci 327362306a36Sopenharmony_ci return err ? 
err : count; 327462306a36Sopenharmony_ci} 327562306a36Sopenharmony_ciKSM_ATTR(merge_across_nodes); 327662306a36Sopenharmony_ci#endif 327762306a36Sopenharmony_ci 327862306a36Sopenharmony_cistatic ssize_t use_zero_pages_show(struct kobject *kobj, 327962306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 328062306a36Sopenharmony_ci{ 328162306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); 328262306a36Sopenharmony_ci} 328362306a36Sopenharmony_cistatic ssize_t use_zero_pages_store(struct kobject *kobj, 328462306a36Sopenharmony_ci struct kobj_attribute *attr, 328562306a36Sopenharmony_ci const char *buf, size_t count) 328662306a36Sopenharmony_ci{ 328762306a36Sopenharmony_ci int err; 328862306a36Sopenharmony_ci bool value; 328962306a36Sopenharmony_ci 329062306a36Sopenharmony_ci err = kstrtobool(buf, &value); 329162306a36Sopenharmony_ci if (err) 329262306a36Sopenharmony_ci return -EINVAL; 329362306a36Sopenharmony_ci 329462306a36Sopenharmony_ci ksm_use_zero_pages = value; 329562306a36Sopenharmony_ci 329662306a36Sopenharmony_ci return count; 329762306a36Sopenharmony_ci} 329862306a36Sopenharmony_ciKSM_ATTR(use_zero_pages); 329962306a36Sopenharmony_ci 330062306a36Sopenharmony_cistatic ssize_t max_page_sharing_show(struct kobject *kobj, 330162306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 330262306a36Sopenharmony_ci{ 330362306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); 330462306a36Sopenharmony_ci} 330562306a36Sopenharmony_ci 330662306a36Sopenharmony_cistatic ssize_t max_page_sharing_store(struct kobject *kobj, 330762306a36Sopenharmony_ci struct kobj_attribute *attr, 330862306a36Sopenharmony_ci const char *buf, size_t count) 330962306a36Sopenharmony_ci{ 331062306a36Sopenharmony_ci int err; 331162306a36Sopenharmony_ci int knob; 331262306a36Sopenharmony_ci 331362306a36Sopenharmony_ci err = kstrtoint(buf, 10, &knob); 331462306a36Sopenharmony_ci if (err) 331562306a36Sopenharmony_ci return err; 
331662306a36Sopenharmony_ci /* 331762306a36Sopenharmony_ci * When a KSM page is created it is shared by 2 mappings. This 331862306a36Sopenharmony_ci * being a signed comparison, it implicitly verifies it's not 331962306a36Sopenharmony_ci * negative. 332062306a36Sopenharmony_ci */ 332162306a36Sopenharmony_ci if (knob < 2) 332262306a36Sopenharmony_ci return -EINVAL; 332362306a36Sopenharmony_ci 332462306a36Sopenharmony_ci if (READ_ONCE(ksm_max_page_sharing) == knob) 332562306a36Sopenharmony_ci return count; 332662306a36Sopenharmony_ci 332762306a36Sopenharmony_ci mutex_lock(&ksm_thread_mutex); 332862306a36Sopenharmony_ci wait_while_offlining(); 332962306a36Sopenharmony_ci if (ksm_max_page_sharing != knob) { 333062306a36Sopenharmony_ci if (ksm_pages_shared || remove_all_stable_nodes()) 333162306a36Sopenharmony_ci err = -EBUSY; 333262306a36Sopenharmony_ci else 333362306a36Sopenharmony_ci ksm_max_page_sharing = knob; 333462306a36Sopenharmony_ci } 333562306a36Sopenharmony_ci mutex_unlock(&ksm_thread_mutex); 333662306a36Sopenharmony_ci 333762306a36Sopenharmony_ci return err ? 
err : count; 333862306a36Sopenharmony_ci} 333962306a36Sopenharmony_ciKSM_ATTR(max_page_sharing); 334062306a36Sopenharmony_ci 334162306a36Sopenharmony_cistatic ssize_t pages_scanned_show(struct kobject *kobj, 334262306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 334362306a36Sopenharmony_ci{ 334462306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); 334562306a36Sopenharmony_ci} 334662306a36Sopenharmony_ciKSM_ATTR_RO(pages_scanned); 334762306a36Sopenharmony_ci 334862306a36Sopenharmony_cistatic ssize_t pages_shared_show(struct kobject *kobj, 334962306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 335062306a36Sopenharmony_ci{ 335162306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_pages_shared); 335262306a36Sopenharmony_ci} 335362306a36Sopenharmony_ciKSM_ATTR_RO(pages_shared); 335462306a36Sopenharmony_ci 335562306a36Sopenharmony_cistatic ssize_t pages_sharing_show(struct kobject *kobj, 335662306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 335762306a36Sopenharmony_ci{ 335862306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); 335962306a36Sopenharmony_ci} 336062306a36Sopenharmony_ciKSM_ATTR_RO(pages_sharing); 336162306a36Sopenharmony_ci 336262306a36Sopenharmony_cistatic ssize_t pages_unshared_show(struct kobject *kobj, 336362306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 336462306a36Sopenharmony_ci{ 336562306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); 336662306a36Sopenharmony_ci} 336762306a36Sopenharmony_ciKSM_ATTR_RO(pages_unshared); 336862306a36Sopenharmony_ci 336962306a36Sopenharmony_cistatic ssize_t pages_volatile_show(struct kobject *kobj, 337062306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 337162306a36Sopenharmony_ci{ 337262306a36Sopenharmony_ci long ksm_pages_volatile; 337362306a36Sopenharmony_ci 337462306a36Sopenharmony_ci ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared 337562306a36Sopenharmony_ci - 
ksm_pages_sharing - ksm_pages_unshared; 337662306a36Sopenharmony_ci /* 337762306a36Sopenharmony_ci * It was not worth any locking to calculate that statistic, 337862306a36Sopenharmony_ci * but it might therefore sometimes be negative: conceal that. 337962306a36Sopenharmony_ci */ 338062306a36Sopenharmony_ci if (ksm_pages_volatile < 0) 338162306a36Sopenharmony_ci ksm_pages_volatile = 0; 338262306a36Sopenharmony_ci return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); 338362306a36Sopenharmony_ci} 338462306a36Sopenharmony_ciKSM_ATTR_RO(pages_volatile); 338562306a36Sopenharmony_ci 338662306a36Sopenharmony_cistatic ssize_t ksm_zero_pages_show(struct kobject *kobj, 338762306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 338862306a36Sopenharmony_ci{ 338962306a36Sopenharmony_ci return sysfs_emit(buf, "%ld\n", ksm_zero_pages); 339062306a36Sopenharmony_ci} 339162306a36Sopenharmony_ciKSM_ATTR_RO(ksm_zero_pages); 339262306a36Sopenharmony_ci 339362306a36Sopenharmony_cistatic ssize_t general_profit_show(struct kobject *kobj, 339462306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 339562306a36Sopenharmony_ci{ 339662306a36Sopenharmony_ci long general_profit; 339762306a36Sopenharmony_ci 339862306a36Sopenharmony_ci general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE - 339962306a36Sopenharmony_ci ksm_rmap_items * sizeof(struct ksm_rmap_item); 340062306a36Sopenharmony_ci 340162306a36Sopenharmony_ci return sysfs_emit(buf, "%ld\n", general_profit); 340262306a36Sopenharmony_ci} 340362306a36Sopenharmony_ciKSM_ATTR_RO(general_profit); 340462306a36Sopenharmony_ci 340562306a36Sopenharmony_cistatic ssize_t stable_node_dups_show(struct kobject *kobj, 340662306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 340762306a36Sopenharmony_ci{ 340862306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); 340962306a36Sopenharmony_ci} 341062306a36Sopenharmony_ciKSM_ATTR_RO(stable_node_dups); 341162306a36Sopenharmony_ci 
341262306a36Sopenharmony_cistatic ssize_t stable_node_chains_show(struct kobject *kobj, 341362306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 341462306a36Sopenharmony_ci{ 341562306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); 341662306a36Sopenharmony_ci} 341762306a36Sopenharmony_ciKSM_ATTR_RO(stable_node_chains); 341862306a36Sopenharmony_ci 341962306a36Sopenharmony_cistatic ssize_t 342062306a36Sopenharmony_cistable_node_chains_prune_millisecs_show(struct kobject *kobj, 342162306a36Sopenharmony_ci struct kobj_attribute *attr, 342262306a36Sopenharmony_ci char *buf) 342362306a36Sopenharmony_ci{ 342462306a36Sopenharmony_ci return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); 342562306a36Sopenharmony_ci} 342662306a36Sopenharmony_ci 342762306a36Sopenharmony_cistatic ssize_t 342862306a36Sopenharmony_cistable_node_chains_prune_millisecs_store(struct kobject *kobj, 342962306a36Sopenharmony_ci struct kobj_attribute *attr, 343062306a36Sopenharmony_ci const char *buf, size_t count) 343162306a36Sopenharmony_ci{ 343262306a36Sopenharmony_ci unsigned int msecs; 343362306a36Sopenharmony_ci int err; 343462306a36Sopenharmony_ci 343562306a36Sopenharmony_ci err = kstrtouint(buf, 10, &msecs); 343662306a36Sopenharmony_ci if (err) 343762306a36Sopenharmony_ci return -EINVAL; 343862306a36Sopenharmony_ci 343962306a36Sopenharmony_ci ksm_stable_node_chains_prune_millisecs = msecs; 344062306a36Sopenharmony_ci 344162306a36Sopenharmony_ci return count; 344262306a36Sopenharmony_ci} 344362306a36Sopenharmony_ciKSM_ATTR(stable_node_chains_prune_millisecs); 344462306a36Sopenharmony_ci 344562306a36Sopenharmony_cistatic ssize_t full_scans_show(struct kobject *kobj, 344662306a36Sopenharmony_ci struct kobj_attribute *attr, char *buf) 344762306a36Sopenharmony_ci{ 344862306a36Sopenharmony_ci return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); 344962306a36Sopenharmony_ci} 345062306a36Sopenharmony_ciKSM_ATTR_RO(full_scans); 

/* Everything exported under /sys/kernel/mm/ksm/ */
static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&pages_scanned_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&ksm_zero_pages_attr.attr,
	&full_scans_attr.attr,
#ifdef CONFIG_NUMA
	&merge_across_nodes_attr.attr,
#endif
	&max_page_sharing_attr.attr,
	&stable_node_chains_attr.attr,
	&stable_node_dups_attr.attr,
	&stable_node_chains_prune_millisecs_attr.attr,
	&use_zero_pages_attr.attr,
	&general_profit_attr.attr,
	NULL,
};

static const struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
#endif /* CONFIG_SYSFS */

/*
 * Boot-time initialisation: set up slab caches, start the ksmd scanner
 * thread, register the sysfs interface (or default ksm_run to MERGE
 * when there is no sysfs to start it), and hook memory hot-remove.
 */
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	err = ksm_slab_init();
	if (err)
		goto out;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority (KSM_CALLBACK_PRI) */
	hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}
subsys_initcall(ksm_init);