/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_rwsem	(while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     mapping->invalidate_lock (in filemap_fault)
 *       page->flags PG_locked (lock_page)
 *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
 *           vma_start_write
 *             mapping->i_mmap_rwsem
 *               anon_vma->rwsem
 *                 mm->page_table_lock or pte_lock
 *                   swap_lock (in swap_duplicate, swap_info_get)
 *                     mmlist_lock (in mmput, drain_mmlist and others)
 *                     mapping->private_lock (in block_dirty_folio)
 *                       folio_lock_memcg move_lock (in block_dirty_folio)
 *                         i_pages lock (widely used)
 *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
 *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                       sb_lock (within inode_lock in fs/fs-writeback.c)
 *                       i_pages lock (widely used, in set_page_dirty,
 *                                 in arch-dependent flush_dcache_mmap_lock,
 *                                 within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * hugetlbfs PageHuge() take locks in this order:
 *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *     vma_lock (hugetlb specific lock for pmd_sharing)
 *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
 *         page->flags PG_locked (lock_page)
 */
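
/*
 * Illustrative sketch (editorial addition, not code from this file): a
 * typical file-backed rmap walk nests a subset of the locks above in the
 * documented order. The helpers named below are real, but the fragment is
 * only an assumption-laden outline of a caller:
 *
 *	i_mmap_lock_read(mapping);		// mapping->i_mmap_rwsem
 *	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 *		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 *		...				// pte_lock nests inside
 *		pte_unmap_unlock(pte, ptl);
 *	}
 *	i_mmap_unlock_read(mapping);
 */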

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>
#include <linux/mm_purgeable.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include <trace/events/migrate.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	struct anon_vma *anon_vma;

	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
		anon_vma->num_children = 0;
		anon_vma->num_active_vmas = 0;
		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parent's anon_vma.
		 */
		anon_vma->root = anon_vma;
	}

	return anon_vma;
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	VM_BUG_ON(atomic_read(&anon_vma->refcount));

	/*
	 * Synchronize against folio_lock_anon_vma_read() such that
	 * we can safely hold the lock without the anon_vma getting
	 * freed.
	 *
	 * Relies on the full mb implied by the atomic_dec_and_test() from
	 * put_anon_vma() against the acquire barrier implied by
	 * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
	 *
	 * folio_lock_anon_vma_read()	VS	put_anon_vma()
	 *   down_read_trylock()		  atomic_dec_and_test()
	 *   LOCK				  MB
	 *   atomic_read()			  rwsem_is_locked()
	 *
	 * LOCK should suffice since the actual taking of the lock must
	 * happen _before_ what follows.
	 */
	might_sleep();
	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
		anon_vma_lock_write(anon_vma);
		anon_vma_unlock_write(anon_vma);
	}

	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
}

static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

static void anon_vma_chain_link(struct vm_area_struct *vma,
				struct anon_vma_chain *avc,
				struct anon_vma *anon_vma)
{
	avc->vma = vma;
	avc->anon_vma = anon_vma;
	list_add(&avc->same_vma, &vma->anon_vma_chain);
	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * This makes sure the memory mapping described by 'vma' has
 * an 'anon_vma' attached to it, so that we can associate the
 * anonymous pages mapped into it with that anon_vma.
 *
 * The common case will be that we already have one, which
 * is handled inline by anon_vma_prepare(). But if
 * not we either need to find an adjacent mapping that we
 * can re-use the anon_vma from (very common when the only
 * reason for splitting a vma has been mprotect()), or we
 * allocate a new one.
 *
 * Anon-vma allocations are very subtle, because we may have
 * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
 * and that may actually touch the rwsem even in the newly
 * allocated vma (it depends on RCU to make sure that the
 * anon_vma isn't actually destroyed).
 *
 * As a result, we need to do proper anon_vma locking even
 * for the new allocation. At the same time, we do not want
 * to do any locking for the common case of already having
 * an anon_vma.
 *
 * This must be called with the mmap_lock held for reading.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;
	struct anon_vma *anon_vma, *allocated;
	struct anon_vma_chain *avc;

	might_sleep();

	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_enomem;

	anon_vma = find_mergeable_anon_vma(vma);
	allocated = NULL;
	if (!anon_vma) {
		anon_vma = anon_vma_alloc();
		if (unlikely(!anon_vma))
			goto out_enomem_free_avc;
		anon_vma->num_children++; /* self-parent link for new root */
		allocated = anon_vma;
	}

	anon_vma_lock_write(anon_vma);
	/* page_table_lock to protect against threads */
	spin_lock(&mm->page_table_lock);
	if (likely(!vma->anon_vma)) {
		vma->anon_vma = anon_vma;
		anon_vma_chain_link(vma, avc, anon_vma);
		anon_vma->num_active_vmas++;
		allocated = NULL;
		avc = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	anon_vma_unlock_write(anon_vma);

	if (unlikely(allocated))
		put_anon_vma(allocated);
	if (unlikely(avc))
		anon_vma_chain_free(avc);

	return 0;

 out_enomem_free_avc:
	anon_vma_chain_free(avc);
 out_enomem:
	return -ENOMEM;
}
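
/*
 * Illustrative sketch (editorial addition, an assumption rather than code
 * from this file): anonymous fault paths call the anon_vma_prepare() wrapper
 * before installing a new anonymous PTE, so the slow path above only runs
 * while vma->anon_vma is still NULL, roughly:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	folio = ...allocate and zero the new anonymous folio...;
 *	folio_add_new_anon_rmap(folio, vma, addr);
 *
 * folio_add_new_anon_rmap() is a real helper elsewhere in rmap; its use here
 * only illustrates the expected calling order.
 */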

/*
 * This is a useful helper function for locking the anon_vma root as
 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
 * have the same vma.
 *
 * Such anon_vma's should have the same root, so you'd expect to see
 * just a single mutex_lock for the whole traversal.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
	struct anon_vma *new_root = anon_vma->root;
	if (new_root != root) {
		if (WARN_ON_ONCE(root))
			up_write(&root->rwsem);
		root = new_root;
		down_write(&root->rwsem);
	}
	return root;
}

static inline void unlock_anon_vma_root(struct anon_vma *root)
{
	if (root)
		up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
 * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
 * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
 * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
 * the call, we can identify this case by checking (!dst->anon_vma &&
 * src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse an existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of the anon_vma hierarchy to an endless linear
 * chain in the case of a constantly forking task. On the other hand, an
 * anon_vma with more than one child isn't reused even if there was no live
 * vma, thus the rmap walker has a good chance of avoiding scanning the whole
 * hierarchy when it searches where the page is mapped.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
	struct anon_vma_chain *avc, *pavc;
	struct anon_vma *root = NULL;

	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma;

		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
		if (unlikely(!avc)) {
			unlock_anon_vma_root(root);
			root = NULL;
			avc = anon_vma_chain_alloc(GFP_KERNEL);
			if (!avc)
				goto enomem_failure;
		}
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);

		/*
		 * Reuse existing anon_vma if it has no vma and only one
		 * anon_vma child.
		 *
		 * Root anon_vma is never reused:
		 * it has self-parent reference and at least one child.
		 */
		if (!dst->anon_vma && src->anon_vma &&
		    anon_vma->num_children < 2 &&
		    anon_vma->num_active_vmas == 0)
			dst->anon_vma = anon_vma;
	}
	if (dst->anon_vma)
		dst->anon_vma->num_active_vmas++;
	unlock_anon_vma_root(root);
	return 0;

 enomem_failure:
	/*
	 * dst->anon_vma is dropped here otherwise its num_active_vmas can
	 * be incorrectly decremented in unlink_anon_vmas().
	 * We can safely do this because callers of anon_vma_clone() don't care
	 * about dst->anon_vma if anon_vma_clone() failed.
	 */
	dst->anon_vma = NULL;
	unlink_anon_vmas(dst);
	return -ENOMEM;
}

/*
 * Attach vma to its own anon_vma, as well as to the anon_vmas that
 * the corresponding VMA in the parent process is attached to.
 * Returns 0 on success, non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *anon_vma;
	int error;

	/* Don't bother if the parent process has no anon_vma here. */
	if (!pvma->anon_vma)
		return 0;

	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
	vma->anon_vma = NULL;

	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
	 */
	error = anon_vma_clone(vma, pvma);
	if (error)
		return error;

	/* An existing anon_vma has been reused, all done then. */
	if (vma->anon_vma)
		return 0;

	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
		goto out_error;
	anon_vma->num_active_vmas++;
	avc = anon_vma_chain_alloc(GFP_KERNEL);
	if (!avc)
		goto out_error_free_anon_vma;

	/*
	 * The root anon_vma's rwsem is the lock actually used when we
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
	 * this anon_vma is freed, because the lock lives in the root.
	 */
	get_anon_vma(anon_vma->root);
	/* Mark this anon_vma as the one where our new (COWed) pages go. */
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
	anon_vma->parent->num_children++;
	anon_vma_unlock_write(anon_vma);

	return 0;

 out_error_free_anon_vma:
	put_anon_vma(anon_vma);
 out_error:
	unlink_anon_vmas(vma);
	return -ENOMEM;
}
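
/*
 * Illustrative sketch (editorial addition, an assumption rather than code
 * from this file): fork's VMA duplication loop pairs the two functions above
 * roughly like this, so every child VMA ends up linked to the parent's
 * anon_vmas (for pages still shared after fork) plus one anon_vma of its own
 * for new COW copies:
 *
 *	tmp = vm_area_dup(mpnt);
 *	if (!tmp)
 *		goto fail_nomem;
 *	if (anon_vma_fork(tmp, mpnt))
 *		goto fail_nomem_anon_vma_fork;	// unwind and return -ENOMEM
 *
 * The label names only indicate the error unwinding, not exact code.
 */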

void unlink_anon_vmas(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc, *next;
	struct anon_vma *root = NULL;

	/*
	 * Unlink each anon_vma chained to the VMA. This list is ordered
	 * from newest to oldest, ensuring the root anon_vma gets freed last.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);

		/*
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
			anon_vma->parent->num_children--;
			continue;
		}

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
	if (vma->anon_vma) {
		vma->anon_vma->num_active_vmas--;

		/*
		 * vma would still be needed after unlink, and the anon_vma
		 * will be prepared again when handling a fault.
		 */
		vma->anon_vma = NULL;
	}
	unlock_anon_vma_root(root);

	/*
	 * Iterate the list once more, it now only contains empty and unlinked
	 * anon_vmas, destroy them. We could not do that earlier because
	 * __put_anon_vma() needs to write-acquire the anon_vma->root->rwsem.
	 */
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;

		VM_WARN_ON(anon_vma->num_children);
		VM_WARN_ON(anon_vma->num_active_vmas);
		put_anon_vma(anon_vma);

		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
}

static void anon_vma_ctor(void *data)
{
	struct anon_vma *anon_vma = data;

	init_rwsem(&anon_vma->rwsem);
	atomic_set(&anon_vma->refcount, 0);
	anon_vma->rb_root = RB_ROOT_CACHED;
}

void __init anon_vma_init(void)
{
	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
			anon_vma_ctor);
	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
			SLAB_PANIC|SLAB_ACCOUNT);
}
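
/*
 * Illustrative note (editorial addition, not code from this file): because
 * anon_vma_cachep is SLAB_TYPESAFE_BY_RCU, an object looked up under
 * rcu_read_lock() may be freed and reused as a *different* anon_vma, but its
 * memory stays type-stable and its rwsem/refcount stay initialised. The safe
 * lookup pattern, used by folio_get_anon_vma() below, is roughly:
 *
 *	rcu_read_lock();
 *	anon_vma = ...read the pointer from folio->mapping...;
 *	if (!atomic_inc_not_zero(&anon_vma->refcount))
 *		anon_vma = NULL;	// it was (being) freed
 *	else if (!folio_mapped(folio))
 *		...drop the reference, the folio was unmapped meanwhile...;
 *	rcu_read_unlock();
 */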

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against page_remove_rmap()
 * the best this function can do is return a refcount increased anon_vma
 * that might have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 */
struct anon_vma *folio_get_anon_vma(struct folio *folio)
{
	struct anon_vma *anon_vma = NULL;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!folio_mapped(folio))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	/*
	 * If this folio is still mapped, then its anon_vma cannot have been
	 * freed. But if it has been unmapped, we have no security against the
	 * anon_vma structure being freed and reused (for another anon_vma:
	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
	 * above cannot corrupt).
	 */
	if (!folio_mapped(folio)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}
out:
	rcu_read_unlock();

	return anon_vma;
}

/*
 * Similar to folio_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with folio_get_anon_vma() and then block on the mutex
 * in the !rwc->try_lock case.
 */
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
					  struct rmap_walk_control *rwc)
{
	struct anon_vma *anon_vma = NULL;
	struct anon_vma *root_anon_vma;
	unsigned long anon_mapping;

	rcu_read_lock();
	anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		goto out;
	if (!folio_mapped(folio))
		goto out;

	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
	root_anon_vma = READ_ONCE(anon_vma->root);
	if (down_read_trylock(&root_anon_vma->rwsem)) {
		/*
		 * If the folio is still mapped, then this anon_vma is still
		 * its anon_vma, and holding the mutex ensures that it will
		 * not go away, see anon_vma_free().
		 */
		if (!folio_mapped(folio)) {
			up_read(&root_anon_vma->rwsem);
			anon_vma = NULL;
		}
		goto out;
	}

	if (rwc && rwc->try_lock) {
		anon_vma = NULL;
		rwc->contended = true;
		goto out;
	}

	/* trylock failed, we need to sleep */
	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
		anon_vma = NULL;
		goto out;
	}

	if (!folio_mapped(folio)) {
		rcu_read_unlock();
		put_anon_vma(anon_vma);
		return NULL;
	}

	/* we pinned the anon_vma, it's safe to sleep */
	rcu_read_unlock();
	anon_vma_lock_read(anon_vma);

	if (atomic_dec_and_test(&anon_vma->refcount)) {
		/*
		 * Oops, we held the last refcount, release the lock
		 * and bail -- can't simply use put_anon_vma() because
		 * we'll deadlock on the anon_vma_lock_write() recursion.
		 */
		anon_vma_unlock_read(anon_vma);
		__put_anon_vma(anon_vma);
		anon_vma = NULL;
	}

	return anon_vma;

out:
	rcu_read_unlock();
	return anon_vma;
}
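
/*
 * Illustrative sketch (editorial addition, an assumption rather than code
 * from this file): rmap walkers pair the lookup above with
 * anon_vma_unlock_read(), roughly:
 *
 *	anon_vma = folio_lock_anon_vma_read(folio, rwc);
 *	if (!anon_vma)
 *		return;		// folio unmapped, or rwc->try_lock contended
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 *		...call rwc->rmap_one() on each mapping vma...
 *	}
 *	anon_vma_unlock_read(anon_vma);
 */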

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. It is
 * important if a PTE was dirty when it was unmapped that it's flushed
 * before any IO is initiated on the page to prevent lost writes. Similarly,
 * it must be flushed before freeing to prevent data leakage.
 */
void try_to_unmap_flush(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (!tlb_ubc->flush_required)
		return;

	arch_tlbbatch_flush(&tlb_ubc->arch);
	tlb_ubc->flush_required = false;
	tlb_ubc->writable = false;
}

/* Flush iff there are potentially writable TLB entries that can race with IO */
void try_to_unmap_flush_dirty(void)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

	if (tlb_ubc->writable)
		try_to_unmap_flush();
}

/*
 * Bits 0-14 of mm->tlb_flush_batched record pending generations.
 * Bits 16-30 of mm->tlb_flush_batched record flushed generations.
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT	16
#define TLB_FLUSH_BATCH_PENDING_MASK	\
	((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE	\
	(TLB_FLUSH_BATCH_PENDING_MASK / 2)
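
/*
 * Worked example (editorial addition, assumed values): if three batched
 * unmaps have bumped the pending counter and two flushes have been recorded,
 * the atomic holds (2 << TLB_FLUSH_BATCH_FLUSHED_SHIFT) | 3.
 * flush_tlb_batched_pending() below then sees pending (3) != flushed (2),
 * performs the flush, and folds the value back to (3 << 16) | 3 so the two
 * halves agree again (unless a new unmap raced in, in which case the value
 * is deliberately left alone).
 */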

static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
				      unsigned long uaddr)
{
	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
	int batch;
	bool writable = pte_dirty(pteval);

	if (!pte_accessible(mm, pteval))
		return;

	arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr);
	tlb_ubc->flush_required = true;

	/*
	 * Ensure compiler does not re-order the setting of tlb_flush_batched
	 * before the PTE is cleared.
	 */
	barrier();
	batch = atomic_read(&mm->tlb_flush_batched);
retry:
	if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
		/*
		 * Prevent `pending' from catching up with `flushed' because of
		 * overflow. Reset `pending' and `flushed' to be 1 and 0 if
		 * `pending' becomes large.
		 */
		if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
			goto retry;
	} else {
		atomic_inc(&mm->tlb_flush_batched);
	}

	/*
	 * If the PTE was dirty then it's best to assume it's writable. The
	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
	 * before the page is queued for IO.
	 */
	if (writable)
		tlb_ubc->writable = true;
}

/*
 * Returns true if the TLB flush should be deferred to the end of a batch of
 * unmap operations to reduce IPIs.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	if (!(flags & TTU_BATCH_FLUSH))
		return false;

	return arch_tlbbatch_should_defer(mm);
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 * operation such as mprotect or munmap to race between reclaim unmapping
 * the page and flushing the page. If this race occurs, it potentially allows
 * access to data via a stale TLB entry. Tracking all mm's that have TLB
 * batching in flight would be expensive during reclaim so instead track
 * whether TLB batching occurred in the past and if so then do a flush here
 * if required. This will cost one additional flush per reclaim cycle paid
 * by the first operation at risk such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
	int batch = atomic_read(&mm->tlb_flush_batched);
	int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
	int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;

	if (pending != flushed) {
		arch_flush_tlb_batched_pending(mm);
		/*
		 * If a new TLB flush became pending while we were flushing,
		 * leave mm->tlb_flush_batched as is, to avoid losing that
		 * flush.
		 */
		atomic_cmpxchg(&mm->tlb_flush_batched, batch,
			       pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
	}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
				      unsigned long uaddr)
{
}

static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
	return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/*
 * At what user virtual address is page expected in vma?
 * Caller should check the page is actually part of the vma.
 */
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
	struct folio *folio = page_folio(page);

	if (folio_test_anon(folio)) {
		struct anon_vma *page__anon_vma = folio_anon_vma(folio);
		/*
		 * Note: swapoff's unuse_vma() is more efficient with this
		 * check, and needs it to match anon_vma when KSM is active.
		 */
		if (!vma->anon_vma || !page__anon_vma ||
		    vma->anon_vma->root != page__anon_vma->root)
			return -EFAULT;
	} else if (!vma->vm_file) {
		return -EFAULT;
	} else if (vma->vm_file->f_mapping != folio->mapping) {
		return -EFAULT;
	}

	return vma_address(page, vma);
}
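
/*
 * Illustrative sketch (editorial addition, an assumption rather than code
 * from this file): callers such as the memory-failure path treat the -EFAULT
 * return as "not mapped in this vma", roughly:
 *
 *	addr = page_address_in_vma(page, vma);
 *	if (addr == -EFAULT)
 *		continue;	// this vma does not map the page
 *	...act on the mapping at addr...
 */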

/*
 * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
 * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
 * represents.
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;

	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
out:
	return pmd;
}

struct folio_referenced_arg {
	int mapcount;
	int referenced;
	unsigned long vm_flags;
	struct mem_cgroup *memcg;
};

/*
 * arg: folio_referenced_arg will be passed
 */
static bool folio_referenced_one(struct folio *folio,
		struct vm_area_struct *vma, unsigned long address, void *arg)
{
	struct folio_referenced_arg *pra = arg;
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
	int referenced = 0;

	while (page_vma_mapped_walk(&pvmw)) {
		address = pvmw.address;

#ifdef CONFIG_MEM_PURGEABLE
		if (!(vma->vm_flags & VM_PURGEABLE))
			pra->vm_flags &= ~VM_PURGEABLE;
#endif
		if ((vma->vm_flags & VM_LOCKED) &&
		    (!folio_test_large(folio) || !pvmw.pte)) {
			/* Restore the mlock which got missed */
			mlock_vma_folio(folio, vma, !pvmw.pte);
			page_vma_mapped_walk_done(&pvmw);
			pra->vm_flags |= VM_LOCKED;
			return false;	/* To break the loop */
		}

		if (pvmw.pte) {
			if (lru_gen_enabled() &&
			    pte_young(ptep_get(pvmw.pte))) {
				lru_gen_look_around(&pvmw);
				referenced++;
			}

			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte))
				referenced++;
		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
			if (pmdp_clear_flush_young_notify(vma, address,
						pvmw.pmd))
				referenced++;
		} else {
			/* unexpected pmd-mapped folio? */
			WARN_ON_ONCE(1);
		}

		pra->mapcount--;
	}

	if (referenced)
		folio_clear_idle(folio);
	if (folio_test_clear_young(folio))
		referenced++;

	if (referenced) {
		pra->referenced++;
		pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
#ifdef CONFIG_MEM_PURGEABLE
		pra->vm_flags |= vma->vm_flags & ~VM_PURGEABLE;
#endif
	}

	if (!pra->mapcount)
		return false;	/* To break the loop */

	return true;
}

static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
	struct folio_referenced_arg *pra = arg;
	struct mem_cgroup *memcg = pra->memcg;

	/*
	 * Ignore references from this mapping if it has no recency. If the
	 * folio has been used in another mapping, we will catch it; if this
	 * other mapping is already gone, the unmap path will have set the
	 * referenced flag or activated the folio in zap_pte_range().
	 */
	if (!vma_has_recency(vma))
		return true;

	/*
	 * If we are reclaiming on behalf of a cgroup, skip counting on behalf
	 * of references from different cgroups.
	 */
	if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
		return true;

	return false;
}

/**
 * folio_referenced() - Test if the folio was referenced.
 * @folio: The folio to test.
 * @is_locked: Caller holds lock on the folio.
 * @memcg: target memory cgroup
 * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
 *
 * Quick test_and_clear_referenced for all mappings of a folio.
 *
 * Return: The number of mappings which referenced the folio. Return -1 if
 * the function bailed out due to rmap lock contention.
 */
int folio_referenced(struct folio *folio, int is_locked,
		     struct mem_cgroup *memcg, unsigned long *vm_flags)
{
	int we_locked = 0;
	struct folio_referenced_arg pra = {
		.mapcount = folio_mapcount(folio),
		.memcg = memcg,
#ifdef CONFIG_MEM_PURGEABLE
		.vm_flags = VM_PURGEABLE,
#endif
	};
	struct rmap_walk_control rwc = {
		.rmap_one = folio_referenced_one,
		.arg = (void *)&pra,
		.anon_lock = folio_lock_anon_vma_read,
		.try_lock = true,
		.invalid_vma = invalid_folio_referenced_vma,
	};

	*vm_flags = 0;
	if (!pra.mapcount)
		return 0;

	if (!folio_raw_mapping(folio))
		return 0;

	if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
		we_locked = folio_trylock(folio);
		if (!we_locked)
			return 1;
	}

	rmap_walk(folio, &rwc);
	*vm_flags = pra.vm_flags;

	if (we_locked)
		folio_unlock(folio);

	return rwc.contended ? -1 : pra.referenced;
}
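
/*
 * Illustrative sketch (editorial addition, an assumption rather than code
 * from this file): reclaim-style callers interpret the result roughly as:
 *
 *	referenced = folio_referenced(folio, 1, memcg, &vm_flags);
 *	if (referenced == -1)
 *		keep the folio for now;		// rmap lock was contended
 *	else if (vm_flags & VM_LOCKED)
 *		move the folio to the unevictable list;
 *	else if (referenced)
 *		treat the folio as recently used (e.g. activate it);
 */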

static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
	int cleaned = 0;
	struct vm_area_struct *vma = pvmw->vma;
	struct mmu_notifier_range range;
	unsigned long address = pvmw->address;

	/*
	 * We have to assume the worst case, i.e. pmd, for invalidation. Note
	 * that the folio cannot be freed from this function.
	 */
	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
				vma->vm_mm, address, vma_address_end(pvmw));
	mmu_notifier_invalidate_range_start(&range);

	while (page_vma_mapped_walk(pvmw)) {
		int ret = 0;

		address = pvmw->address;
		if (pvmw->pte) {
			pte_t *pte = pvmw->pte;
			pte_t entry = ptep_get(pte);

			if (!pte_dirty(entry) && !pte_write(entry))
				continue;

			flush_cache_page(vma, address, pte_pfn(entry));
			entry = ptep_clear_flush(vma, address, pte);
			entry = pte_wrprotect(entry);
			entry = pte_mkclean(entry);
			set_pte_at(vma->vm_mm, address, pte, entry);
			ret = 1;
		} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			pmd_t *pmd = pvmw->pmd;
			pmd_t entry;

			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
				continue;

			flush_cache_range(vma, address,
					  address + HPAGE_PMD_SIZE);
			entry = pmdp_invalidate(vma, address, pmd);
			entry = pmd_wrprotect(entry);
			entry = pmd_mkclean(entry);
			set_pmd_at(vma->vm_mm, address, pmd, entry);
			ret = 1;
#else
			/* unexpected pmd-mapped folio? */
			WARN_ON_ONCE(1);
#endif
		}

		if (ret)
			cleaned++;
	}

	mmu_notifier_invalidate_range_end(&range);

	return cleaned;
}

static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
	int *cleaned = arg;

	*cleaned += page_vma_mkclean_one(&pvmw);

	return true;
}

static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
	if (vma->vm_flags & VM_SHARED)
		return false;

	return true;
}

int folio_mkclean(struct folio *folio)
{
	int cleaned = 0;
	struct address_space *mapping;
	struct rmap_walk_control rwc = {
		.arg = (void *)&cleaned,
		.rmap_one = page_mkclean_one,
		.invalid_vma = invalid_mkclean_vma,
	};

	BUG_ON(!folio_test_locked(folio));

	if (!folio_mapped(folio))
		return 0;

	mapping = folio_mapping(folio);
	if (!mapping)
		return 0;

	rmap_walk(folio, &rwc);

	return cleaned;
}
EXPORT_SYMBOL_GPL(folio_mkclean);
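
/*
 * Illustrative sketch (editorial addition, an assumption rather than code
 * from this file): writeback uses folio_mkclean() to write-protect all PTEs
 * of a file folio before starting IO, so that later writes through a PTE
 * re-dirty the folio, roughly:
 *
 *	folio_lock(folio);
 *	if (folio_mkclean(folio))
 *		folio_mark_dirty(folio);  // a PTE was dirty/writable; re-dirty
 *	...folio_clear_dirty_for_io() / start writeback...
 *	folio_unlock(folio);
 */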
106362306a36Sopenharmony_ci */ 106462306a36Sopenharmony_ciint pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, 106562306a36Sopenharmony_ci struct vm_area_struct *vma) 106662306a36Sopenharmony_ci{ 106762306a36Sopenharmony_ci struct page_vma_mapped_walk pvmw = { 106862306a36Sopenharmony_ci .pfn = pfn, 106962306a36Sopenharmony_ci .nr_pages = nr_pages, 107062306a36Sopenharmony_ci .pgoff = pgoff, 107162306a36Sopenharmony_ci .vma = vma, 107262306a36Sopenharmony_ci .flags = PVMW_SYNC, 107362306a36Sopenharmony_ci }; 107462306a36Sopenharmony_ci 107562306a36Sopenharmony_ci if (invalid_mkclean_vma(vma, NULL)) 107662306a36Sopenharmony_ci return 0; 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_ci pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma); 107962306a36Sopenharmony_ci VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); 108062306a36Sopenharmony_ci 108162306a36Sopenharmony_ci return page_vma_mkclean_one(&pvmw); 108262306a36Sopenharmony_ci} 108362306a36Sopenharmony_ci 108462306a36Sopenharmony_ciint folio_total_mapcount(struct folio *folio) 108562306a36Sopenharmony_ci{ 108662306a36Sopenharmony_ci int mapcount = folio_entire_mapcount(folio); 108762306a36Sopenharmony_ci int nr_pages; 108862306a36Sopenharmony_ci int i; 108962306a36Sopenharmony_ci 109062306a36Sopenharmony_ci /* In the common case, avoid the loop when no pages mapped by PTE */ 109162306a36Sopenharmony_ci if (folio_nr_pages_mapped(folio) == 0) 109262306a36Sopenharmony_ci return mapcount; 109362306a36Sopenharmony_ci /* 109462306a36Sopenharmony_ci * Add all the PTE mappings of those pages mapped by PTE. 109562306a36Sopenharmony_ci * Limit the loop to folio_nr_pages_mapped()? 109662306a36Sopenharmony_ci * Perhaps: given all the raciness, that may be a good or a bad idea. 109762306a36Sopenharmony_ci */ 109862306a36Sopenharmony_ci nr_pages = folio_nr_pages(folio); 109962306a36Sopenharmony_ci for (i = 0; i < nr_pages; i++) 110062306a36Sopenharmony_ci mapcount += atomic_read(&folio_page(folio, i)->_mapcount); 110162306a36Sopenharmony_ci 110262306a36Sopenharmony_ci /* But each of those _mapcounts was based on -1 */ 110362306a36Sopenharmony_ci mapcount += nr_pages; 110462306a36Sopenharmony_ci return mapcount; 110562306a36Sopenharmony_ci} 110662306a36Sopenharmony_ci 110762306a36Sopenharmony_ci/** 110862306a36Sopenharmony_ci * page_move_anon_rmap - move a page to our anon_vma 110962306a36Sopenharmony_ci * @page: the page to move to our anon_vma 111062306a36Sopenharmony_ci * @vma: the vma the page belongs to 111162306a36Sopenharmony_ci * 111262306a36Sopenharmony_ci * When a page belongs exclusively to one process after a COW event, 111362306a36Sopenharmony_ci * that page can be moved into the anon_vma that belongs to just that 111462306a36Sopenharmony_ci * process, so the rmap code will not search the parent or sibling 111562306a36Sopenharmony_ci * processes. 
111662306a36Sopenharmony_ci */ 111762306a36Sopenharmony_civoid page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) 111862306a36Sopenharmony_ci{ 111962306a36Sopenharmony_ci void *anon_vma = vma->anon_vma; 112062306a36Sopenharmony_ci struct folio *folio = page_folio(page); 112162306a36Sopenharmony_ci 112262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 112362306a36Sopenharmony_ci VM_BUG_ON_VMA(!anon_vma, vma); 112462306a36Sopenharmony_ci 112562306a36Sopenharmony_ci anon_vma += PAGE_MAPPING_ANON; 112662306a36Sopenharmony_ci /* 112762306a36Sopenharmony_ci * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written 112862306a36Sopenharmony_ci * simultaneously, so a concurrent reader (eg folio_referenced()'s 112962306a36Sopenharmony_ci * folio_test_anon()) will not see one without the other. 113062306a36Sopenharmony_ci */ 113162306a36Sopenharmony_ci WRITE_ONCE(folio->mapping, anon_vma); 113262306a36Sopenharmony_ci SetPageAnonExclusive(page); 113362306a36Sopenharmony_ci} 113462306a36Sopenharmony_ci 113562306a36Sopenharmony_ci/** 113662306a36Sopenharmony_ci * __page_set_anon_rmap - set up new anonymous rmap 113762306a36Sopenharmony_ci * @folio: Folio which contains page. 113862306a36Sopenharmony_ci * @page: Page to add to rmap. 113962306a36Sopenharmony_ci * @vma: VM area to add page to. 114062306a36Sopenharmony_ci * @address: User virtual address of the mapping 114162306a36Sopenharmony_ci * @exclusive: the page is exclusively owned by the current process 114262306a36Sopenharmony_ci */ 114362306a36Sopenharmony_cistatic void __page_set_anon_rmap(struct folio *folio, struct page *page, 114462306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long address, int exclusive) 114562306a36Sopenharmony_ci{ 114662306a36Sopenharmony_ci struct anon_vma *anon_vma = vma->anon_vma; 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci BUG_ON(!anon_vma); 114962306a36Sopenharmony_ci 115062306a36Sopenharmony_ci if (folio_test_anon(folio)) 115162306a36Sopenharmony_ci goto out; 115262306a36Sopenharmony_ci 115362306a36Sopenharmony_ci /* 115462306a36Sopenharmony_ci * If the page isn't exclusively mapped into this vma, 115562306a36Sopenharmony_ci * we must use the _oldest_ possible anon_vma for the 115662306a36Sopenharmony_ci * page mapping! 115762306a36Sopenharmony_ci */ 115862306a36Sopenharmony_ci if (!exclusive) 115962306a36Sopenharmony_ci anon_vma = anon_vma->root; 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_ci /* 116262306a36Sopenharmony_ci * page_idle does a lockless/optimistic rmap scan on folio->mapping. 116362306a36Sopenharmony_ci * Make sure the compiler doesn't split the stores of anon_vma and 116462306a36Sopenharmony_ci * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code 116562306a36Sopenharmony_ci * could mistake the mapping for a struct address_space and crash. 116662306a36Sopenharmony_ci */ 116762306a36Sopenharmony_ci anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 116862306a36Sopenharmony_ci WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); 116962306a36Sopenharmony_ci folio->index = linear_page_index(vma, address); 117062306a36Sopenharmony_ciout: 117162306a36Sopenharmony_ci if (exclusive) 117262306a36Sopenharmony_ci SetPageAnonExclusive(page); 117362306a36Sopenharmony_ci} 117462306a36Sopenharmony_ci 117562306a36Sopenharmony_ci/** 117662306a36Sopenharmony_ci * __page_check_anon_rmap - sanity check anonymous rmap addition 117762306a36Sopenharmony_ci * @folio: The folio containing @page. 
117862306a36Sopenharmony_ci * @page: the page to check the mapping of 117962306a36Sopenharmony_ci * @vma: the vm area in which the mapping is added 118062306a36Sopenharmony_ci * @address: the user virtual address mapped 118162306a36Sopenharmony_ci */ 118262306a36Sopenharmony_cistatic void __page_check_anon_rmap(struct folio *folio, struct page *page, 118362306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long address) 118462306a36Sopenharmony_ci{ 118562306a36Sopenharmony_ci /* 118662306a36Sopenharmony_ci * The page's anon-rmap details (mapping and index) are guaranteed to 118762306a36Sopenharmony_ci * be set up correctly at this point. 118862306a36Sopenharmony_ci * 118962306a36Sopenharmony_ci * We have exclusion against page_add_anon_rmap because the caller 119062306a36Sopenharmony_ci * always holds the page locked. 119162306a36Sopenharmony_ci * 119262306a36Sopenharmony_ci * We have exclusion against page_add_new_anon_rmap because those pages 119362306a36Sopenharmony_ci * are initially only visible via the pagetables, and the pte is locked 119462306a36Sopenharmony_ci * over the call to page_add_new_anon_rmap. 119562306a36Sopenharmony_ci */ 119662306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, 119762306a36Sopenharmony_ci folio); 119862306a36Sopenharmony_ci VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), 119962306a36Sopenharmony_ci page); 120062306a36Sopenharmony_ci} 120162306a36Sopenharmony_ci 120262306a36Sopenharmony_ci/** 120362306a36Sopenharmony_ci * page_add_anon_rmap - add pte mapping to an anonymous page 120462306a36Sopenharmony_ci * @page: the page to add the mapping to 120562306a36Sopenharmony_ci * @vma: the vm area in which the mapping is added 120662306a36Sopenharmony_ci * @address: the user virtual address mapped 120762306a36Sopenharmony_ci * @flags: the rmap flags 120862306a36Sopenharmony_ci * 120962306a36Sopenharmony_ci * The caller needs to hold the pte lock, and the page must be locked in 121062306a36Sopenharmony_ci * the anon_vma case: to serialize mapping,index checking after setting, 121162306a36Sopenharmony_ci * and to ensure that PageAnon is not being upgraded racily to PageKsm 121262306a36Sopenharmony_ci * (but PageKsm is never downgraded to PageAnon). 121362306a36Sopenharmony_ci */ 121462306a36Sopenharmony_civoid page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, 121562306a36Sopenharmony_ci unsigned long address, rmap_t flags) 121662306a36Sopenharmony_ci{ 121762306a36Sopenharmony_ci struct folio *folio = page_folio(page); 121862306a36Sopenharmony_ci atomic_t *mapped = &folio->_nr_pages_mapped; 121962306a36Sopenharmony_ci int nr = 0, nr_pmdmapped = 0; 122062306a36Sopenharmony_ci bool compound = flags & RMAP_COMPOUND; 122162306a36Sopenharmony_ci bool first = true; 122262306a36Sopenharmony_ci 122362306a36Sopenharmony_ci /* Is page being mapped by PTE? Is this its first map to be added? 
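 * (For a large folio, the low bits of _nr_pages_mapped count its
 *  PTE-mapped pages and the COMPOUND_MAPPED bit marks a PMD mapping;
 *  that is how the checks below decide what to account in the stats.)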
*/ 122462306a36Sopenharmony_ci if (likely(!compound)) { 122562306a36Sopenharmony_ci first = atomic_inc_and_test(&page->_mapcount); 122662306a36Sopenharmony_ci nr = first; 122762306a36Sopenharmony_ci if (first && folio_test_large(folio)) { 122862306a36Sopenharmony_ci nr = atomic_inc_return_relaxed(mapped); 122962306a36Sopenharmony_ci nr = (nr < COMPOUND_MAPPED); 123062306a36Sopenharmony_ci } 123162306a36Sopenharmony_ci } else if (folio_test_pmd_mappable(folio)) { 123262306a36Sopenharmony_ci /* That test is redundant: it's for safety or to optimize out */ 123362306a36Sopenharmony_ci 123462306a36Sopenharmony_ci first = atomic_inc_and_test(&folio->_entire_mapcount); 123562306a36Sopenharmony_ci if (first) { 123662306a36Sopenharmony_ci nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); 123762306a36Sopenharmony_ci if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { 123862306a36Sopenharmony_ci nr_pmdmapped = folio_nr_pages(folio); 123962306a36Sopenharmony_ci nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); 124062306a36Sopenharmony_ci /* Raced ahead of a remove and another add? */ 124162306a36Sopenharmony_ci if (unlikely(nr < 0)) 124262306a36Sopenharmony_ci nr = 0; 124362306a36Sopenharmony_ci } else { 124462306a36Sopenharmony_ci /* Raced ahead of a remove of COMPOUND_MAPPED */ 124562306a36Sopenharmony_ci nr = 0; 124662306a36Sopenharmony_ci } 124762306a36Sopenharmony_ci } 124862306a36Sopenharmony_ci } 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_ci VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); 125162306a36Sopenharmony_ci VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); 125262306a36Sopenharmony_ci 125362306a36Sopenharmony_ci if (nr_pmdmapped) 125462306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); 125562306a36Sopenharmony_ci if (nr) 125662306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); 125762306a36Sopenharmony_ci 125862306a36Sopenharmony_ci if (likely(!folio_test_ksm(folio))) { 125962306a36Sopenharmony_ci /* address might be in next vma when migration races vma_merge */ 126062306a36Sopenharmony_ci if (first) 126162306a36Sopenharmony_ci __page_set_anon_rmap(folio, page, vma, address, 126262306a36Sopenharmony_ci !!(flags & RMAP_EXCLUSIVE)); 126362306a36Sopenharmony_ci else 126462306a36Sopenharmony_ci __page_check_anon_rmap(folio, page, vma, address); 126562306a36Sopenharmony_ci } 126662306a36Sopenharmony_ci 126762306a36Sopenharmony_ci mlock_vma_folio(folio, vma, compound); 126862306a36Sopenharmony_ci} 126962306a36Sopenharmony_ci 127062306a36Sopenharmony_ci/** 127162306a36Sopenharmony_ci * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. 127262306a36Sopenharmony_ci * @folio: The folio to add the mapping to. 127362306a36Sopenharmony_ci * @vma: the vm area in which the mapping is added 127462306a36Sopenharmony_ci * @address: the user virtual address mapped 127562306a36Sopenharmony_ci * 127662306a36Sopenharmony_ci * Like page_add_anon_rmap() but must only be called on *new* folios. 127762306a36Sopenharmony_ci * This means the inc-and-test can be bypassed. 127862306a36Sopenharmony_ci * The folio does not have to be locked. 127962306a36Sopenharmony_ci * 128062306a36Sopenharmony_ci * If the folio is large, it is accounted as a THP. As the folio 128162306a36Sopenharmony_ci * is new, it's assumed to be mapped exclusively by a single process. 
128262306a36Sopenharmony_ci */ 128362306a36Sopenharmony_civoid folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, 128462306a36Sopenharmony_ci unsigned long address) 128562306a36Sopenharmony_ci{ 128662306a36Sopenharmony_ci int nr; 128762306a36Sopenharmony_ci 128862306a36Sopenharmony_ci VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 128962306a36Sopenharmony_ci __folio_set_swapbacked(folio); 129062306a36Sopenharmony_ci 129162306a36Sopenharmony_ci if (likely(!folio_test_pmd_mappable(folio))) { 129262306a36Sopenharmony_ci /* increment count (starts at -1) */ 129362306a36Sopenharmony_ci atomic_set(&folio->_mapcount, 0); 129462306a36Sopenharmony_ci nr = 1; 129562306a36Sopenharmony_ci } else { 129662306a36Sopenharmony_ci /* increment count (starts at -1) */ 129762306a36Sopenharmony_ci atomic_set(&folio->_entire_mapcount, 0); 129862306a36Sopenharmony_ci atomic_set(&folio->_nr_pages_mapped, COMPOUND_MAPPED); 129962306a36Sopenharmony_ci nr = folio_nr_pages(folio); 130062306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr); 130162306a36Sopenharmony_ci } 130262306a36Sopenharmony_ci 130362306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); 130462306a36Sopenharmony_ci __page_set_anon_rmap(folio, &folio->page, vma, address, 1); 130562306a36Sopenharmony_ci} 130662306a36Sopenharmony_ci 130762306a36Sopenharmony_ci/** 130862306a36Sopenharmony_ci * folio_add_file_rmap_range - add pte mapping to page range of a folio 130962306a36Sopenharmony_ci * @folio: The folio to add the mapping to 131062306a36Sopenharmony_ci * @page: The first page to add 131162306a36Sopenharmony_ci * @nr_pages: The number of pages which will be mapped 131262306a36Sopenharmony_ci * @vma: the vm area in which the mapping is added 131362306a36Sopenharmony_ci * @compound: charge the page as compound or small page 131462306a36Sopenharmony_ci * 131562306a36Sopenharmony_ci * The page range of folio is defined by [first_page, first_page + nr_pages) 131662306a36Sopenharmony_ci * 131762306a36Sopenharmony_ci * The caller needs to hold the pte lock. 131862306a36Sopenharmony_ci */ 131962306a36Sopenharmony_civoid folio_add_file_rmap_range(struct folio *folio, struct page *page, 132062306a36Sopenharmony_ci unsigned int nr_pages, struct vm_area_struct *vma, 132162306a36Sopenharmony_ci bool compound) 132262306a36Sopenharmony_ci{ 132362306a36Sopenharmony_ci atomic_t *mapped = &folio->_nr_pages_mapped; 132462306a36Sopenharmony_ci unsigned int nr_pmdmapped = 0, first; 132562306a36Sopenharmony_ci int nr = 0; 132662306a36Sopenharmony_ci 132762306a36Sopenharmony_ci VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); 132862306a36Sopenharmony_ci 132962306a36Sopenharmony_ci /* Is page being mapped by PTE? Is this its first map to be added? 
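 * (The loop below raises _mapcount for each page in the range; for a
 *  large folio, _nr_pages_mapped is bumped once per page that becomes
 *  mapped, and nr ends up as the number of pages to account in
 *  NR_FILE_MAPPED.)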
*/ 133062306a36Sopenharmony_ci if (likely(!compound)) { 133162306a36Sopenharmony_ci do { 133262306a36Sopenharmony_ci first = atomic_inc_and_test(&page->_mapcount); 133362306a36Sopenharmony_ci if (first && folio_test_large(folio)) { 133462306a36Sopenharmony_ci first = atomic_inc_return_relaxed(mapped); 133562306a36Sopenharmony_ci first = (first < COMPOUND_MAPPED); 133662306a36Sopenharmony_ci } 133762306a36Sopenharmony_ci 133862306a36Sopenharmony_ci if (first) 133962306a36Sopenharmony_ci nr++; 134062306a36Sopenharmony_ci } while (page++, --nr_pages > 0); 134162306a36Sopenharmony_ci } else if (folio_test_pmd_mappable(folio)) { 134262306a36Sopenharmony_ci /* That test is redundant: it's for safety or to optimize out */ 134362306a36Sopenharmony_ci 134462306a36Sopenharmony_ci first = atomic_inc_and_test(&folio->_entire_mapcount); 134562306a36Sopenharmony_ci if (first) { 134662306a36Sopenharmony_ci nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); 134762306a36Sopenharmony_ci if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { 134862306a36Sopenharmony_ci nr_pmdmapped = folio_nr_pages(folio); 134962306a36Sopenharmony_ci nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); 135062306a36Sopenharmony_ci /* Raced ahead of a remove and another add? */ 135162306a36Sopenharmony_ci if (unlikely(nr < 0)) 135262306a36Sopenharmony_ci nr = 0; 135362306a36Sopenharmony_ci } else { 135462306a36Sopenharmony_ci /* Raced ahead of a remove of COMPOUND_MAPPED */ 135562306a36Sopenharmony_ci nr = 0; 135662306a36Sopenharmony_ci } 135762306a36Sopenharmony_ci } 135862306a36Sopenharmony_ci } 135962306a36Sopenharmony_ci 136062306a36Sopenharmony_ci if (nr_pmdmapped) 136162306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, folio_test_swapbacked(folio) ? 136262306a36Sopenharmony_ci NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); 136362306a36Sopenharmony_ci if (nr) 136462306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); 136562306a36Sopenharmony_ci 136662306a36Sopenharmony_ci mlock_vma_folio(folio, vma, compound); 136762306a36Sopenharmony_ci} 136862306a36Sopenharmony_ci 136962306a36Sopenharmony_ci/** 137062306a36Sopenharmony_ci * page_add_file_rmap - add pte mapping to a file page 137162306a36Sopenharmony_ci * @page: the page to add the mapping to 137262306a36Sopenharmony_ci * @vma: the vm area in which the mapping is added 137362306a36Sopenharmony_ci * @compound: charge the page as compound or small page 137462306a36Sopenharmony_ci * 137562306a36Sopenharmony_ci * The caller needs to hold the pte lock. 
137662306a36Sopenharmony_ci */ 137762306a36Sopenharmony_civoid page_add_file_rmap(struct page *page, struct vm_area_struct *vma, 137862306a36Sopenharmony_ci bool compound) 137962306a36Sopenharmony_ci{ 138062306a36Sopenharmony_ci struct folio *folio = page_folio(page); 138162306a36Sopenharmony_ci unsigned int nr_pages; 138262306a36Sopenharmony_ci 138362306a36Sopenharmony_ci VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); 138462306a36Sopenharmony_ci 138562306a36Sopenharmony_ci if (likely(!compound)) 138662306a36Sopenharmony_ci nr_pages = 1; 138762306a36Sopenharmony_ci else 138862306a36Sopenharmony_ci nr_pages = folio_nr_pages(folio); 138962306a36Sopenharmony_ci 139062306a36Sopenharmony_ci folio_add_file_rmap_range(folio, page, nr_pages, vma, compound); 139162306a36Sopenharmony_ci} 139262306a36Sopenharmony_ci 139362306a36Sopenharmony_ci/** 139462306a36Sopenharmony_ci * page_remove_rmap - take down pte mapping from a page 139562306a36Sopenharmony_ci * @page: page to remove mapping from 139662306a36Sopenharmony_ci * @vma: the vm area from which the mapping is removed 139762306a36Sopenharmony_ci * @compound: uncharge the page as compound or small page 139862306a36Sopenharmony_ci * 139962306a36Sopenharmony_ci * The caller needs to hold the pte lock. 140062306a36Sopenharmony_ci */ 140162306a36Sopenharmony_civoid page_remove_rmap(struct page *page, struct vm_area_struct *vma, 140262306a36Sopenharmony_ci bool compound) 140362306a36Sopenharmony_ci{ 140462306a36Sopenharmony_ci struct folio *folio = page_folio(page); 140562306a36Sopenharmony_ci atomic_t *mapped = &folio->_nr_pages_mapped; 140662306a36Sopenharmony_ci int nr = 0, nr_pmdmapped = 0; 140762306a36Sopenharmony_ci bool last; 140862306a36Sopenharmony_ci enum node_stat_item idx; 140962306a36Sopenharmony_ci 141062306a36Sopenharmony_ci VM_BUG_ON_PAGE(compound && !PageHead(page), page); 141162306a36Sopenharmony_ci 141262306a36Sopenharmony_ci /* Hugetlb pages are not counted in NR_*MAPPED */ 141362306a36Sopenharmony_ci if (unlikely(folio_test_hugetlb(folio))) { 141462306a36Sopenharmony_ci /* hugetlb pages are always mapped with pmds */ 141562306a36Sopenharmony_ci atomic_dec(&folio->_entire_mapcount); 141662306a36Sopenharmony_ci return; 141762306a36Sopenharmony_ci } 141862306a36Sopenharmony_ci 141962306a36Sopenharmony_ci /* Is page being unmapped by PTE? Is this its last map to be removed? */ 142062306a36Sopenharmony_ci if (likely(!compound)) { 142162306a36Sopenharmony_ci last = atomic_add_negative(-1, &page->_mapcount); 142262306a36Sopenharmony_ci nr = last; 142362306a36Sopenharmony_ci if (last && folio_test_large(folio)) { 142462306a36Sopenharmony_ci nr = atomic_dec_return_relaxed(mapped); 142562306a36Sopenharmony_ci nr = (nr < COMPOUND_MAPPED); 142662306a36Sopenharmony_ci } 142762306a36Sopenharmony_ci } else if (folio_test_pmd_mappable(folio)) { 142862306a36Sopenharmony_ci /* That test is redundant: it's for safety or to optimize out */ 142962306a36Sopenharmony_ci 143062306a36Sopenharmony_ci last = atomic_add_negative(-1, &folio->_entire_mapcount); 143162306a36Sopenharmony_ci if (last) { 143262306a36Sopenharmony_ci nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); 143362306a36Sopenharmony_ci if (likely(nr < COMPOUND_MAPPED)) { 143462306a36Sopenharmony_ci nr_pmdmapped = folio_nr_pages(folio); 143562306a36Sopenharmony_ci nr = nr_pmdmapped - (nr & FOLIO_PAGES_MAPPED); 143662306a36Sopenharmony_ci /* Raced ahead of another remove and an add? 
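 * (Concurrent adds/removes can transiently make the PTE-mapped count
 *  exceed folio_nr_pages(), so a negative nr is clamped to 0 before it
 *  reaches the NR_*_MAPPED accounting below.)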
*/ 143762306a36Sopenharmony_ci if (unlikely(nr < 0)) 143862306a36Sopenharmony_ci nr = 0; 143962306a36Sopenharmony_ci } else { 144062306a36Sopenharmony_ci /* An add of COMPOUND_MAPPED raced ahead */ 144162306a36Sopenharmony_ci nr = 0; 144262306a36Sopenharmony_ci } 144362306a36Sopenharmony_ci } 144462306a36Sopenharmony_ci } 144562306a36Sopenharmony_ci 144662306a36Sopenharmony_ci if (nr_pmdmapped) { 144762306a36Sopenharmony_ci if (folio_test_anon(folio)) 144862306a36Sopenharmony_ci idx = NR_ANON_THPS; 144962306a36Sopenharmony_ci else if (folio_test_swapbacked(folio)) 145062306a36Sopenharmony_ci idx = NR_SHMEM_PMDMAPPED; 145162306a36Sopenharmony_ci else 145262306a36Sopenharmony_ci idx = NR_FILE_PMDMAPPED; 145362306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, idx, -nr_pmdmapped); 145462306a36Sopenharmony_ci } 145562306a36Sopenharmony_ci if (nr) { 145662306a36Sopenharmony_ci idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; 145762306a36Sopenharmony_ci __lruvec_stat_mod_folio(folio, idx, -nr); 145862306a36Sopenharmony_ci 145962306a36Sopenharmony_ci /* 146062306a36Sopenharmony_ci * Queue anon THP for deferred split if at least one 146162306a36Sopenharmony_ci * page of the folio is unmapped and at least one page 146262306a36Sopenharmony_ci * is still mapped. 146362306a36Sopenharmony_ci */ 146462306a36Sopenharmony_ci if (folio_test_pmd_mappable(folio) && folio_test_anon(folio)) 146562306a36Sopenharmony_ci if (!compound || nr < nr_pmdmapped) 146662306a36Sopenharmony_ci deferred_split_folio(folio); 146762306a36Sopenharmony_ci } 146862306a36Sopenharmony_ci 146962306a36Sopenharmony_ci /* 147062306a36Sopenharmony_ci * It would be tidy to reset folio_test_anon mapping when fully 147162306a36Sopenharmony_ci * unmapped, but that might overwrite a racing page_add_anon_rmap 147262306a36Sopenharmony_ci * which increments mapcount after us but sets mapping before us: 147362306a36Sopenharmony_ci * so leave the reset to free_pages_prepare, and remember that 147462306a36Sopenharmony_ci * it's only reliable while mapped. 147562306a36Sopenharmony_ci */ 147662306a36Sopenharmony_ci 147762306a36Sopenharmony_ci munlock_vma_folio(folio, vma, compound); 147862306a36Sopenharmony_ci} 147962306a36Sopenharmony_ci 148062306a36Sopenharmony_ci/* 148162306a36Sopenharmony_ci * @arg: enum ttu_flags will be passed to this argument 148262306a36Sopenharmony_ci */ 148362306a36Sopenharmony_cistatic bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, 148462306a36Sopenharmony_ci unsigned long address, void *arg) 148562306a36Sopenharmony_ci{ 148662306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 148762306a36Sopenharmony_ci DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); 148862306a36Sopenharmony_ci pte_t pteval; 148962306a36Sopenharmony_ci struct page *subpage; 149062306a36Sopenharmony_ci bool anon_exclusive, ret = true; 149162306a36Sopenharmony_ci struct mmu_notifier_range range; 149262306a36Sopenharmony_ci enum ttu_flags flags = (enum ttu_flags)(long)arg; 149362306a36Sopenharmony_ci unsigned long pfn; 149462306a36Sopenharmony_ci unsigned long hsz = 0; 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_ci /* 149762306a36Sopenharmony_ci * When racing against e.g. zap_pte_range() on another cpu, 149862306a36Sopenharmony_ci * in between its ptep_get_and_clear_full() and page_remove_rmap(), 149962306a36Sopenharmony_ci * try_to_unmap() may return before page_mapped() has become false, 150062306a36Sopenharmony_ci * if page table locking is skipped: use TTU_SYNC to wait for that. 
150162306a36Sopenharmony_ci */
150262306a36Sopenharmony_ci if (flags & TTU_SYNC)
150362306a36Sopenharmony_ci pvmw.flags = PVMW_SYNC;
150462306a36Sopenharmony_ci
150562306a36Sopenharmony_ci if (flags & TTU_SPLIT_HUGE_PMD)
150662306a36Sopenharmony_ci split_huge_pmd_address(vma, address, false, folio);
150762306a36Sopenharmony_ci
150862306a36Sopenharmony_ci /*
150962306a36Sopenharmony_ci * For THP, we have to assume the worst case, i.e. pmd for invalidation.
151062306a36Sopenharmony_ci * For hugetlb, it could be much worse if we need to do pud
151162306a36Sopenharmony_ci * invalidation in the case of pmd sharing.
151262306a36Sopenharmony_ci *
151362306a36Sopenharmony_ci * Note that the folio cannot be freed in this function as call of
151462306a36Sopenharmony_ci * try_to_unmap() must hold a reference on the folio.
151562306a36Sopenharmony_ci */
151662306a36Sopenharmony_ci range.end = vma_address_end(&pvmw);
151762306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
151862306a36Sopenharmony_ci address, range.end);
151962306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) {
152062306a36Sopenharmony_ci /*
152162306a36Sopenharmony_ci * If sharing is possible, start and end will be adjusted
152262306a36Sopenharmony_ci * accordingly.
152362306a36Sopenharmony_ci */
152462306a36Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &range.start,
152562306a36Sopenharmony_ci &range.end);
152662306a36Sopenharmony_ci
152762306a36Sopenharmony_ci /* We need the huge page size for set_huge_pte_at() */
152862306a36Sopenharmony_ci hsz = huge_page_size(hstate_vma(vma));
152962306a36Sopenharmony_ci }
153062306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range);
153162306a36Sopenharmony_ci
153262306a36Sopenharmony_ci while (page_vma_mapped_walk(&pvmw)) {
153362306a36Sopenharmony_ci /* Unexpected PMD-mapped THP? */
153462306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!pvmw.pte, folio);
153562306a36Sopenharmony_ci
153662306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE
153762306a36Sopenharmony_ci if ((vma->vm_flags & VM_PURGEABLE) && !lock_uxpte(vma, address)) {
153862306a36Sopenharmony_ci ret = false;
153962306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw);
154062306a36Sopenharmony_ci break;
154162306a36Sopenharmony_ci }
154262306a36Sopenharmony_ci#endif
154362306a36Sopenharmony_ci /*
154462306a36Sopenharmony_ci * If the folio is in an mlock()d vma, we must not swap it out.
154562306a36Sopenharmony_ci */ 154662306a36Sopenharmony_ci if (!(flags & TTU_IGNORE_MLOCK) && 154762306a36Sopenharmony_ci (vma->vm_flags & VM_LOCKED)) { 154862306a36Sopenharmony_ci /* Restore the mlock which got missed */ 154962306a36Sopenharmony_ci mlock_vma_folio(folio, vma, false); 155062306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 155162306a36Sopenharmony_ci ret = false; 155262306a36Sopenharmony_ci break; 155362306a36Sopenharmony_ci } 155462306a36Sopenharmony_ci 155562306a36Sopenharmony_ci pfn = pte_pfn(ptep_get(pvmw.pte)); 155662306a36Sopenharmony_ci subpage = folio_page(folio, pfn - folio_pfn(folio)); 155762306a36Sopenharmony_ci address = pvmw.address; 155862306a36Sopenharmony_ci anon_exclusive = folio_test_anon(folio) && 155962306a36Sopenharmony_ci PageAnonExclusive(subpage); 156062306a36Sopenharmony_ci 156162306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 156262306a36Sopenharmony_ci bool anon = folio_test_anon(folio); 156362306a36Sopenharmony_ci 156462306a36Sopenharmony_ci /* 156562306a36Sopenharmony_ci * The try_to_unmap() is only passed a hugetlb page 156662306a36Sopenharmony_ci * in the case where the hugetlb page is poisoned. 156762306a36Sopenharmony_ci */ 156862306a36Sopenharmony_ci VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); 156962306a36Sopenharmony_ci /* 157062306a36Sopenharmony_ci * huge_pmd_unshare may unmap an entire PMD page. 157162306a36Sopenharmony_ci * There is no way of knowing exactly which PMDs may 157262306a36Sopenharmony_ci * be cached for this mm, so we must flush them all. 157362306a36Sopenharmony_ci * start/end were already adjusted above to cover this 157462306a36Sopenharmony_ci * range. 157562306a36Sopenharmony_ci */ 157662306a36Sopenharmony_ci flush_cache_range(vma, range.start, range.end); 157762306a36Sopenharmony_ci 157862306a36Sopenharmony_ci /* 157962306a36Sopenharmony_ci * To call huge_pmd_unshare, i_mmap_rwsem must be 158062306a36Sopenharmony_ci * held in write mode. Caller needs to explicitly 158162306a36Sopenharmony_ci * do this outside rmap routines. 158262306a36Sopenharmony_ci * 158362306a36Sopenharmony_ci * We also must hold hugetlb vma_lock in write mode. 158462306a36Sopenharmony_ci * Lock order dictates acquiring vma_lock BEFORE 158562306a36Sopenharmony_ci * i_mmap_rwsem. We can only try lock here and fail 158662306a36Sopenharmony_ci * if unsuccessful. 158762306a36Sopenharmony_ci */ 158862306a36Sopenharmony_ci if (!anon) { 158962306a36Sopenharmony_ci VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); 159062306a36Sopenharmony_ci if (!hugetlb_vma_trylock_write(vma)) { 159162306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 159262306a36Sopenharmony_ci ret = false; 159362306a36Sopenharmony_ci break; 159462306a36Sopenharmony_ci } 159562306a36Sopenharmony_ci if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { 159662306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 159762306a36Sopenharmony_ci flush_tlb_range(vma, 159862306a36Sopenharmony_ci range.start, range.end); 159962306a36Sopenharmony_ci /* 160062306a36Sopenharmony_ci * The ref count of the PMD page was 160162306a36Sopenharmony_ci * dropped which is part of the way map 160262306a36Sopenharmony_ci * counting is done for shared PMDs. 160362306a36Sopenharmony_ci * Return 'true' here. When there is 160462306a36Sopenharmony_ci * no other sharing, huge_pmd_unshare 160562306a36Sopenharmony_ci * returns false and we will unmap the 160662306a36Sopenharmony_ci * actual page and drop map count 160762306a36Sopenharmony_ci * to zero. 
160862306a36Sopenharmony_ci */ 160962306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 161062306a36Sopenharmony_ci break; 161162306a36Sopenharmony_ci } 161262306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 161362306a36Sopenharmony_ci } 161462306a36Sopenharmony_ci pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); 161562306a36Sopenharmony_ci } else { 161662306a36Sopenharmony_ci flush_cache_page(vma, address, pfn); 161762306a36Sopenharmony_ci /* Nuke the page table entry. */ 161862306a36Sopenharmony_ci if (should_defer_flush(mm, flags)) { 161962306a36Sopenharmony_ci /* 162062306a36Sopenharmony_ci * We clear the PTE but do not flush so potentially 162162306a36Sopenharmony_ci * a remote CPU could still be writing to the folio. 162262306a36Sopenharmony_ci * If the entry was previously clean then the 162362306a36Sopenharmony_ci * architecture must guarantee that a clear->dirty 162462306a36Sopenharmony_ci * transition on a cached TLB entry is written through 162562306a36Sopenharmony_ci * and traps if the PTE is unmapped. 162662306a36Sopenharmony_ci */ 162762306a36Sopenharmony_ci pteval = ptep_get_and_clear(mm, address, pvmw.pte); 162862306a36Sopenharmony_ci 162962306a36Sopenharmony_ci set_tlb_ubc_flush_pending(mm, pteval, address); 163062306a36Sopenharmony_ci } else { 163162306a36Sopenharmony_ci pteval = ptep_clear_flush(vma, address, pvmw.pte); 163262306a36Sopenharmony_ci } 163362306a36Sopenharmony_ci } 163462306a36Sopenharmony_ci 163562306a36Sopenharmony_ci /* 163662306a36Sopenharmony_ci * Now the pte is cleared. If this pte was uffd-wp armed, 163762306a36Sopenharmony_ci * we may want to replace a none pte with a marker pte if 163862306a36Sopenharmony_ci * it's file-backed, so we don't lose the tracking info. 163962306a36Sopenharmony_ci */ 164062306a36Sopenharmony_ci pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); 164162306a36Sopenharmony_ci 164262306a36Sopenharmony_ci /* Set the dirty flag on the folio now the pte is gone. 
*/ 164362306a36Sopenharmony_ci if (pte_dirty(pteval)) 164462306a36Sopenharmony_ci folio_mark_dirty(folio); 164562306a36Sopenharmony_ci 164662306a36Sopenharmony_ci /* Update high watermark before we lower rss */ 164762306a36Sopenharmony_ci update_hiwater_rss(mm); 164862306a36Sopenharmony_ci 164962306a36Sopenharmony_ci if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { 165062306a36Sopenharmony_ci pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 165162306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 165262306a36Sopenharmony_ci hugetlb_count_sub(folio_nr_pages(folio), mm); 165362306a36Sopenharmony_ci set_huge_pte_at(mm, address, pvmw.pte, pteval, 165462306a36Sopenharmony_ci hsz); 165562306a36Sopenharmony_ci } else { 165662306a36Sopenharmony_ci dec_mm_counter(mm, mm_counter(&folio->page)); 165762306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 165862306a36Sopenharmony_ci } 165962306a36Sopenharmony_ci 166062306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE 166162306a36Sopenharmony_ci } else if ((vma->vm_flags & VM_PURGEABLE) || (pte_unused(pteval) && 166262306a36Sopenharmony_ci !userfaultfd_armed(vma))) { 166362306a36Sopenharmony_ci#else 166462306a36Sopenharmony_ci } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { 166562306a36Sopenharmony_ci#endif 166662306a36Sopenharmony_ci#ifdef CONFIG_MEM_PURGEABLE 166762306a36Sopenharmony_ci if (vma->vm_flags & VM_PURGEABLE) 166862306a36Sopenharmony_ci unlock_uxpte(vma, address); 166962306a36Sopenharmony_ci#endif 167062306a36Sopenharmony_ci 167162306a36Sopenharmony_ci /* 167262306a36Sopenharmony_ci * The guest indicated that the page content is of no 167362306a36Sopenharmony_ci * interest anymore. Simply discard the pte, vmscan 167462306a36Sopenharmony_ci * will take care of the rest. 167562306a36Sopenharmony_ci * A future reference will then fault in a new zero 167662306a36Sopenharmony_ci * page. When userfaultfd is active, we must not drop 167762306a36Sopenharmony_ci * this page though, as its main user (postcopy 167862306a36Sopenharmony_ci * migration) will not expect userfaults on already 167962306a36Sopenharmony_ci * copied pages. 168062306a36Sopenharmony_ci */ 168162306a36Sopenharmony_ci dec_mm_counter(mm, mm_counter(&folio->page)); 168262306a36Sopenharmony_ci } else if (folio_test_anon(folio)) { 168362306a36Sopenharmony_ci swp_entry_t entry = page_swap_entry(subpage); 168462306a36Sopenharmony_ci pte_t swp_pte; 168562306a36Sopenharmony_ci /* 168662306a36Sopenharmony_ci * Store the swap location in the pte. 168762306a36Sopenharmony_ci * See handle_pte_fault() ... 
168862306a36Sopenharmony_ci */ 168962306a36Sopenharmony_ci if (unlikely(folio_test_swapbacked(folio) != 169062306a36Sopenharmony_ci folio_test_swapcache(folio))) { 169162306a36Sopenharmony_ci WARN_ON_ONCE(1); 169262306a36Sopenharmony_ci ret = false; 169362306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 169462306a36Sopenharmony_ci break; 169562306a36Sopenharmony_ci } 169662306a36Sopenharmony_ci 169762306a36Sopenharmony_ci /* MADV_FREE page check */ 169862306a36Sopenharmony_ci if (!folio_test_swapbacked(folio)) { 169962306a36Sopenharmony_ci int ref_count, map_count; 170062306a36Sopenharmony_ci 170162306a36Sopenharmony_ci /* 170262306a36Sopenharmony_ci * Synchronize with gup_pte_range(): 170362306a36Sopenharmony_ci * - clear PTE; barrier; read refcount 170462306a36Sopenharmony_ci * - inc refcount; barrier; read PTE 170562306a36Sopenharmony_ci */ 170662306a36Sopenharmony_ci smp_mb(); 170762306a36Sopenharmony_ci 170862306a36Sopenharmony_ci ref_count = folio_ref_count(folio); 170962306a36Sopenharmony_ci map_count = folio_mapcount(folio); 171062306a36Sopenharmony_ci 171162306a36Sopenharmony_ci /* 171262306a36Sopenharmony_ci * Order reads for page refcount and dirty flag 171362306a36Sopenharmony_ci * (see comments in __remove_mapping()). 171462306a36Sopenharmony_ci */ 171562306a36Sopenharmony_ci smp_rmb(); 171662306a36Sopenharmony_ci 171762306a36Sopenharmony_ci /* 171862306a36Sopenharmony_ci * The only page refs must be one from isolation 171962306a36Sopenharmony_ci * plus the rmap(s) (dropped by discard:). 172062306a36Sopenharmony_ci */ 172162306a36Sopenharmony_ci if (ref_count == 1 + map_count && 172262306a36Sopenharmony_ci !folio_test_dirty(folio)) { 172362306a36Sopenharmony_ci dec_mm_counter(mm, MM_ANONPAGES); 172462306a36Sopenharmony_ci goto discard; 172562306a36Sopenharmony_ci } 172662306a36Sopenharmony_ci 172762306a36Sopenharmony_ci /* 172862306a36Sopenharmony_ci * If the folio was redirtied, it cannot be 172962306a36Sopenharmony_ci * discarded. Remap the page to page table. 173062306a36Sopenharmony_ci */ 173162306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 173262306a36Sopenharmony_ci folio_set_swapbacked(folio); 173362306a36Sopenharmony_ci ret = false; 173462306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 173562306a36Sopenharmony_ci break; 173662306a36Sopenharmony_ci } 173762306a36Sopenharmony_ci 173862306a36Sopenharmony_ci if (swap_duplicate(entry) < 0) { 173962306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 174062306a36Sopenharmony_ci ret = false; 174162306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 174262306a36Sopenharmony_ci break; 174362306a36Sopenharmony_ci } 174462306a36Sopenharmony_ci if (arch_unmap_one(mm, vma, address, pteval) < 0) { 174562306a36Sopenharmony_ci swap_free(entry); 174662306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 174762306a36Sopenharmony_ci ret = false; 174862306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 174962306a36Sopenharmony_ci break; 175062306a36Sopenharmony_ci } 175162306a36Sopenharmony_ci 175262306a36Sopenharmony_ci /* See page_try_share_anon_rmap(): clear PTE first. 
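 * (page_try_share_anon_rmap() relies on the PTE having been cleared and
 *  flushed above, so concurrent GUP-fast backs off instead of taking a
 *  new pin while the page stops being anon-exclusive.)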
*/ 175362306a36Sopenharmony_ci if (anon_exclusive && 175462306a36Sopenharmony_ci page_try_share_anon_rmap(subpage)) { 175562306a36Sopenharmony_ci swap_free(entry); 175662306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 175762306a36Sopenharmony_ci ret = false; 175862306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 175962306a36Sopenharmony_ci break; 176062306a36Sopenharmony_ci } 176162306a36Sopenharmony_ci if (list_empty(&mm->mmlist)) { 176262306a36Sopenharmony_ci spin_lock(&mmlist_lock); 176362306a36Sopenharmony_ci if (list_empty(&mm->mmlist)) 176462306a36Sopenharmony_ci list_add(&mm->mmlist, &init_mm.mmlist); 176562306a36Sopenharmony_ci spin_unlock(&mmlist_lock); 176662306a36Sopenharmony_ci } 176762306a36Sopenharmony_ci dec_mm_counter(mm, MM_ANONPAGES); 176862306a36Sopenharmony_ci inc_mm_counter(mm, MM_SWAPENTS); 176962306a36Sopenharmony_ci swp_pte = swp_entry_to_pte(entry); 177062306a36Sopenharmony_ci if (anon_exclusive) 177162306a36Sopenharmony_ci swp_pte = pte_swp_mkexclusive(swp_pte); 177262306a36Sopenharmony_ci if (pte_soft_dirty(pteval)) 177362306a36Sopenharmony_ci swp_pte = pte_swp_mksoft_dirty(swp_pte); 177462306a36Sopenharmony_ci if (pte_uffd_wp(pteval)) 177562306a36Sopenharmony_ci swp_pte = pte_swp_mkuffd_wp(swp_pte); 177662306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, swp_pte); 177762306a36Sopenharmony_ci } else { 177862306a36Sopenharmony_ci /* 177962306a36Sopenharmony_ci * This is a locked file-backed folio, 178062306a36Sopenharmony_ci * so it cannot be removed from the page 178162306a36Sopenharmony_ci * cache and replaced by a new folio before 178262306a36Sopenharmony_ci * mmu_notifier_invalidate_range_end, so no 178362306a36Sopenharmony_ci * concurrent thread might update its page table 178462306a36Sopenharmony_ci * to point at a new folio while a device is 178562306a36Sopenharmony_ci * still using this folio. 178662306a36Sopenharmony_ci * 178762306a36Sopenharmony_ci * See Documentation/mm/mmu_notifier.rst 178862306a36Sopenharmony_ci */ 178962306a36Sopenharmony_ci dec_mm_counter(mm, mm_counter_file(&folio->page)); 179062306a36Sopenharmony_ci } 179162306a36Sopenharmony_cidiscard: 179262306a36Sopenharmony_ci page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); 179362306a36Sopenharmony_ci if (vma->vm_flags & VM_LOCKED) 179462306a36Sopenharmony_ci mlock_drain_local(); 179562306a36Sopenharmony_ci folio_put(folio); 179662306a36Sopenharmony_ci } 179762306a36Sopenharmony_ci 179862306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 179962306a36Sopenharmony_ci 180062306a36Sopenharmony_ci return ret; 180162306a36Sopenharmony_ci} 180262306a36Sopenharmony_ci 180362306a36Sopenharmony_cistatic bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) 180462306a36Sopenharmony_ci{ 180562306a36Sopenharmony_ci return vma_is_temporary_stack(vma); 180662306a36Sopenharmony_ci} 180762306a36Sopenharmony_ci 180862306a36Sopenharmony_cistatic int folio_not_mapped(struct folio *folio) 180962306a36Sopenharmony_ci{ 181062306a36Sopenharmony_ci return !folio_mapped(folio); 181162306a36Sopenharmony_ci} 181262306a36Sopenharmony_ci 181362306a36Sopenharmony_ci/** 181462306a36Sopenharmony_ci * try_to_unmap - Try to remove all page table mappings to a folio. 181562306a36Sopenharmony_ci * @folio: The folio to unmap. 181662306a36Sopenharmony_ci * @flags: action and flags 181762306a36Sopenharmony_ci * 181862306a36Sopenharmony_ci * Tries to remove all the page table entries which are mapping this 181962306a36Sopenharmony_ci * folio. 
It is the caller's responsibility to check if the folio is 182062306a36Sopenharmony_ci * still mapped if needed (use TTU_SYNC to prevent accounting races). 182162306a36Sopenharmony_ci * 182262306a36Sopenharmony_ci * Context: Caller must hold the folio lock. 182362306a36Sopenharmony_ci */ 182462306a36Sopenharmony_civoid try_to_unmap(struct folio *folio, enum ttu_flags flags) 182562306a36Sopenharmony_ci{ 182662306a36Sopenharmony_ci struct rmap_walk_control rwc = { 182762306a36Sopenharmony_ci .rmap_one = try_to_unmap_one, 182862306a36Sopenharmony_ci .arg = (void *)flags, 182962306a36Sopenharmony_ci .done = folio_not_mapped, 183062306a36Sopenharmony_ci .anon_lock = folio_lock_anon_vma_read, 183162306a36Sopenharmony_ci }; 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci if (flags & TTU_RMAP_LOCKED) 183462306a36Sopenharmony_ci rmap_walk_locked(folio, &rwc); 183562306a36Sopenharmony_ci else 183662306a36Sopenharmony_ci rmap_walk(folio, &rwc); 183762306a36Sopenharmony_ci} 183862306a36Sopenharmony_ci 183962306a36Sopenharmony_ci/* 184062306a36Sopenharmony_ci * @arg: enum ttu_flags will be passed to this argument. 184162306a36Sopenharmony_ci * 184262306a36Sopenharmony_ci * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs 184362306a36Sopenharmony_ci * containing migration entries. 184462306a36Sopenharmony_ci */ 184562306a36Sopenharmony_cistatic bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, 184662306a36Sopenharmony_ci unsigned long address, void *arg) 184762306a36Sopenharmony_ci{ 184862306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 184962306a36Sopenharmony_ci DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); 185062306a36Sopenharmony_ci pte_t pteval; 185162306a36Sopenharmony_ci struct page *subpage; 185262306a36Sopenharmony_ci bool anon_exclusive, ret = true; 185362306a36Sopenharmony_ci struct mmu_notifier_range range; 185462306a36Sopenharmony_ci enum ttu_flags flags = (enum ttu_flags)(long)arg; 185562306a36Sopenharmony_ci unsigned long pfn; 185662306a36Sopenharmony_ci unsigned long hsz = 0; 185762306a36Sopenharmony_ci 185862306a36Sopenharmony_ci /* 185962306a36Sopenharmony_ci * When racing against e.g. zap_pte_range() on another cpu, 186062306a36Sopenharmony_ci * in between its ptep_get_and_clear_full() and page_remove_rmap(), 186162306a36Sopenharmony_ci * try_to_migrate() may return before page_mapped() has become false, 186262306a36Sopenharmony_ci * if page table locking is skipped: use TTU_SYNC to wait for that. 186362306a36Sopenharmony_ci */ 186462306a36Sopenharmony_ci if (flags & TTU_SYNC) 186562306a36Sopenharmony_ci pvmw.flags = PVMW_SYNC; 186662306a36Sopenharmony_ci 186762306a36Sopenharmony_ci /* 186862306a36Sopenharmony_ci * unmap_page() in mm/huge_memory.c is the only user of migration with 186962306a36Sopenharmony_ci * TTU_SPLIT_HUGE_PMD and it wants to freeze. 187062306a36Sopenharmony_ci */ 187162306a36Sopenharmony_ci if (flags & TTU_SPLIT_HUGE_PMD) 187262306a36Sopenharmony_ci split_huge_pmd_address(vma, address, true, folio); 187362306a36Sopenharmony_ci 187462306a36Sopenharmony_ci /* 187562306a36Sopenharmony_ci * For THP, we have to assume the worse case ie pmd for invalidation. 187662306a36Sopenharmony_ci * For hugetlb, it could be much worse if we need to do pud 187762306a36Sopenharmony_ci * invalidation in the case of pmd sharing. 
187862306a36Sopenharmony_ci *
187962306a36Sopenharmony_ci * Note that the page cannot be freed in this function as call of
188062306a36Sopenharmony_ci * try_to_unmap() must hold a reference on the page.
188162306a36Sopenharmony_ci */
188262306a36Sopenharmony_ci range.end = vma_address_end(&pvmw);
188362306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
188462306a36Sopenharmony_ci address, range.end);
188562306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) {
188662306a36Sopenharmony_ci /*
188762306a36Sopenharmony_ci * If sharing is possible, start and end will be adjusted
188862306a36Sopenharmony_ci * accordingly.
188962306a36Sopenharmony_ci */
189062306a36Sopenharmony_ci adjust_range_if_pmd_sharing_possible(vma, &range.start,
189162306a36Sopenharmony_ci &range.end);
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci /* We need the huge page size for set_huge_pte_at() */
189462306a36Sopenharmony_ci hsz = huge_page_size(hstate_vma(vma));
189562306a36Sopenharmony_ci }
189662306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range);
189762306a36Sopenharmony_ci
189862306a36Sopenharmony_ci while (page_vma_mapped_walk(&pvmw)) {
189962306a36Sopenharmony_ci#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
190062306a36Sopenharmony_ci /* PMD-mapped THP migration entry */
190162306a36Sopenharmony_ci if (!pvmw.pte) {
190262306a36Sopenharmony_ci subpage = folio_page(folio,
190362306a36Sopenharmony_ci pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
190462306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
190562306a36Sopenharmony_ci !folio_test_pmd_mappable(folio), folio);
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_ci if (set_pmd_migration_entry(&pvmw, subpage)) {
190862306a36Sopenharmony_ci ret = false;
190962306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw);
191062306a36Sopenharmony_ci break;
191162306a36Sopenharmony_ci }
191262306a36Sopenharmony_ci continue;
191362306a36Sopenharmony_ci }
191462306a36Sopenharmony_ci#endif
191562306a36Sopenharmony_ci
191662306a36Sopenharmony_ci /* Unexpected PMD-mapped THP? */
191762306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!pvmw.pte, folio);
191862306a36Sopenharmony_ci
191962306a36Sopenharmony_ci pfn = pte_pfn(ptep_get(pvmw.pte));
192062306a36Sopenharmony_ci
192162306a36Sopenharmony_ci if (folio_is_zone_device(folio)) {
192262306a36Sopenharmony_ci /*
192362306a36Sopenharmony_ci * Our PTE is a non-present device exclusive entry and
192462306a36Sopenharmony_ci * calculating the subpage as for the common case would
192562306a36Sopenharmony_ci * result in an invalid pointer.
192662306a36Sopenharmony_ci *
192762306a36Sopenharmony_ci * Since only PAGE_SIZE pages can currently be
192862306a36Sopenharmony_ci * migrated, just set it to page. This will need to be
192962306a36Sopenharmony_ci * changed when hugepage migrations to device private
193062306a36Sopenharmony_ci * memory are supported.
193162306a36Sopenharmony_ci */ 193262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio); 193362306a36Sopenharmony_ci subpage = &folio->page; 193462306a36Sopenharmony_ci } else { 193562306a36Sopenharmony_ci subpage = folio_page(folio, pfn - folio_pfn(folio)); 193662306a36Sopenharmony_ci } 193762306a36Sopenharmony_ci address = pvmw.address; 193862306a36Sopenharmony_ci anon_exclusive = folio_test_anon(folio) && 193962306a36Sopenharmony_ci PageAnonExclusive(subpage); 194062306a36Sopenharmony_ci 194162306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 194262306a36Sopenharmony_ci bool anon = folio_test_anon(folio); 194362306a36Sopenharmony_ci 194462306a36Sopenharmony_ci /* 194562306a36Sopenharmony_ci * huge_pmd_unshare may unmap an entire PMD page. 194662306a36Sopenharmony_ci * There is no way of knowing exactly which PMDs may 194762306a36Sopenharmony_ci * be cached for this mm, so we must flush them all. 194862306a36Sopenharmony_ci * start/end were already adjusted above to cover this 194962306a36Sopenharmony_ci * range. 195062306a36Sopenharmony_ci */ 195162306a36Sopenharmony_ci flush_cache_range(vma, range.start, range.end); 195262306a36Sopenharmony_ci 195362306a36Sopenharmony_ci /* 195462306a36Sopenharmony_ci * To call huge_pmd_unshare, i_mmap_rwsem must be 195562306a36Sopenharmony_ci * held in write mode. Caller needs to explicitly 195662306a36Sopenharmony_ci * do this outside rmap routines. 195762306a36Sopenharmony_ci * 195862306a36Sopenharmony_ci * We also must hold hugetlb vma_lock in write mode. 195962306a36Sopenharmony_ci * Lock order dictates acquiring vma_lock BEFORE 196062306a36Sopenharmony_ci * i_mmap_rwsem. We can only try lock here and 196162306a36Sopenharmony_ci * fail if unsuccessful. 196262306a36Sopenharmony_ci */ 196362306a36Sopenharmony_ci if (!anon) { 196462306a36Sopenharmony_ci VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); 196562306a36Sopenharmony_ci if (!hugetlb_vma_trylock_write(vma)) { 196662306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 196762306a36Sopenharmony_ci ret = false; 196862306a36Sopenharmony_ci break; 196962306a36Sopenharmony_ci } 197062306a36Sopenharmony_ci if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { 197162306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 197262306a36Sopenharmony_ci flush_tlb_range(vma, 197362306a36Sopenharmony_ci range.start, range.end); 197462306a36Sopenharmony_ci 197562306a36Sopenharmony_ci /* 197662306a36Sopenharmony_ci * The ref count of the PMD page was 197762306a36Sopenharmony_ci * dropped which is part of the way map 197862306a36Sopenharmony_ci * counting is done for shared PMDs. 197962306a36Sopenharmony_ci * Return 'true' here. When there is 198062306a36Sopenharmony_ci * no other sharing, huge_pmd_unshare 198162306a36Sopenharmony_ci * returns false and we will unmap the 198262306a36Sopenharmony_ci * actual page and drop map count 198362306a36Sopenharmony_ci * to zero. 198462306a36Sopenharmony_ci */ 198562306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 198662306a36Sopenharmony_ci break; 198762306a36Sopenharmony_ci } 198862306a36Sopenharmony_ci hugetlb_vma_unlock_write(vma); 198962306a36Sopenharmony_ci } 199062306a36Sopenharmony_ci /* Nuke the hugetlb page table entry */ 199162306a36Sopenharmony_ci pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); 199262306a36Sopenharmony_ci } else { 199362306a36Sopenharmony_ci flush_cache_page(vma, address, pfn); 199462306a36Sopenharmony_ci /* Nuke the page table entry. 
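 * (With should_defer_flush() the PTE is cleared now and the TLB flush
 *  is queued via set_tlb_ubc_flush_pending(); otherwise it is flushed
 *  synchronously by ptep_clear_flush().)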
*/ 199562306a36Sopenharmony_ci if (should_defer_flush(mm, flags)) { 199662306a36Sopenharmony_ci /* 199762306a36Sopenharmony_ci * We clear the PTE but do not flush so potentially 199862306a36Sopenharmony_ci * a remote CPU could still be writing to the folio. 199962306a36Sopenharmony_ci * If the entry was previously clean then the 200062306a36Sopenharmony_ci * architecture must guarantee that a clear->dirty 200162306a36Sopenharmony_ci * transition on a cached TLB entry is written through 200262306a36Sopenharmony_ci * and traps if the PTE is unmapped. 200362306a36Sopenharmony_ci */ 200462306a36Sopenharmony_ci pteval = ptep_get_and_clear(mm, address, pvmw.pte); 200562306a36Sopenharmony_ci 200662306a36Sopenharmony_ci set_tlb_ubc_flush_pending(mm, pteval, address); 200762306a36Sopenharmony_ci } else { 200862306a36Sopenharmony_ci pteval = ptep_clear_flush(vma, address, pvmw.pte); 200962306a36Sopenharmony_ci } 201062306a36Sopenharmony_ci } 201162306a36Sopenharmony_ci 201262306a36Sopenharmony_ci /* Set the dirty flag on the folio now the pte is gone. */ 201362306a36Sopenharmony_ci if (pte_dirty(pteval)) 201462306a36Sopenharmony_ci folio_mark_dirty(folio); 201562306a36Sopenharmony_ci 201662306a36Sopenharmony_ci /* Update high watermark before we lower rss */ 201762306a36Sopenharmony_ci update_hiwater_rss(mm); 201862306a36Sopenharmony_ci 201962306a36Sopenharmony_ci if (folio_is_device_private(folio)) { 202062306a36Sopenharmony_ci unsigned long pfn = folio_pfn(folio); 202162306a36Sopenharmony_ci swp_entry_t entry; 202262306a36Sopenharmony_ci pte_t swp_pte; 202362306a36Sopenharmony_ci 202462306a36Sopenharmony_ci if (anon_exclusive) 202562306a36Sopenharmony_ci BUG_ON(page_try_share_anon_rmap(subpage)); 202662306a36Sopenharmony_ci 202762306a36Sopenharmony_ci /* 202862306a36Sopenharmony_ci * Store the pfn of the page in a special migration 202962306a36Sopenharmony_ci * pte. do_swap_page() will wait until the migration 203062306a36Sopenharmony_ci * pte is removed and then restart fault handling. 203162306a36Sopenharmony_ci */ 203262306a36Sopenharmony_ci entry = pte_to_swp_entry(pteval); 203362306a36Sopenharmony_ci if (is_writable_device_private_entry(entry)) 203462306a36Sopenharmony_ci entry = make_writable_migration_entry(pfn); 203562306a36Sopenharmony_ci else if (anon_exclusive) 203662306a36Sopenharmony_ci entry = make_readable_exclusive_migration_entry(pfn); 203762306a36Sopenharmony_ci else 203862306a36Sopenharmony_ci entry = make_readable_migration_entry(pfn); 203962306a36Sopenharmony_ci swp_pte = swp_entry_to_pte(entry); 204062306a36Sopenharmony_ci 204162306a36Sopenharmony_ci /* 204262306a36Sopenharmony_ci * pteval maps a zone device page and is therefore 204362306a36Sopenharmony_ci * a swap pte. 204462306a36Sopenharmony_ci */ 204562306a36Sopenharmony_ci if (pte_swp_soft_dirty(pteval)) 204662306a36Sopenharmony_ci swp_pte = pte_swp_mksoft_dirty(swp_pte); 204762306a36Sopenharmony_ci if (pte_swp_uffd_wp(pteval)) 204862306a36Sopenharmony_ci swp_pte = pte_swp_mkuffd_wp(swp_pte); 204962306a36Sopenharmony_ci set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 205062306a36Sopenharmony_ci trace_set_migration_pte(pvmw.address, pte_val(swp_pte), 205162306a36Sopenharmony_ci compound_order(&folio->page)); 205262306a36Sopenharmony_ci /* 205362306a36Sopenharmony_ci * No need to invalidate here it will synchronize on 205462306a36Sopenharmony_ci * against the special swap migration pte. 
205562306a36Sopenharmony_ci */ 205662306a36Sopenharmony_ci } else if (PageHWPoison(subpage)) { 205762306a36Sopenharmony_ci pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 205862306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 205962306a36Sopenharmony_ci hugetlb_count_sub(folio_nr_pages(folio), mm); 206062306a36Sopenharmony_ci set_huge_pte_at(mm, address, pvmw.pte, pteval, 206162306a36Sopenharmony_ci hsz); 206262306a36Sopenharmony_ci } else { 206362306a36Sopenharmony_ci dec_mm_counter(mm, mm_counter(&folio->page)); 206462306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 206562306a36Sopenharmony_ci } 206662306a36Sopenharmony_ci 206762306a36Sopenharmony_ci } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { 206862306a36Sopenharmony_ci /* 206962306a36Sopenharmony_ci * The guest indicated that the page content is of no 207062306a36Sopenharmony_ci * interest anymore. Simply discard the pte, vmscan 207162306a36Sopenharmony_ci * will take care of the rest. 207262306a36Sopenharmony_ci * A future reference will then fault in a new zero 207362306a36Sopenharmony_ci * page. When userfaultfd is active, we must not drop 207462306a36Sopenharmony_ci * this page though, as its main user (postcopy 207562306a36Sopenharmony_ci * migration) will not expect userfaults on already 207662306a36Sopenharmony_ci * copied pages. 207762306a36Sopenharmony_ci */ 207862306a36Sopenharmony_ci dec_mm_counter(mm, mm_counter(&folio->page)); 207962306a36Sopenharmony_ci } else { 208062306a36Sopenharmony_ci swp_entry_t entry; 208162306a36Sopenharmony_ci pte_t swp_pte; 208262306a36Sopenharmony_ci 208362306a36Sopenharmony_ci if (arch_unmap_one(mm, vma, address, pteval) < 0) { 208462306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) 208562306a36Sopenharmony_ci set_huge_pte_at(mm, address, pvmw.pte, 208662306a36Sopenharmony_ci pteval, hsz); 208762306a36Sopenharmony_ci else 208862306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 208962306a36Sopenharmony_ci ret = false; 209062306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 209162306a36Sopenharmony_ci break; 209262306a36Sopenharmony_ci } 209362306a36Sopenharmony_ci VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) && 209462306a36Sopenharmony_ci !anon_exclusive, subpage); 209562306a36Sopenharmony_ci 209662306a36Sopenharmony_ci /* See page_try_share_anon_rmap(): clear PTE first. */ 209762306a36Sopenharmony_ci if (anon_exclusive && 209862306a36Sopenharmony_ci page_try_share_anon_rmap(subpage)) { 209962306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) 210062306a36Sopenharmony_ci set_huge_pte_at(mm, address, pvmw.pte, 210162306a36Sopenharmony_ci pteval, hsz); 210262306a36Sopenharmony_ci else 210362306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, pteval); 210462306a36Sopenharmony_ci ret = false; 210562306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 210662306a36Sopenharmony_ci break; 210762306a36Sopenharmony_ci } 210862306a36Sopenharmony_ci 210962306a36Sopenharmony_ci /* 211062306a36Sopenharmony_ci * Store the pfn of the page in a special migration 211162306a36Sopenharmony_ci * pte. do_swap_page() will wait until the migration 211262306a36Sopenharmony_ci * pte is removed and then restart fault handling. 
211362306a36Sopenharmony_ci			 */
211462306a36Sopenharmony_ci			if (pte_write(pteval))
211562306a36Sopenharmony_ci				entry = make_writable_migration_entry(
211662306a36Sopenharmony_ci							page_to_pfn(subpage));
211762306a36Sopenharmony_ci			else if (anon_exclusive)
211862306a36Sopenharmony_ci				entry = make_readable_exclusive_migration_entry(
211962306a36Sopenharmony_ci							page_to_pfn(subpage));
212062306a36Sopenharmony_ci			else
212162306a36Sopenharmony_ci				entry = make_readable_migration_entry(
212262306a36Sopenharmony_ci							page_to_pfn(subpage));
212362306a36Sopenharmony_ci			if (pte_young(pteval))
212462306a36Sopenharmony_ci				entry = make_migration_entry_young(entry);
212562306a36Sopenharmony_ci			if (pte_dirty(pteval))
212662306a36Sopenharmony_ci				entry = make_migration_entry_dirty(entry);
212762306a36Sopenharmony_ci			swp_pte = swp_entry_to_pte(entry);
212862306a36Sopenharmony_ci			if (pte_soft_dirty(pteval))
212962306a36Sopenharmony_ci				swp_pte = pte_swp_mksoft_dirty(swp_pte);
213062306a36Sopenharmony_ci			if (pte_uffd_wp(pteval))
213162306a36Sopenharmony_ci				swp_pte = pte_swp_mkuffd_wp(swp_pte);
213262306a36Sopenharmony_ci			if (folio_test_hugetlb(folio))
213362306a36Sopenharmony_ci				set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
213462306a36Sopenharmony_ci						hsz);
213562306a36Sopenharmony_ci			else
213662306a36Sopenharmony_ci				set_pte_at(mm, address, pvmw.pte, swp_pte);
213762306a36Sopenharmony_ci			trace_set_migration_pte(address, pte_val(swp_pte),
213862306a36Sopenharmony_ci						compound_order(&folio->page));
213962306a36Sopenharmony_ci			/*
214062306a36Sopenharmony_ci			 * No need to invalidate here; it will synchronize
214162306a36Sopenharmony_ci			 * against the special swap migration pte.
214262306a36Sopenharmony_ci			 */
214362306a36Sopenharmony_ci		}
214462306a36Sopenharmony_ci
214562306a36Sopenharmony_ci		page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
214662306a36Sopenharmony_ci		if (vma->vm_flags & VM_LOCKED)
214762306a36Sopenharmony_ci			mlock_drain_local();
214862306a36Sopenharmony_ci		folio_put(folio);
214962306a36Sopenharmony_ci	}
215062306a36Sopenharmony_ci
215162306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
215262306a36Sopenharmony_ci
215362306a36Sopenharmony_ci	return ret;
215462306a36Sopenharmony_ci}
215562306a36Sopenharmony_ci
215662306a36Sopenharmony_ci/**
215762306a36Sopenharmony_ci * try_to_migrate - try to replace all page table mappings with swap entries
215862306a36Sopenharmony_ci * @folio: the folio to replace page table entries for
215962306a36Sopenharmony_ci * @flags: action and flags
216062306a36Sopenharmony_ci *
216162306a36Sopenharmony_ci * Tries to remove all the page table entries which are mapping this folio and
216262306a36Sopenharmony_ci * replace them with special swap entries. Caller must hold the folio lock.
216362306a36Sopenharmony_ci */
216462306a36Sopenharmony_civoid try_to_migrate(struct folio *folio, enum ttu_flags flags)
216562306a36Sopenharmony_ci{
216662306a36Sopenharmony_ci	struct rmap_walk_control rwc = {
216762306a36Sopenharmony_ci		.rmap_one = try_to_migrate_one,
216862306a36Sopenharmony_ci		.arg = (void *)flags,
216962306a36Sopenharmony_ci		.done = folio_not_mapped,
217062306a36Sopenharmony_ci		.anon_lock = folio_lock_anon_vma_read,
217162306a36Sopenharmony_ci	};
217262306a36Sopenharmony_ci
217362306a36Sopenharmony_ci	/*
217462306a36Sopenharmony_ci	 * Migration always ignores mlock and only supports the TTU_RMAP_LOCKED,
217562306a36Sopenharmony_ci	 * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags.
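	 *
	 * Illustrative sketch only (simplified, not literal code from the
	 * migration path): a typical caller unmaps the source folio here and
	 * restores the mappings once the contents have been copied:
	 *
	 *	if (folio_mapped(src))
	 *		try_to_migrate(src, 0);
	 *	...
	 *	remove_migration_ptes(src, dst, false);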
217662306a36Sopenharmony_ci	 */
217762306a36Sopenharmony_ci	if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
217862306a36Sopenharmony_ci					TTU_SYNC | TTU_BATCH_FLUSH)))
217962306a36Sopenharmony_ci		return;
218062306a36Sopenharmony_ci
218162306a36Sopenharmony_ci	if (folio_is_zone_device(folio) &&
218262306a36Sopenharmony_ci	    (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
218362306a36Sopenharmony_ci		return;
218462306a36Sopenharmony_ci
218562306a36Sopenharmony_ci	/*
218662306a36Sopenharmony_ci	 * During exec, a temporary VMA is set up and later moved.
218762306a36Sopenharmony_ci	 * The VMA is moved under the anon_vma lock but not the
218862306a36Sopenharmony_ci	 * page tables, leading to a race where migration cannot
218962306a36Sopenharmony_ci	 * find the migration ptes. Rather than increasing the
219062306a36Sopenharmony_ci	 * locking requirements of exec(), migration skips
219162306a36Sopenharmony_ci	 * temporary VMAs until after exec() completes.
219262306a36Sopenharmony_ci	 */
219362306a36Sopenharmony_ci	if (!folio_test_ksm(folio) && folio_test_anon(folio))
219462306a36Sopenharmony_ci		rwc.invalid_vma = invalid_migration_vma;
219562306a36Sopenharmony_ci
219662306a36Sopenharmony_ci	if (flags & TTU_RMAP_LOCKED)
219762306a36Sopenharmony_ci		rmap_walk_locked(folio, &rwc);
219862306a36Sopenharmony_ci	else
219962306a36Sopenharmony_ci		rmap_walk(folio, &rwc);
220062306a36Sopenharmony_ci}
220162306a36Sopenharmony_ci
220262306a36Sopenharmony_ci#ifdef CONFIG_DEVICE_PRIVATE
220362306a36Sopenharmony_cistruct make_exclusive_args {
220462306a36Sopenharmony_ci	struct mm_struct *mm;
220562306a36Sopenharmony_ci	unsigned long address;
220662306a36Sopenharmony_ci	void *owner;
220762306a36Sopenharmony_ci	bool valid;
220862306a36Sopenharmony_ci};
220962306a36Sopenharmony_ci
221062306a36Sopenharmony_cistatic bool page_make_device_exclusive_one(struct folio *folio,
221162306a36Sopenharmony_ci		struct vm_area_struct *vma, unsigned long address, void *priv)
221262306a36Sopenharmony_ci{
221362306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
221462306a36Sopenharmony_ci	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
221562306a36Sopenharmony_ci	struct make_exclusive_args *args = priv;
221662306a36Sopenharmony_ci	pte_t pteval;
221762306a36Sopenharmony_ci	struct page *subpage;
221862306a36Sopenharmony_ci	bool ret = true;
221962306a36Sopenharmony_ci	struct mmu_notifier_range range;
222062306a36Sopenharmony_ci	swp_entry_t entry;
222162306a36Sopenharmony_ci	pte_t swp_pte;
222262306a36Sopenharmony_ci	pte_t ptent;
222362306a36Sopenharmony_ci
222462306a36Sopenharmony_ci	mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
222562306a36Sopenharmony_ci				      vma->vm_mm, address, min(vma->vm_end,
222662306a36Sopenharmony_ci				      address + folio_size(folio)),
222762306a36Sopenharmony_ci				      args->owner);
222862306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
222962306a36Sopenharmony_ci
223062306a36Sopenharmony_ci	while (page_vma_mapped_walk(&pvmw)) {
223162306a36Sopenharmony_ci		/* Unexpected PMD-mapped THP?
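		 * (make_device_exclusive_range() passes FOLL_SPLIT_PMD to GUP,
		 * so any THP should already have been split into PTE mappings
		 * before this walk runs.)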
*/ 223262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!pvmw.pte, folio); 223362306a36Sopenharmony_ci 223462306a36Sopenharmony_ci ptent = ptep_get(pvmw.pte); 223562306a36Sopenharmony_ci if (!pte_present(ptent)) { 223662306a36Sopenharmony_ci ret = false; 223762306a36Sopenharmony_ci page_vma_mapped_walk_done(&pvmw); 223862306a36Sopenharmony_ci break; 223962306a36Sopenharmony_ci } 224062306a36Sopenharmony_ci 224162306a36Sopenharmony_ci subpage = folio_page(folio, 224262306a36Sopenharmony_ci pte_pfn(ptent) - folio_pfn(folio)); 224362306a36Sopenharmony_ci address = pvmw.address; 224462306a36Sopenharmony_ci 224562306a36Sopenharmony_ci /* Nuke the page table entry. */ 224662306a36Sopenharmony_ci flush_cache_page(vma, address, pte_pfn(ptent)); 224762306a36Sopenharmony_ci pteval = ptep_clear_flush(vma, address, pvmw.pte); 224862306a36Sopenharmony_ci 224962306a36Sopenharmony_ci /* Set the dirty flag on the folio now the pte is gone. */ 225062306a36Sopenharmony_ci if (pte_dirty(pteval)) 225162306a36Sopenharmony_ci folio_mark_dirty(folio); 225262306a36Sopenharmony_ci 225362306a36Sopenharmony_ci /* 225462306a36Sopenharmony_ci * Check that our target page is still mapped at the expected 225562306a36Sopenharmony_ci * address. 225662306a36Sopenharmony_ci */ 225762306a36Sopenharmony_ci if (args->mm == mm && args->address == address && 225862306a36Sopenharmony_ci pte_write(pteval)) 225962306a36Sopenharmony_ci args->valid = true; 226062306a36Sopenharmony_ci 226162306a36Sopenharmony_ci /* 226262306a36Sopenharmony_ci * Store the pfn of the page in a special migration 226362306a36Sopenharmony_ci * pte. do_swap_page() will wait until the migration 226462306a36Sopenharmony_ci * pte is removed and then restart fault handling. 226562306a36Sopenharmony_ci */ 226662306a36Sopenharmony_ci if (pte_write(pteval)) 226762306a36Sopenharmony_ci entry = make_writable_device_exclusive_entry( 226862306a36Sopenharmony_ci page_to_pfn(subpage)); 226962306a36Sopenharmony_ci else 227062306a36Sopenharmony_ci entry = make_readable_device_exclusive_entry( 227162306a36Sopenharmony_ci page_to_pfn(subpage)); 227262306a36Sopenharmony_ci swp_pte = swp_entry_to_pte(entry); 227362306a36Sopenharmony_ci if (pte_soft_dirty(pteval)) 227462306a36Sopenharmony_ci swp_pte = pte_swp_mksoft_dirty(swp_pte); 227562306a36Sopenharmony_ci if (pte_uffd_wp(pteval)) 227662306a36Sopenharmony_ci swp_pte = pte_swp_mkuffd_wp(swp_pte); 227762306a36Sopenharmony_ci 227862306a36Sopenharmony_ci set_pte_at(mm, address, pvmw.pte, swp_pte); 227962306a36Sopenharmony_ci 228062306a36Sopenharmony_ci /* 228162306a36Sopenharmony_ci * There is a reference on the page for the swap entry which has 228262306a36Sopenharmony_ci * been removed, so shouldn't take another. 228362306a36Sopenharmony_ci */ 228462306a36Sopenharmony_ci page_remove_rmap(subpage, vma, false); 228562306a36Sopenharmony_ci } 228662306a36Sopenharmony_ci 228762306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 228862306a36Sopenharmony_ci 228962306a36Sopenharmony_ci return ret; 229062306a36Sopenharmony_ci} 229162306a36Sopenharmony_ci 229262306a36Sopenharmony_ci/** 229362306a36Sopenharmony_ci * folio_make_device_exclusive - Mark the folio exclusively owned by a device. 229462306a36Sopenharmony_ci * @folio: The folio to replace page table entries for. 229562306a36Sopenharmony_ci * @mm: The mm_struct where the folio is expected to be mapped. 229662306a36Sopenharmony_ci * @address: Address where the folio is expected to be mapped. 
229762306a36Sopenharmony_ci * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks 229862306a36Sopenharmony_ci * 229962306a36Sopenharmony_ci * Tries to remove all the page table entries which are mapping this 230062306a36Sopenharmony_ci * folio and replace them with special device exclusive swap entries to 230162306a36Sopenharmony_ci * grant a device exclusive access to the folio. 230262306a36Sopenharmony_ci * 230362306a36Sopenharmony_ci * Context: Caller must hold the folio lock. 230462306a36Sopenharmony_ci * Return: false if the page is still mapped, or if it could not be unmapped 230562306a36Sopenharmony_ci * from the expected address. Otherwise returns true (success). 230662306a36Sopenharmony_ci */ 230762306a36Sopenharmony_cistatic bool folio_make_device_exclusive(struct folio *folio, 230862306a36Sopenharmony_ci struct mm_struct *mm, unsigned long address, void *owner) 230962306a36Sopenharmony_ci{ 231062306a36Sopenharmony_ci struct make_exclusive_args args = { 231162306a36Sopenharmony_ci .mm = mm, 231262306a36Sopenharmony_ci .address = address, 231362306a36Sopenharmony_ci .owner = owner, 231462306a36Sopenharmony_ci .valid = false, 231562306a36Sopenharmony_ci }; 231662306a36Sopenharmony_ci struct rmap_walk_control rwc = { 231762306a36Sopenharmony_ci .rmap_one = page_make_device_exclusive_one, 231862306a36Sopenharmony_ci .done = folio_not_mapped, 231962306a36Sopenharmony_ci .anon_lock = folio_lock_anon_vma_read, 232062306a36Sopenharmony_ci .arg = &args, 232162306a36Sopenharmony_ci }; 232262306a36Sopenharmony_ci 232362306a36Sopenharmony_ci /* 232462306a36Sopenharmony_ci * Restrict to anonymous folios for now to avoid potential writeback 232562306a36Sopenharmony_ci * issues. 232662306a36Sopenharmony_ci */ 232762306a36Sopenharmony_ci if (!folio_test_anon(folio)) 232862306a36Sopenharmony_ci return false; 232962306a36Sopenharmony_ci 233062306a36Sopenharmony_ci rmap_walk(folio, &rwc); 233162306a36Sopenharmony_ci 233262306a36Sopenharmony_ci return args.valid && !folio_mapcount(folio); 233362306a36Sopenharmony_ci} 233462306a36Sopenharmony_ci 233562306a36Sopenharmony_ci/** 233662306a36Sopenharmony_ci * make_device_exclusive_range() - Mark a range for exclusive use by a device 233762306a36Sopenharmony_ci * @mm: mm_struct of associated target process 233862306a36Sopenharmony_ci * @start: start of the region to mark for exclusive device access 233962306a36Sopenharmony_ci * @end: end address of region 234062306a36Sopenharmony_ci * @pages: returns the pages which were successfully marked for exclusive access 234162306a36Sopenharmony_ci * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering 234262306a36Sopenharmony_ci * 234362306a36Sopenharmony_ci * Returns: number of pages found in the range by GUP. A page is marked for 234462306a36Sopenharmony_ci * exclusive access only if the page pointer is non-NULL. 234562306a36Sopenharmony_ci * 234662306a36Sopenharmony_ci * This function finds ptes mapping page(s) to the given address range, locks 234762306a36Sopenharmony_ci * them and replaces mappings with special swap entries preventing userspace CPU 234862306a36Sopenharmony_ci * access. On fault these entries are replaced with the original mapping after 234962306a36Sopenharmony_ci * calling MMU notifiers. 235062306a36Sopenharmony_ci * 235162306a36Sopenharmony_ci * A driver using this to program access from a device must use a mmu notifier 235262306a36Sopenharmony_ci * critical section to hold a device specific lock during programming. 
Once 235362306a36Sopenharmony_ci * programming is complete it should drop the page lock and reference after 235462306a36Sopenharmony_ci * which point CPU access to the page will revoke the exclusive access. 235562306a36Sopenharmony_ci */ 235662306a36Sopenharmony_ciint make_device_exclusive_range(struct mm_struct *mm, unsigned long start, 235762306a36Sopenharmony_ci unsigned long end, struct page **pages, 235862306a36Sopenharmony_ci void *owner) 235962306a36Sopenharmony_ci{ 236062306a36Sopenharmony_ci long npages = (end - start) >> PAGE_SHIFT; 236162306a36Sopenharmony_ci long i; 236262306a36Sopenharmony_ci 236362306a36Sopenharmony_ci npages = get_user_pages_remote(mm, start, npages, 236462306a36Sopenharmony_ci FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, 236562306a36Sopenharmony_ci pages, NULL); 236662306a36Sopenharmony_ci if (npages < 0) 236762306a36Sopenharmony_ci return npages; 236862306a36Sopenharmony_ci 236962306a36Sopenharmony_ci for (i = 0; i < npages; i++, start += PAGE_SIZE) { 237062306a36Sopenharmony_ci struct folio *folio = page_folio(pages[i]); 237162306a36Sopenharmony_ci if (PageTail(pages[i]) || !folio_trylock(folio)) { 237262306a36Sopenharmony_ci folio_put(folio); 237362306a36Sopenharmony_ci pages[i] = NULL; 237462306a36Sopenharmony_ci continue; 237562306a36Sopenharmony_ci } 237662306a36Sopenharmony_ci 237762306a36Sopenharmony_ci if (!folio_make_device_exclusive(folio, mm, start, owner)) { 237862306a36Sopenharmony_ci folio_unlock(folio); 237962306a36Sopenharmony_ci folio_put(folio); 238062306a36Sopenharmony_ci pages[i] = NULL; 238162306a36Sopenharmony_ci } 238262306a36Sopenharmony_ci } 238362306a36Sopenharmony_ci 238462306a36Sopenharmony_ci return npages; 238562306a36Sopenharmony_ci} 238662306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(make_device_exclusive_range); 238762306a36Sopenharmony_ci#endif 238862306a36Sopenharmony_ci 238962306a36Sopenharmony_civoid __put_anon_vma(struct anon_vma *anon_vma) 239062306a36Sopenharmony_ci{ 239162306a36Sopenharmony_ci struct anon_vma *root = anon_vma->root; 239262306a36Sopenharmony_ci 239362306a36Sopenharmony_ci anon_vma_free(anon_vma); 239462306a36Sopenharmony_ci if (root != anon_vma && atomic_dec_and_test(&root->refcount)) 239562306a36Sopenharmony_ci anon_vma_free(root); 239662306a36Sopenharmony_ci} 239762306a36Sopenharmony_ci 239862306a36Sopenharmony_cistatic struct anon_vma *rmap_walk_anon_lock(struct folio *folio, 239962306a36Sopenharmony_ci struct rmap_walk_control *rwc) 240062306a36Sopenharmony_ci{ 240162306a36Sopenharmony_ci struct anon_vma *anon_vma; 240262306a36Sopenharmony_ci 240362306a36Sopenharmony_ci if (rwc->anon_lock) 240462306a36Sopenharmony_ci return rwc->anon_lock(folio, rwc); 240562306a36Sopenharmony_ci 240662306a36Sopenharmony_ci /* 240762306a36Sopenharmony_ci * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() 240862306a36Sopenharmony_ci * because that depends on page_mapped(); but not all its usages 240962306a36Sopenharmony_ci * are holding mmap_lock. 
Users without mmap_lock are required to
241062306a36Sopenharmony_ci * take a reference count to prevent the anon_vma from disappearing.
241162306a36Sopenharmony_ci */
241262306a36Sopenharmony_ci	anon_vma = folio_anon_vma(folio);
241362306a36Sopenharmony_ci	if (!anon_vma)
241462306a36Sopenharmony_ci		return NULL;
241562306a36Sopenharmony_ci
241662306a36Sopenharmony_ci	if (anon_vma_trylock_read(anon_vma))
241762306a36Sopenharmony_ci		goto out;
241862306a36Sopenharmony_ci
241962306a36Sopenharmony_ci	if (rwc->try_lock) {
242062306a36Sopenharmony_ci		anon_vma = NULL;
242162306a36Sopenharmony_ci		rwc->contended = true;
242262306a36Sopenharmony_ci		goto out;
242362306a36Sopenharmony_ci	}
242462306a36Sopenharmony_ci
242562306a36Sopenharmony_ci	anon_vma_lock_read(anon_vma);
242662306a36Sopenharmony_ciout:
242762306a36Sopenharmony_ci	return anon_vma;
242862306a36Sopenharmony_ci}
242962306a36Sopenharmony_ci
243062306a36Sopenharmony_ci/*
243162306a36Sopenharmony_ci * rmap_walk_anon - do something to an anonymous page using the object-based
243262306a36Sopenharmony_ci * rmap method
243362306a36Sopenharmony_ci * @folio: the folio to be handled
243462306a36Sopenharmony_ci * @rwc: control variable according to each walk type
243562306a36Sopenharmony_ci * @locked: caller holds relevant rmap lock
243662306a36Sopenharmony_ci *
243762306a36Sopenharmony_ci * Find all the mappings of a folio using the mapping pointer and the vma
243862306a36Sopenharmony_ci * chains contained in the anon_vma struct it points to.
243962306a36Sopenharmony_ci */
244062306a36Sopenharmony_cistatic void rmap_walk_anon(struct folio *folio,
244162306a36Sopenharmony_ci		struct rmap_walk_control *rwc, bool locked)
244262306a36Sopenharmony_ci{
244362306a36Sopenharmony_ci	struct anon_vma *anon_vma;
244462306a36Sopenharmony_ci	pgoff_t pgoff_start, pgoff_end;
244562306a36Sopenharmony_ci	struct anon_vma_chain *avc;
244662306a36Sopenharmony_ci
244762306a36Sopenharmony_ci	if (locked) {
244862306a36Sopenharmony_ci		anon_vma = folio_anon_vma(folio);
244962306a36Sopenharmony_ci		/* Did the anon_vma disappear under us?
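		 * (The locked caller is expected to already hold the relevant
		 * rmap lock, see rmap_walk_locked(), so the anon_vma must
		 * still be alive here.)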
*/ 245062306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!anon_vma, folio); 245162306a36Sopenharmony_ci } else { 245262306a36Sopenharmony_ci anon_vma = rmap_walk_anon_lock(folio, rwc); 245362306a36Sopenharmony_ci } 245462306a36Sopenharmony_ci if (!anon_vma) 245562306a36Sopenharmony_ci return; 245662306a36Sopenharmony_ci 245762306a36Sopenharmony_ci pgoff_start = folio_pgoff(folio); 245862306a36Sopenharmony_ci pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; 245962306a36Sopenharmony_ci anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, 246062306a36Sopenharmony_ci pgoff_start, pgoff_end) { 246162306a36Sopenharmony_ci struct vm_area_struct *vma = avc->vma; 246262306a36Sopenharmony_ci unsigned long address = vma_address(&folio->page, vma); 246362306a36Sopenharmony_ci 246462306a36Sopenharmony_ci VM_BUG_ON_VMA(address == -EFAULT, vma); 246562306a36Sopenharmony_ci cond_resched(); 246662306a36Sopenharmony_ci 246762306a36Sopenharmony_ci if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 246862306a36Sopenharmony_ci continue; 246962306a36Sopenharmony_ci 247062306a36Sopenharmony_ci if (!rwc->rmap_one(folio, vma, address, rwc->arg)) 247162306a36Sopenharmony_ci break; 247262306a36Sopenharmony_ci if (rwc->done && rwc->done(folio)) 247362306a36Sopenharmony_ci break; 247462306a36Sopenharmony_ci } 247562306a36Sopenharmony_ci 247662306a36Sopenharmony_ci if (!locked) 247762306a36Sopenharmony_ci anon_vma_unlock_read(anon_vma); 247862306a36Sopenharmony_ci} 247962306a36Sopenharmony_ci 248062306a36Sopenharmony_ci/* 248162306a36Sopenharmony_ci * rmap_walk_file - do something to file page using the object-based rmap method 248262306a36Sopenharmony_ci * @folio: the folio to be handled 248362306a36Sopenharmony_ci * @rwc: control variable according to each walk type 248462306a36Sopenharmony_ci * @locked: caller holds relevant rmap lock 248562306a36Sopenharmony_ci * 248662306a36Sopenharmony_ci * Find all the mappings of a folio using the mapping pointer and the vma chains 248762306a36Sopenharmony_ci * contained in the address_space struct it points to. 248862306a36Sopenharmony_ci */ 248962306a36Sopenharmony_cistatic void rmap_walk_file(struct folio *folio, 249062306a36Sopenharmony_ci struct rmap_walk_control *rwc, bool locked) 249162306a36Sopenharmony_ci{ 249262306a36Sopenharmony_ci struct address_space *mapping = folio_mapping(folio); 249362306a36Sopenharmony_ci pgoff_t pgoff_start, pgoff_end; 249462306a36Sopenharmony_ci struct vm_area_struct *vma; 249562306a36Sopenharmony_ci 249662306a36Sopenharmony_ci /* 249762306a36Sopenharmony_ci * The page lock not only makes sure that page->mapping cannot 249862306a36Sopenharmony_ci * suddenly be NULLified by truncation, it makes sure that the 249962306a36Sopenharmony_ci * structure at mapping cannot be freed and reused yet, 250062306a36Sopenharmony_ci * so we can safely take mapping->i_mmap_rwsem. 
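	 * Holding i_mmap_rwsem in turn keeps the vma interval tree stable
	 * while it is walked below.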
250162306a36Sopenharmony_ci */ 250262306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 250362306a36Sopenharmony_ci 250462306a36Sopenharmony_ci if (!mapping) 250562306a36Sopenharmony_ci return; 250662306a36Sopenharmony_ci 250762306a36Sopenharmony_ci pgoff_start = folio_pgoff(folio); 250862306a36Sopenharmony_ci pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; 250962306a36Sopenharmony_ci if (!locked) { 251062306a36Sopenharmony_ci if (i_mmap_trylock_read(mapping)) 251162306a36Sopenharmony_ci goto lookup; 251262306a36Sopenharmony_ci 251362306a36Sopenharmony_ci if (rwc->try_lock) { 251462306a36Sopenharmony_ci rwc->contended = true; 251562306a36Sopenharmony_ci return; 251662306a36Sopenharmony_ci } 251762306a36Sopenharmony_ci 251862306a36Sopenharmony_ci i_mmap_lock_read(mapping); 251962306a36Sopenharmony_ci } 252062306a36Sopenharmony_cilookup: 252162306a36Sopenharmony_ci vma_interval_tree_foreach(vma, &mapping->i_mmap, 252262306a36Sopenharmony_ci pgoff_start, pgoff_end) { 252362306a36Sopenharmony_ci unsigned long address = vma_address(&folio->page, vma); 252462306a36Sopenharmony_ci 252562306a36Sopenharmony_ci VM_BUG_ON_VMA(address == -EFAULT, vma); 252662306a36Sopenharmony_ci cond_resched(); 252762306a36Sopenharmony_ci 252862306a36Sopenharmony_ci if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) 252962306a36Sopenharmony_ci continue; 253062306a36Sopenharmony_ci 253162306a36Sopenharmony_ci if (!rwc->rmap_one(folio, vma, address, rwc->arg)) 253262306a36Sopenharmony_ci goto done; 253362306a36Sopenharmony_ci if (rwc->done && rwc->done(folio)) 253462306a36Sopenharmony_ci goto done; 253562306a36Sopenharmony_ci } 253662306a36Sopenharmony_ci 253762306a36Sopenharmony_cidone: 253862306a36Sopenharmony_ci if (!locked) 253962306a36Sopenharmony_ci i_mmap_unlock_read(mapping); 254062306a36Sopenharmony_ci} 254162306a36Sopenharmony_ci 254262306a36Sopenharmony_civoid rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) 254362306a36Sopenharmony_ci{ 254462306a36Sopenharmony_ci if (unlikely(folio_test_ksm(folio))) 254562306a36Sopenharmony_ci rmap_walk_ksm(folio, rwc); 254662306a36Sopenharmony_ci else if (folio_test_anon(folio)) 254762306a36Sopenharmony_ci rmap_walk_anon(folio, rwc, false); 254862306a36Sopenharmony_ci else 254962306a36Sopenharmony_ci rmap_walk_file(folio, rwc, false); 255062306a36Sopenharmony_ci} 255162306a36Sopenharmony_ci 255262306a36Sopenharmony_ci/* Like rmap_walk, but caller holds relevant rmap lock */ 255362306a36Sopenharmony_civoid rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) 255462306a36Sopenharmony_ci{ 255562306a36Sopenharmony_ci /* no ksm support for now */ 255662306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); 255762306a36Sopenharmony_ci if (folio_test_anon(folio)) 255862306a36Sopenharmony_ci rmap_walk_anon(folio, rwc, true); 255962306a36Sopenharmony_ci else 256062306a36Sopenharmony_ci rmap_walk_file(folio, rwc, true); 256162306a36Sopenharmony_ci} 256262306a36Sopenharmony_ci 256362306a36Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 256462306a36Sopenharmony_ci/* 256562306a36Sopenharmony_ci * The following two functions are for anonymous (private mapped) hugepages. 256662306a36Sopenharmony_ci * Unlike common anonymous pages, anonymous hugepages have no accounting code 256762306a36Sopenharmony_ci * and no lru code, because we handle hugepages differently from common pages. 256862306a36Sopenharmony_ci * 256962306a36Sopenharmony_ci * RMAP_COMPOUND is ignored. 
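 *
 * Illustrative sketch only (loosely based on the hugetlb fault path, not
 * literal code from this file; make_huge_pte() and the hstate h are assumed
 * to be available to the caller): a freshly allocated private hugepage is
 * typically wired up along the lines of
 *
 *	folio = alloc_hugetlb_folio(vma, haddr, 0);
 *	...
 *	hugepage_add_new_anon_rmap(folio, vma, haddr);
 *	set_huge_pte_at(mm, haddr, ptep,
 *			make_huge_pte(vma, &folio->page, 1), huge_page_size(h));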
257062306a36Sopenharmony_ci */ 257162306a36Sopenharmony_civoid hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, 257262306a36Sopenharmony_ci unsigned long address, rmap_t flags) 257362306a36Sopenharmony_ci{ 257462306a36Sopenharmony_ci struct folio *folio = page_folio(page); 257562306a36Sopenharmony_ci struct anon_vma *anon_vma = vma->anon_vma; 257662306a36Sopenharmony_ci int first; 257762306a36Sopenharmony_ci 257862306a36Sopenharmony_ci BUG_ON(!folio_test_locked(folio)); 257962306a36Sopenharmony_ci BUG_ON(!anon_vma); 258062306a36Sopenharmony_ci /* address might be in next vma when migration races vma_merge */ 258162306a36Sopenharmony_ci first = atomic_inc_and_test(&folio->_entire_mapcount); 258262306a36Sopenharmony_ci VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); 258362306a36Sopenharmony_ci VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); 258462306a36Sopenharmony_ci if (first) 258562306a36Sopenharmony_ci __page_set_anon_rmap(folio, page, vma, address, 258662306a36Sopenharmony_ci !!(flags & RMAP_EXCLUSIVE)); 258762306a36Sopenharmony_ci} 258862306a36Sopenharmony_ci 258962306a36Sopenharmony_civoid hugepage_add_new_anon_rmap(struct folio *folio, 259062306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long address) 259162306a36Sopenharmony_ci{ 259262306a36Sopenharmony_ci BUG_ON(address < vma->vm_start || address >= vma->vm_end); 259362306a36Sopenharmony_ci /* increment count (starts at -1) */ 259462306a36Sopenharmony_ci atomic_set(&folio->_entire_mapcount, 0); 259562306a36Sopenharmony_ci folio_clear_hugetlb_restore_reserve(folio); 259662306a36Sopenharmony_ci __page_set_anon_rmap(folio, &folio->page, vma, address, 1); 259762306a36Sopenharmony_ci} 259862306a36Sopenharmony_ci#endif /* CONFIG_HUGETLB_PAGE */ 2599