162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Memory Migration functionality - linux/mm/migrate.c 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * Page migration was first developed in the context of the memory hotplug 862306a36Sopenharmony_ci * project. The main authors of the migration code are: 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> 1162306a36Sopenharmony_ci * Hirokazu Takahashi <taka@valinux.co.jp> 1262306a36Sopenharmony_ci * Dave Hansen <haveblue@us.ibm.com> 1362306a36Sopenharmony_ci * Christoph Lameter 1462306a36Sopenharmony_ci */ 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci#include <linux/migrate.h> 1762306a36Sopenharmony_ci#include <linux/export.h> 1862306a36Sopenharmony_ci#include <linux/swap.h> 1962306a36Sopenharmony_ci#include <linux/swapops.h> 2062306a36Sopenharmony_ci#include <linux/pagemap.h> 2162306a36Sopenharmony_ci#include <linux/buffer_head.h> 2262306a36Sopenharmony_ci#include <linux/mm_inline.h> 2362306a36Sopenharmony_ci#include <linux/nsproxy.h> 2462306a36Sopenharmony_ci#include <linux/ksm.h> 2562306a36Sopenharmony_ci#include <linux/rmap.h> 2662306a36Sopenharmony_ci#include <linux/topology.h> 2762306a36Sopenharmony_ci#include <linux/cpu.h> 2862306a36Sopenharmony_ci#include <linux/cpuset.h> 2962306a36Sopenharmony_ci#include <linux/writeback.h> 3062306a36Sopenharmony_ci#include <linux/mempolicy.h> 3162306a36Sopenharmony_ci#include <linux/vmalloc.h> 3262306a36Sopenharmony_ci#include <linux/security.h> 3362306a36Sopenharmony_ci#include <linux/backing-dev.h> 3462306a36Sopenharmony_ci#include <linux/compaction.h> 3562306a36Sopenharmony_ci#include <linux/syscalls.h> 3662306a36Sopenharmony_ci#include <linux/compat.h> 3762306a36Sopenharmony_ci#include <linux/hugetlb.h> 3862306a36Sopenharmony_ci#include <linux/hugetlb_cgroup.h> 3962306a36Sopenharmony_ci#include <linux/gfp.h> 4062306a36Sopenharmony_ci#include <linux/pfn_t.h> 4162306a36Sopenharmony_ci#include <linux/memremap.h> 4262306a36Sopenharmony_ci#include <linux/userfaultfd_k.h> 4362306a36Sopenharmony_ci#include <linux/balloon_compaction.h> 4462306a36Sopenharmony_ci#include <linux/page_idle.h> 4562306a36Sopenharmony_ci#include <linux/page_owner.h> 4662306a36Sopenharmony_ci#include <linux/sched/mm.h> 4762306a36Sopenharmony_ci#include <linux/ptrace.h> 4862306a36Sopenharmony_ci#include <linux/oom.h> 4962306a36Sopenharmony_ci#include <linux/memory.h> 5062306a36Sopenharmony_ci#include <linux/random.h> 5162306a36Sopenharmony_ci#include <linux/sched/sysctl.h> 5262306a36Sopenharmony_ci#include <linux/memory-tiers.h> 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_ci#include <asm/tlbflush.h> 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci#include <trace/events/migrate.h> 5762306a36Sopenharmony_ci 5862306a36Sopenharmony_ci#include "internal.h" 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_cibool isolate_movable_page(struct page *page, isolate_mode_t mode) 6162306a36Sopenharmony_ci{ 6262306a36Sopenharmony_ci struct folio *folio = folio_get_nontail_page(page); 6362306a36Sopenharmony_ci const struct movable_operations *mops; 6462306a36Sopenharmony_ci 6562306a36Sopenharmony_ci /* 6662306a36Sopenharmony_ci * Avoid burning cycles with pages that are yet under __free_pages(), 6762306a36Sopenharmony_ci * or just got freed under us. 6862306a36Sopenharmony_ci * 6962306a36Sopenharmony_ci * In case we 'win' a race for a movable page being freed under us and 7062306a36Sopenharmony_ci * raise its refcount preventing __free_pages() from doing its job 7162306a36Sopenharmony_ci * the put_page() at the end of this block will take care of 7262306a36Sopenharmony_ci * release this page, thus avoiding a nasty leakage. 7362306a36Sopenharmony_ci */ 7462306a36Sopenharmony_ci if (!folio) 7562306a36Sopenharmony_ci goto out; 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci if (unlikely(folio_test_slab(folio))) 7862306a36Sopenharmony_ci goto out_putfolio; 7962306a36Sopenharmony_ci /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ 8062306a36Sopenharmony_ci smp_rmb(); 8162306a36Sopenharmony_ci /* 8262306a36Sopenharmony_ci * Check movable flag before taking the page lock because 8362306a36Sopenharmony_ci * we use non-atomic bitops on newly allocated page flags so 8462306a36Sopenharmony_ci * unconditionally grabbing the lock ruins page's owner side. 8562306a36Sopenharmony_ci */ 8662306a36Sopenharmony_ci if (unlikely(!__folio_test_movable(folio))) 8762306a36Sopenharmony_ci goto out_putfolio; 8862306a36Sopenharmony_ci /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */ 8962306a36Sopenharmony_ci smp_rmb(); 9062306a36Sopenharmony_ci if (unlikely(folio_test_slab(folio))) 9162306a36Sopenharmony_ci goto out_putfolio; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci /* 9462306a36Sopenharmony_ci * As movable pages are not isolated from LRU lists, concurrent 9562306a36Sopenharmony_ci * compaction threads can race against page migration functions 9662306a36Sopenharmony_ci * as well as race against the releasing a page. 9762306a36Sopenharmony_ci * 9862306a36Sopenharmony_ci * In order to avoid having an already isolated movable page 9962306a36Sopenharmony_ci * being (wrongly) re-isolated while it is under migration, 10062306a36Sopenharmony_ci * or to avoid attempting to isolate pages being released, 10162306a36Sopenharmony_ci * lets be sure we have the page lock 10262306a36Sopenharmony_ci * before proceeding with the movable page isolation steps. 10362306a36Sopenharmony_ci */ 10462306a36Sopenharmony_ci if (unlikely(!folio_trylock(folio))) 10562306a36Sopenharmony_ci goto out_putfolio; 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci if (!folio_test_movable(folio) || folio_test_isolated(folio)) 10862306a36Sopenharmony_ci goto out_no_isolated; 10962306a36Sopenharmony_ci 11062306a36Sopenharmony_ci mops = folio_movable_ops(folio); 11162306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!mops, folio); 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_ci if (!mops->isolate_page(&folio->page, mode)) 11462306a36Sopenharmony_ci goto out_no_isolated; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci /* Driver shouldn't use PG_isolated bit of page->flags */ 11762306a36Sopenharmony_ci WARN_ON_ONCE(folio_test_isolated(folio)); 11862306a36Sopenharmony_ci folio_set_isolated(folio); 11962306a36Sopenharmony_ci folio_unlock(folio); 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci return true; 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ciout_no_isolated: 12462306a36Sopenharmony_ci folio_unlock(folio); 12562306a36Sopenharmony_ciout_putfolio: 12662306a36Sopenharmony_ci folio_put(folio); 12762306a36Sopenharmony_ciout: 12862306a36Sopenharmony_ci return false; 12962306a36Sopenharmony_ci} 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_cistatic void putback_movable_folio(struct folio *folio) 13262306a36Sopenharmony_ci{ 13362306a36Sopenharmony_ci const struct movable_operations *mops = folio_movable_ops(folio); 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci mops->putback_page(&folio->page); 13662306a36Sopenharmony_ci folio_clear_isolated(folio); 13762306a36Sopenharmony_ci} 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci/* 14062306a36Sopenharmony_ci * Put previously isolated pages back onto the appropriate lists 14162306a36Sopenharmony_ci * from where they were once taken off for compaction/migration. 14262306a36Sopenharmony_ci * 14362306a36Sopenharmony_ci * This function shall be used whenever the isolated pageset has been 14462306a36Sopenharmony_ci * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() 14562306a36Sopenharmony_ci * and isolate_hugetlb(). 14662306a36Sopenharmony_ci */ 14762306a36Sopenharmony_civoid putback_movable_pages(struct list_head *l) 14862306a36Sopenharmony_ci{ 14962306a36Sopenharmony_ci struct folio *folio; 15062306a36Sopenharmony_ci struct folio *folio2; 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci list_for_each_entry_safe(folio, folio2, l, lru) { 15362306a36Sopenharmony_ci if (unlikely(folio_test_hugetlb(folio))) { 15462306a36Sopenharmony_ci folio_putback_active_hugetlb(folio); 15562306a36Sopenharmony_ci continue; 15662306a36Sopenharmony_ci } 15762306a36Sopenharmony_ci list_del(&folio->lru); 15862306a36Sopenharmony_ci /* 15962306a36Sopenharmony_ci * We isolated non-lru movable folio so here we can use 16062306a36Sopenharmony_ci * __PageMovable because LRU folio's mapping cannot have 16162306a36Sopenharmony_ci * PAGE_MAPPING_MOVABLE. 16262306a36Sopenharmony_ci */ 16362306a36Sopenharmony_ci if (unlikely(__folio_test_movable(folio))) { 16462306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); 16562306a36Sopenharmony_ci folio_lock(folio); 16662306a36Sopenharmony_ci if (folio_test_movable(folio)) 16762306a36Sopenharmony_ci putback_movable_folio(folio); 16862306a36Sopenharmony_ci else 16962306a36Sopenharmony_ci folio_clear_isolated(folio); 17062306a36Sopenharmony_ci folio_unlock(folio); 17162306a36Sopenharmony_ci folio_put(folio); 17262306a36Sopenharmony_ci } else { 17362306a36Sopenharmony_ci node_stat_mod_folio(folio, NR_ISOLATED_ANON + 17462306a36Sopenharmony_ci folio_is_file_lru(folio), -folio_nr_pages(folio)); 17562306a36Sopenharmony_ci folio_putback_lru(folio); 17662306a36Sopenharmony_ci } 17762306a36Sopenharmony_ci } 17862306a36Sopenharmony_ci} 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci/* 18162306a36Sopenharmony_ci * Restore a potential migration pte to a working pte entry 18262306a36Sopenharmony_ci */ 18362306a36Sopenharmony_cistatic bool remove_migration_pte(struct folio *folio, 18462306a36Sopenharmony_ci struct vm_area_struct *vma, unsigned long addr, void *old) 18562306a36Sopenharmony_ci{ 18662306a36Sopenharmony_ci DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION); 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci while (page_vma_mapped_walk(&pvmw)) { 18962306a36Sopenharmony_ci rmap_t rmap_flags = RMAP_NONE; 19062306a36Sopenharmony_ci pte_t old_pte; 19162306a36Sopenharmony_ci pte_t pte; 19262306a36Sopenharmony_ci swp_entry_t entry; 19362306a36Sopenharmony_ci struct page *new; 19462306a36Sopenharmony_ci unsigned long idx = 0; 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci /* pgoff is invalid for ksm pages, but they are never large */ 19762306a36Sopenharmony_ci if (folio_test_large(folio) && !folio_test_hugetlb(folio)) 19862306a36Sopenharmony_ci idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff; 19962306a36Sopenharmony_ci new = folio_page(folio, idx); 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ci#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 20262306a36Sopenharmony_ci /* PMD-mapped THP migration entry */ 20362306a36Sopenharmony_ci if (!pvmw.pte) { 20462306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || 20562306a36Sopenharmony_ci !folio_test_pmd_mappable(folio), folio); 20662306a36Sopenharmony_ci remove_migration_pmd(&pvmw, new); 20762306a36Sopenharmony_ci continue; 20862306a36Sopenharmony_ci } 20962306a36Sopenharmony_ci#endif 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci folio_get(folio); 21262306a36Sopenharmony_ci pte = mk_pte(new, READ_ONCE(vma->vm_page_prot)); 21362306a36Sopenharmony_ci old_pte = ptep_get(pvmw.pte); 21462306a36Sopenharmony_ci if (pte_swp_soft_dirty(old_pte)) 21562306a36Sopenharmony_ci pte = pte_mksoft_dirty(pte); 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci entry = pte_to_swp_entry(old_pte); 21862306a36Sopenharmony_ci if (!is_migration_entry_young(entry)) 21962306a36Sopenharmony_ci pte = pte_mkold(pte); 22062306a36Sopenharmony_ci if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) 22162306a36Sopenharmony_ci pte = pte_mkdirty(pte); 22262306a36Sopenharmony_ci if (is_writable_migration_entry(entry)) 22362306a36Sopenharmony_ci pte = pte_mkwrite(pte, vma); 22462306a36Sopenharmony_ci else if (pte_swp_uffd_wp(old_pte)) 22562306a36Sopenharmony_ci pte = pte_mkuffd_wp(pte); 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) 22862306a36Sopenharmony_ci rmap_flags |= RMAP_EXCLUSIVE; 22962306a36Sopenharmony_ci 23062306a36Sopenharmony_ci if (unlikely(is_device_private_page(new))) { 23162306a36Sopenharmony_ci if (pte_write(pte)) 23262306a36Sopenharmony_ci entry = make_writable_device_private_entry( 23362306a36Sopenharmony_ci page_to_pfn(new)); 23462306a36Sopenharmony_ci else 23562306a36Sopenharmony_ci entry = make_readable_device_private_entry( 23662306a36Sopenharmony_ci page_to_pfn(new)); 23762306a36Sopenharmony_ci pte = swp_entry_to_pte(entry); 23862306a36Sopenharmony_ci if (pte_swp_soft_dirty(old_pte)) 23962306a36Sopenharmony_ci pte = pte_swp_mksoft_dirty(pte); 24062306a36Sopenharmony_ci if (pte_swp_uffd_wp(old_pte)) 24162306a36Sopenharmony_ci pte = pte_swp_mkuffd_wp(pte); 24262306a36Sopenharmony_ci } 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 24562306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 24662306a36Sopenharmony_ci struct hstate *h = hstate_vma(vma); 24762306a36Sopenharmony_ci unsigned int shift = huge_page_shift(h); 24862306a36Sopenharmony_ci unsigned long psize = huge_page_size(h); 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci pte = arch_make_huge_pte(pte, shift, vma->vm_flags); 25162306a36Sopenharmony_ci if (folio_test_anon(folio)) 25262306a36Sopenharmony_ci hugepage_add_anon_rmap(new, vma, pvmw.address, 25362306a36Sopenharmony_ci rmap_flags); 25462306a36Sopenharmony_ci else 25562306a36Sopenharmony_ci page_dup_file_rmap(new, true); 25662306a36Sopenharmony_ci set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, 25762306a36Sopenharmony_ci psize); 25862306a36Sopenharmony_ci } else 25962306a36Sopenharmony_ci#endif 26062306a36Sopenharmony_ci { 26162306a36Sopenharmony_ci if (folio_test_anon(folio)) 26262306a36Sopenharmony_ci page_add_anon_rmap(new, vma, pvmw.address, 26362306a36Sopenharmony_ci rmap_flags); 26462306a36Sopenharmony_ci else 26562306a36Sopenharmony_ci page_add_file_rmap(new, vma, false); 26662306a36Sopenharmony_ci set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); 26762306a36Sopenharmony_ci } 26862306a36Sopenharmony_ci if (vma->vm_flags & VM_LOCKED) 26962306a36Sopenharmony_ci mlock_drain_local(); 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci trace_remove_migration_pte(pvmw.address, pte_val(pte), 27262306a36Sopenharmony_ci compound_order(new)); 27362306a36Sopenharmony_ci 27462306a36Sopenharmony_ci /* No need to invalidate - it was non-present before */ 27562306a36Sopenharmony_ci update_mmu_cache(vma, pvmw.address, pvmw.pte); 27662306a36Sopenharmony_ci } 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci return true; 27962306a36Sopenharmony_ci} 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ci/* 28262306a36Sopenharmony_ci * Get rid of all migration entries and replace them by 28362306a36Sopenharmony_ci * references to the indicated page. 28462306a36Sopenharmony_ci */ 28562306a36Sopenharmony_civoid remove_migration_ptes(struct folio *src, struct folio *dst, bool locked) 28662306a36Sopenharmony_ci{ 28762306a36Sopenharmony_ci struct rmap_walk_control rwc = { 28862306a36Sopenharmony_ci .rmap_one = remove_migration_pte, 28962306a36Sopenharmony_ci .arg = src, 29062306a36Sopenharmony_ci }; 29162306a36Sopenharmony_ci 29262306a36Sopenharmony_ci if (locked) 29362306a36Sopenharmony_ci rmap_walk_locked(dst, &rwc); 29462306a36Sopenharmony_ci else 29562306a36Sopenharmony_ci rmap_walk(dst, &rwc); 29662306a36Sopenharmony_ci} 29762306a36Sopenharmony_ci 29862306a36Sopenharmony_ci/* 29962306a36Sopenharmony_ci * Something used the pte of a page under migration. We need to 30062306a36Sopenharmony_ci * get to the page and wait until migration is finished. 30162306a36Sopenharmony_ci * When we return from this function the fault will be retried. 30262306a36Sopenharmony_ci */ 30362306a36Sopenharmony_civoid migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 30462306a36Sopenharmony_ci unsigned long address) 30562306a36Sopenharmony_ci{ 30662306a36Sopenharmony_ci spinlock_t *ptl; 30762306a36Sopenharmony_ci pte_t *ptep; 30862306a36Sopenharmony_ci pte_t pte; 30962306a36Sopenharmony_ci swp_entry_t entry; 31062306a36Sopenharmony_ci 31162306a36Sopenharmony_ci ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 31262306a36Sopenharmony_ci if (!ptep) 31362306a36Sopenharmony_ci return; 31462306a36Sopenharmony_ci 31562306a36Sopenharmony_ci pte = ptep_get(ptep); 31662306a36Sopenharmony_ci pte_unmap(ptep); 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci if (!is_swap_pte(pte)) 31962306a36Sopenharmony_ci goto out; 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci entry = pte_to_swp_entry(pte); 32262306a36Sopenharmony_ci if (!is_migration_entry(entry)) 32362306a36Sopenharmony_ci goto out; 32462306a36Sopenharmony_ci 32562306a36Sopenharmony_ci migration_entry_wait_on_locked(entry, ptl); 32662306a36Sopenharmony_ci return; 32762306a36Sopenharmony_ciout: 32862306a36Sopenharmony_ci spin_unlock(ptl); 32962306a36Sopenharmony_ci} 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 33262306a36Sopenharmony_ci/* 33362306a36Sopenharmony_ci * The vma read lock must be held upon entry. Holding that lock prevents either 33462306a36Sopenharmony_ci * the pte or the ptl from being freed. 33562306a36Sopenharmony_ci * 33662306a36Sopenharmony_ci * This function will release the vma lock before returning. 33762306a36Sopenharmony_ci */ 33862306a36Sopenharmony_civoid migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep) 33962306a36Sopenharmony_ci{ 34062306a36Sopenharmony_ci spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep); 34162306a36Sopenharmony_ci pte_t pte; 34262306a36Sopenharmony_ci 34362306a36Sopenharmony_ci hugetlb_vma_assert_locked(vma); 34462306a36Sopenharmony_ci spin_lock(ptl); 34562306a36Sopenharmony_ci pte = huge_ptep_get(ptep); 34662306a36Sopenharmony_ci 34762306a36Sopenharmony_ci if (unlikely(!is_hugetlb_entry_migration(pte))) { 34862306a36Sopenharmony_ci spin_unlock(ptl); 34962306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 35062306a36Sopenharmony_ci } else { 35162306a36Sopenharmony_ci /* 35262306a36Sopenharmony_ci * If migration entry existed, safe to release vma lock 35362306a36Sopenharmony_ci * here because the pgtable page won't be freed without the 35462306a36Sopenharmony_ci * pgtable lock released. See comment right above pgtable 35562306a36Sopenharmony_ci * lock release in migration_entry_wait_on_locked(). 35662306a36Sopenharmony_ci */ 35762306a36Sopenharmony_ci hugetlb_vma_unlock_read(vma); 35862306a36Sopenharmony_ci migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl); 35962306a36Sopenharmony_ci } 36062306a36Sopenharmony_ci} 36162306a36Sopenharmony_ci#endif 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ci#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION 36462306a36Sopenharmony_civoid pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) 36562306a36Sopenharmony_ci{ 36662306a36Sopenharmony_ci spinlock_t *ptl; 36762306a36Sopenharmony_ci 36862306a36Sopenharmony_ci ptl = pmd_lock(mm, pmd); 36962306a36Sopenharmony_ci if (!is_pmd_migration_entry(*pmd)) 37062306a36Sopenharmony_ci goto unlock; 37162306a36Sopenharmony_ci migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl); 37262306a36Sopenharmony_ci return; 37362306a36Sopenharmony_ciunlock: 37462306a36Sopenharmony_ci spin_unlock(ptl); 37562306a36Sopenharmony_ci} 37662306a36Sopenharmony_ci#endif 37762306a36Sopenharmony_ci 37862306a36Sopenharmony_cistatic int folio_expected_refs(struct address_space *mapping, 37962306a36Sopenharmony_ci struct folio *folio) 38062306a36Sopenharmony_ci{ 38162306a36Sopenharmony_ci int refs = 1; 38262306a36Sopenharmony_ci if (!mapping) 38362306a36Sopenharmony_ci return refs; 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ci refs += folio_nr_pages(folio); 38662306a36Sopenharmony_ci if (folio_test_private(folio)) 38762306a36Sopenharmony_ci refs++; 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_ci return refs; 39062306a36Sopenharmony_ci} 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_ci/* 39362306a36Sopenharmony_ci * Replace the page in the mapping. 39462306a36Sopenharmony_ci * 39562306a36Sopenharmony_ci * The number of remaining references must be: 39662306a36Sopenharmony_ci * 1 for anonymous pages without a mapping 39762306a36Sopenharmony_ci * 2 for pages with a mapping 39862306a36Sopenharmony_ci * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. 39962306a36Sopenharmony_ci */ 40062306a36Sopenharmony_ciint folio_migrate_mapping(struct address_space *mapping, 40162306a36Sopenharmony_ci struct folio *newfolio, struct folio *folio, int extra_count) 40262306a36Sopenharmony_ci{ 40362306a36Sopenharmony_ci XA_STATE(xas, &mapping->i_pages, folio_index(folio)); 40462306a36Sopenharmony_ci struct zone *oldzone, *newzone; 40562306a36Sopenharmony_ci int dirty; 40662306a36Sopenharmony_ci int expected_count = folio_expected_refs(mapping, folio) + extra_count; 40762306a36Sopenharmony_ci long nr = folio_nr_pages(folio); 40862306a36Sopenharmony_ci long entries, i; 40962306a36Sopenharmony_ci 41062306a36Sopenharmony_ci if (!mapping) { 41162306a36Sopenharmony_ci /* Anonymous page without mapping */ 41262306a36Sopenharmony_ci if (folio_ref_count(folio) != expected_count) 41362306a36Sopenharmony_ci return -EAGAIN; 41462306a36Sopenharmony_ci 41562306a36Sopenharmony_ci /* No turning back from here */ 41662306a36Sopenharmony_ci newfolio->index = folio->index; 41762306a36Sopenharmony_ci newfolio->mapping = folio->mapping; 41862306a36Sopenharmony_ci if (folio_test_swapbacked(folio)) 41962306a36Sopenharmony_ci __folio_set_swapbacked(newfolio); 42062306a36Sopenharmony_ci 42162306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 42262306a36Sopenharmony_ci } 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci oldzone = folio_zone(folio); 42562306a36Sopenharmony_ci newzone = folio_zone(newfolio); 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci xas_lock_irq(&xas); 42862306a36Sopenharmony_ci if (!folio_ref_freeze(folio, expected_count)) { 42962306a36Sopenharmony_ci xas_unlock_irq(&xas); 43062306a36Sopenharmony_ci return -EAGAIN; 43162306a36Sopenharmony_ci } 43262306a36Sopenharmony_ci 43362306a36Sopenharmony_ci /* 43462306a36Sopenharmony_ci * Now we know that no one else is looking at the folio: 43562306a36Sopenharmony_ci * no turning back from here. 43662306a36Sopenharmony_ci */ 43762306a36Sopenharmony_ci newfolio->index = folio->index; 43862306a36Sopenharmony_ci newfolio->mapping = folio->mapping; 43962306a36Sopenharmony_ci folio_ref_add(newfolio, nr); /* add cache reference */ 44062306a36Sopenharmony_ci if (folio_test_swapbacked(folio)) { 44162306a36Sopenharmony_ci __folio_set_swapbacked(newfolio); 44262306a36Sopenharmony_ci if (folio_test_swapcache(folio)) { 44362306a36Sopenharmony_ci folio_set_swapcache(newfolio); 44462306a36Sopenharmony_ci newfolio->private = folio_get_private(folio); 44562306a36Sopenharmony_ci } 44662306a36Sopenharmony_ci entries = nr; 44762306a36Sopenharmony_ci } else { 44862306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); 44962306a36Sopenharmony_ci entries = 1; 45062306a36Sopenharmony_ci } 45162306a36Sopenharmony_ci 45262306a36Sopenharmony_ci /* Move dirty while page refs frozen and newpage not yet exposed */ 45362306a36Sopenharmony_ci dirty = folio_test_dirty(folio); 45462306a36Sopenharmony_ci if (dirty) { 45562306a36Sopenharmony_ci folio_clear_dirty(folio); 45662306a36Sopenharmony_ci folio_set_dirty(newfolio); 45762306a36Sopenharmony_ci } 45862306a36Sopenharmony_ci 45962306a36Sopenharmony_ci /* Swap cache still stores N entries instead of a high-order entry */ 46062306a36Sopenharmony_ci for (i = 0; i < entries; i++) { 46162306a36Sopenharmony_ci xas_store(&xas, newfolio); 46262306a36Sopenharmony_ci xas_next(&xas); 46362306a36Sopenharmony_ci } 46462306a36Sopenharmony_ci 46562306a36Sopenharmony_ci /* 46662306a36Sopenharmony_ci * Drop cache reference from old page by unfreezing 46762306a36Sopenharmony_ci * to one less reference. 46862306a36Sopenharmony_ci * We know this isn't the last reference. 46962306a36Sopenharmony_ci */ 47062306a36Sopenharmony_ci folio_ref_unfreeze(folio, expected_count - nr); 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_ci xas_unlock(&xas); 47362306a36Sopenharmony_ci /* Leave irq disabled to prevent preemption while updating stats */ 47462306a36Sopenharmony_ci 47562306a36Sopenharmony_ci /* 47662306a36Sopenharmony_ci * If moved to a different zone then also account 47762306a36Sopenharmony_ci * the page for that zone. Other VM counters will be 47862306a36Sopenharmony_ci * taken care of when we establish references to the 47962306a36Sopenharmony_ci * new page and drop references to the old page. 48062306a36Sopenharmony_ci * 48162306a36Sopenharmony_ci * Note that anonymous pages are accounted for 48262306a36Sopenharmony_ci * via NR_FILE_PAGES and NR_ANON_MAPPED if they 48362306a36Sopenharmony_ci * are mapped to swap space. 48462306a36Sopenharmony_ci */ 48562306a36Sopenharmony_ci if (newzone != oldzone) { 48662306a36Sopenharmony_ci struct lruvec *old_lruvec, *new_lruvec; 48762306a36Sopenharmony_ci struct mem_cgroup *memcg; 48862306a36Sopenharmony_ci 48962306a36Sopenharmony_ci memcg = folio_memcg(folio); 49062306a36Sopenharmony_ci old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); 49162306a36Sopenharmony_ci new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat); 49262306a36Sopenharmony_ci 49362306a36Sopenharmony_ci __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr); 49462306a36Sopenharmony_ci __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr); 49562306a36Sopenharmony_ci if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) { 49662306a36Sopenharmony_ci __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); 49762306a36Sopenharmony_ci __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); 49862306a36Sopenharmony_ci 49962306a36Sopenharmony_ci if (folio_test_pmd_mappable(folio)) { 50062306a36Sopenharmony_ci __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr); 50162306a36Sopenharmony_ci __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr); 50262306a36Sopenharmony_ci } 50362306a36Sopenharmony_ci } 50462306a36Sopenharmony_ci#ifdef CONFIG_SWAP 50562306a36Sopenharmony_ci if (folio_test_swapcache(folio)) { 50662306a36Sopenharmony_ci __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); 50762306a36Sopenharmony_ci __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); 50862306a36Sopenharmony_ci } 50962306a36Sopenharmony_ci#endif 51062306a36Sopenharmony_ci if (dirty && mapping_can_writeback(mapping)) { 51162306a36Sopenharmony_ci __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); 51262306a36Sopenharmony_ci __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); 51362306a36Sopenharmony_ci __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr); 51462306a36Sopenharmony_ci __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr); 51562306a36Sopenharmony_ci } 51662306a36Sopenharmony_ci } 51762306a36Sopenharmony_ci local_irq_enable(); 51862306a36Sopenharmony_ci 51962306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 52062306a36Sopenharmony_ci} 52162306a36Sopenharmony_ciEXPORT_SYMBOL(folio_migrate_mapping); 52262306a36Sopenharmony_ci 52362306a36Sopenharmony_ci/* 52462306a36Sopenharmony_ci * The expected number of remaining references is the same as that 52562306a36Sopenharmony_ci * of folio_migrate_mapping(). 52662306a36Sopenharmony_ci */ 52762306a36Sopenharmony_ciint migrate_huge_page_move_mapping(struct address_space *mapping, 52862306a36Sopenharmony_ci struct folio *dst, struct folio *src) 52962306a36Sopenharmony_ci{ 53062306a36Sopenharmony_ci XA_STATE(xas, &mapping->i_pages, folio_index(src)); 53162306a36Sopenharmony_ci int expected_count; 53262306a36Sopenharmony_ci 53362306a36Sopenharmony_ci xas_lock_irq(&xas); 53462306a36Sopenharmony_ci expected_count = 2 + folio_has_private(src); 53562306a36Sopenharmony_ci if (!folio_ref_freeze(src, expected_count)) { 53662306a36Sopenharmony_ci xas_unlock_irq(&xas); 53762306a36Sopenharmony_ci return -EAGAIN; 53862306a36Sopenharmony_ci } 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci dst->index = src->index; 54162306a36Sopenharmony_ci dst->mapping = src->mapping; 54262306a36Sopenharmony_ci 54362306a36Sopenharmony_ci folio_get(dst); 54462306a36Sopenharmony_ci 54562306a36Sopenharmony_ci xas_store(&xas, dst); 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_ci folio_ref_unfreeze(src, expected_count - 1); 54862306a36Sopenharmony_ci 54962306a36Sopenharmony_ci xas_unlock_irq(&xas); 55062306a36Sopenharmony_ci 55162306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 55262306a36Sopenharmony_ci} 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci/* 55562306a36Sopenharmony_ci * Copy the flags and some other ancillary information 55662306a36Sopenharmony_ci */ 55762306a36Sopenharmony_civoid folio_migrate_flags(struct folio *newfolio, struct folio *folio) 55862306a36Sopenharmony_ci{ 55962306a36Sopenharmony_ci int cpupid; 56062306a36Sopenharmony_ci 56162306a36Sopenharmony_ci if (folio_test_error(folio)) 56262306a36Sopenharmony_ci folio_set_error(newfolio); 56362306a36Sopenharmony_ci if (folio_test_referenced(folio)) 56462306a36Sopenharmony_ci folio_set_referenced(newfolio); 56562306a36Sopenharmony_ci if (folio_test_uptodate(folio)) 56662306a36Sopenharmony_ci folio_mark_uptodate(newfolio); 56762306a36Sopenharmony_ci if (folio_test_clear_active(folio)) { 56862306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio); 56962306a36Sopenharmony_ci folio_set_active(newfolio); 57062306a36Sopenharmony_ci } else if (folio_test_clear_unevictable(folio)) 57162306a36Sopenharmony_ci folio_set_unevictable(newfolio); 57262306a36Sopenharmony_ci if (folio_test_workingset(folio)) 57362306a36Sopenharmony_ci folio_set_workingset(newfolio); 57462306a36Sopenharmony_ci if (folio_test_checked(folio)) 57562306a36Sopenharmony_ci folio_set_checked(newfolio); 57662306a36Sopenharmony_ci /* 57762306a36Sopenharmony_ci * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via 57862306a36Sopenharmony_ci * migration entries. We can still have PG_anon_exclusive set on an 57962306a36Sopenharmony_ci * effectively unmapped and unreferenced first sub-pages of an 58062306a36Sopenharmony_ci * anonymous THP: we can simply copy it here via PG_mappedtodisk. 58162306a36Sopenharmony_ci */ 58262306a36Sopenharmony_ci if (folio_test_mappedtodisk(folio)) 58362306a36Sopenharmony_ci folio_set_mappedtodisk(newfolio); 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci /* Move dirty on pages not done by folio_migrate_mapping() */ 58662306a36Sopenharmony_ci if (folio_test_dirty(folio)) 58762306a36Sopenharmony_ci folio_set_dirty(newfolio); 58862306a36Sopenharmony_ci 58962306a36Sopenharmony_ci if (folio_test_young(folio)) 59062306a36Sopenharmony_ci folio_set_young(newfolio); 59162306a36Sopenharmony_ci if (folio_test_idle(folio)) 59262306a36Sopenharmony_ci folio_set_idle(newfolio); 59362306a36Sopenharmony_ci 59462306a36Sopenharmony_ci /* 59562306a36Sopenharmony_ci * Copy NUMA information to the new page, to prevent over-eager 59662306a36Sopenharmony_ci * future migrations of this same page. 59762306a36Sopenharmony_ci */ 59862306a36Sopenharmony_ci cpupid = page_cpupid_xchg_last(&folio->page, -1); 59962306a36Sopenharmony_ci /* 60062306a36Sopenharmony_ci * For memory tiering mode, when migrate between slow and fast 60162306a36Sopenharmony_ci * memory node, reset cpupid, because that is used to record 60262306a36Sopenharmony_ci * page access time in slow memory node. 60362306a36Sopenharmony_ci */ 60462306a36Sopenharmony_ci if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { 60562306a36Sopenharmony_ci bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); 60662306a36Sopenharmony_ci bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); 60762306a36Sopenharmony_ci 60862306a36Sopenharmony_ci if (f_toptier != t_toptier) 60962306a36Sopenharmony_ci cpupid = -1; 61062306a36Sopenharmony_ci } 61162306a36Sopenharmony_ci page_cpupid_xchg_last(&newfolio->page, cpupid); 61262306a36Sopenharmony_ci 61362306a36Sopenharmony_ci folio_migrate_ksm(newfolio, folio); 61462306a36Sopenharmony_ci /* 61562306a36Sopenharmony_ci * Please do not reorder this without considering how mm/ksm.c's 61662306a36Sopenharmony_ci * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 61762306a36Sopenharmony_ci */ 61862306a36Sopenharmony_ci if (folio_test_swapcache(folio)) 61962306a36Sopenharmony_ci folio_clear_swapcache(folio); 62062306a36Sopenharmony_ci folio_clear_private(folio); 62162306a36Sopenharmony_ci 62262306a36Sopenharmony_ci /* page->private contains hugetlb specific flags */ 62362306a36Sopenharmony_ci if (!folio_test_hugetlb(folio)) 62462306a36Sopenharmony_ci folio->private = NULL; 62562306a36Sopenharmony_ci 62662306a36Sopenharmony_ci /* 62762306a36Sopenharmony_ci * If any waiters have accumulated on the new page then 62862306a36Sopenharmony_ci * wake them up. 62962306a36Sopenharmony_ci */ 63062306a36Sopenharmony_ci if (folio_test_writeback(newfolio)) 63162306a36Sopenharmony_ci folio_end_writeback(newfolio); 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci /* 63462306a36Sopenharmony_ci * PG_readahead shares the same bit with PG_reclaim. The above 63562306a36Sopenharmony_ci * end_page_writeback() may clear PG_readahead mistakenly, so set the 63662306a36Sopenharmony_ci * bit after that. 63762306a36Sopenharmony_ci */ 63862306a36Sopenharmony_ci if (folio_test_readahead(folio)) 63962306a36Sopenharmony_ci folio_set_readahead(newfolio); 64062306a36Sopenharmony_ci 64162306a36Sopenharmony_ci folio_copy_owner(newfolio, folio); 64262306a36Sopenharmony_ci 64362306a36Sopenharmony_ci if (!folio_test_hugetlb(folio)) 64462306a36Sopenharmony_ci mem_cgroup_migrate(folio, newfolio); 64562306a36Sopenharmony_ci} 64662306a36Sopenharmony_ciEXPORT_SYMBOL(folio_migrate_flags); 64762306a36Sopenharmony_ci 64862306a36Sopenharmony_civoid folio_migrate_copy(struct folio *newfolio, struct folio *folio) 64962306a36Sopenharmony_ci{ 65062306a36Sopenharmony_ci folio_copy(newfolio, folio); 65162306a36Sopenharmony_ci folio_migrate_flags(newfolio, folio); 65262306a36Sopenharmony_ci} 65362306a36Sopenharmony_ciEXPORT_SYMBOL(folio_migrate_copy); 65462306a36Sopenharmony_ci 65562306a36Sopenharmony_ci/************************************************************ 65662306a36Sopenharmony_ci * Migration functions 65762306a36Sopenharmony_ci ***********************************************************/ 65862306a36Sopenharmony_ci 65962306a36Sopenharmony_ciint migrate_folio_extra(struct address_space *mapping, struct folio *dst, 66062306a36Sopenharmony_ci struct folio *src, enum migrate_mode mode, int extra_count) 66162306a36Sopenharmony_ci{ 66262306a36Sopenharmony_ci int rc; 66362306a36Sopenharmony_ci 66462306a36Sopenharmony_ci BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci rc = folio_migrate_mapping(mapping, dst, src, extra_count); 66762306a36Sopenharmony_ci 66862306a36Sopenharmony_ci if (rc != MIGRATEPAGE_SUCCESS) 66962306a36Sopenharmony_ci return rc; 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci if (mode != MIGRATE_SYNC_NO_COPY) 67262306a36Sopenharmony_ci folio_migrate_copy(dst, src); 67362306a36Sopenharmony_ci else 67462306a36Sopenharmony_ci folio_migrate_flags(dst, src); 67562306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 67662306a36Sopenharmony_ci} 67762306a36Sopenharmony_ci 67862306a36Sopenharmony_ci/** 67962306a36Sopenharmony_ci * migrate_folio() - Simple folio migration. 68062306a36Sopenharmony_ci * @mapping: The address_space containing the folio. 68162306a36Sopenharmony_ci * @dst: The folio to migrate the data to. 68262306a36Sopenharmony_ci * @src: The folio containing the current data. 68362306a36Sopenharmony_ci * @mode: How to migrate the page. 68462306a36Sopenharmony_ci * 68562306a36Sopenharmony_ci * Common logic to directly migrate a single LRU folio suitable for 68662306a36Sopenharmony_ci * folios that do not use PagePrivate/PagePrivate2. 68762306a36Sopenharmony_ci * 68862306a36Sopenharmony_ci * Folios are locked upon entry and exit. 68962306a36Sopenharmony_ci */ 69062306a36Sopenharmony_ciint migrate_folio(struct address_space *mapping, struct folio *dst, 69162306a36Sopenharmony_ci struct folio *src, enum migrate_mode mode) 69262306a36Sopenharmony_ci{ 69362306a36Sopenharmony_ci return migrate_folio_extra(mapping, dst, src, mode, 0); 69462306a36Sopenharmony_ci} 69562306a36Sopenharmony_ciEXPORT_SYMBOL(migrate_folio); 69662306a36Sopenharmony_ci 69762306a36Sopenharmony_ci#ifdef CONFIG_BUFFER_HEAD 69862306a36Sopenharmony_ci/* Returns true if all buffers are successfully locked */ 69962306a36Sopenharmony_cistatic bool buffer_migrate_lock_buffers(struct buffer_head *head, 70062306a36Sopenharmony_ci enum migrate_mode mode) 70162306a36Sopenharmony_ci{ 70262306a36Sopenharmony_ci struct buffer_head *bh = head; 70362306a36Sopenharmony_ci struct buffer_head *failed_bh; 70462306a36Sopenharmony_ci 70562306a36Sopenharmony_ci do { 70662306a36Sopenharmony_ci if (!trylock_buffer(bh)) { 70762306a36Sopenharmony_ci if (mode == MIGRATE_ASYNC) 70862306a36Sopenharmony_ci goto unlock; 70962306a36Sopenharmony_ci if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh)) 71062306a36Sopenharmony_ci goto unlock; 71162306a36Sopenharmony_ci lock_buffer(bh); 71262306a36Sopenharmony_ci } 71362306a36Sopenharmony_ci 71462306a36Sopenharmony_ci bh = bh->b_this_page; 71562306a36Sopenharmony_ci } while (bh != head); 71662306a36Sopenharmony_ci 71762306a36Sopenharmony_ci return true; 71862306a36Sopenharmony_ci 71962306a36Sopenharmony_ciunlock: 72062306a36Sopenharmony_ci /* We failed to lock the buffer and cannot stall. */ 72162306a36Sopenharmony_ci failed_bh = bh; 72262306a36Sopenharmony_ci bh = head; 72362306a36Sopenharmony_ci while (bh != failed_bh) { 72462306a36Sopenharmony_ci unlock_buffer(bh); 72562306a36Sopenharmony_ci bh = bh->b_this_page; 72662306a36Sopenharmony_ci } 72762306a36Sopenharmony_ci 72862306a36Sopenharmony_ci return false; 72962306a36Sopenharmony_ci} 73062306a36Sopenharmony_ci 73162306a36Sopenharmony_cistatic int __buffer_migrate_folio(struct address_space *mapping, 73262306a36Sopenharmony_ci struct folio *dst, struct folio *src, enum migrate_mode mode, 73362306a36Sopenharmony_ci bool check_refs) 73462306a36Sopenharmony_ci{ 73562306a36Sopenharmony_ci struct buffer_head *bh, *head; 73662306a36Sopenharmony_ci int rc; 73762306a36Sopenharmony_ci int expected_count; 73862306a36Sopenharmony_ci 73962306a36Sopenharmony_ci head = folio_buffers(src); 74062306a36Sopenharmony_ci if (!head) 74162306a36Sopenharmony_ci return migrate_folio(mapping, dst, src, mode); 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci /* Check whether page does not have extra refs before we do more work */ 74462306a36Sopenharmony_ci expected_count = folio_expected_refs(mapping, src); 74562306a36Sopenharmony_ci if (folio_ref_count(src) != expected_count) 74662306a36Sopenharmony_ci return -EAGAIN; 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci if (!buffer_migrate_lock_buffers(head, mode)) 74962306a36Sopenharmony_ci return -EAGAIN; 75062306a36Sopenharmony_ci 75162306a36Sopenharmony_ci if (check_refs) { 75262306a36Sopenharmony_ci bool busy; 75362306a36Sopenharmony_ci bool invalidated = false; 75462306a36Sopenharmony_ci 75562306a36Sopenharmony_cirecheck_buffers: 75662306a36Sopenharmony_ci busy = false; 75762306a36Sopenharmony_ci spin_lock(&mapping->private_lock); 75862306a36Sopenharmony_ci bh = head; 75962306a36Sopenharmony_ci do { 76062306a36Sopenharmony_ci if (atomic_read(&bh->b_count)) { 76162306a36Sopenharmony_ci busy = true; 76262306a36Sopenharmony_ci break; 76362306a36Sopenharmony_ci } 76462306a36Sopenharmony_ci bh = bh->b_this_page; 76562306a36Sopenharmony_ci } while (bh != head); 76662306a36Sopenharmony_ci if (busy) { 76762306a36Sopenharmony_ci if (invalidated) { 76862306a36Sopenharmony_ci rc = -EAGAIN; 76962306a36Sopenharmony_ci goto unlock_buffers; 77062306a36Sopenharmony_ci } 77162306a36Sopenharmony_ci spin_unlock(&mapping->private_lock); 77262306a36Sopenharmony_ci invalidate_bh_lrus(); 77362306a36Sopenharmony_ci invalidated = true; 77462306a36Sopenharmony_ci goto recheck_buffers; 77562306a36Sopenharmony_ci } 77662306a36Sopenharmony_ci } 77762306a36Sopenharmony_ci 77862306a36Sopenharmony_ci rc = folio_migrate_mapping(mapping, dst, src, 0); 77962306a36Sopenharmony_ci if (rc != MIGRATEPAGE_SUCCESS) 78062306a36Sopenharmony_ci goto unlock_buffers; 78162306a36Sopenharmony_ci 78262306a36Sopenharmony_ci folio_attach_private(dst, folio_detach_private(src)); 78362306a36Sopenharmony_ci 78462306a36Sopenharmony_ci bh = head; 78562306a36Sopenharmony_ci do { 78662306a36Sopenharmony_ci folio_set_bh(bh, dst, bh_offset(bh)); 78762306a36Sopenharmony_ci bh = bh->b_this_page; 78862306a36Sopenharmony_ci } while (bh != head); 78962306a36Sopenharmony_ci 79062306a36Sopenharmony_ci if (mode != MIGRATE_SYNC_NO_COPY) 79162306a36Sopenharmony_ci folio_migrate_copy(dst, src); 79262306a36Sopenharmony_ci else 79362306a36Sopenharmony_ci folio_migrate_flags(dst, src); 79462306a36Sopenharmony_ci 79562306a36Sopenharmony_ci rc = MIGRATEPAGE_SUCCESS; 79662306a36Sopenharmony_ciunlock_buffers: 79762306a36Sopenharmony_ci if (check_refs) 79862306a36Sopenharmony_ci spin_unlock(&mapping->private_lock); 79962306a36Sopenharmony_ci bh = head; 80062306a36Sopenharmony_ci do { 80162306a36Sopenharmony_ci unlock_buffer(bh); 80262306a36Sopenharmony_ci bh = bh->b_this_page; 80362306a36Sopenharmony_ci } while (bh != head); 80462306a36Sopenharmony_ci 80562306a36Sopenharmony_ci return rc; 80662306a36Sopenharmony_ci} 80762306a36Sopenharmony_ci 80862306a36Sopenharmony_ci/** 80962306a36Sopenharmony_ci * buffer_migrate_folio() - Migration function for folios with buffers. 81062306a36Sopenharmony_ci * @mapping: The address space containing @src. 81162306a36Sopenharmony_ci * @dst: The folio to migrate to. 81262306a36Sopenharmony_ci * @src: The folio to migrate from. 81362306a36Sopenharmony_ci * @mode: How to migrate the folio. 81462306a36Sopenharmony_ci * 81562306a36Sopenharmony_ci * This function can only be used if the underlying filesystem guarantees 81662306a36Sopenharmony_ci * that no other references to @src exist. For example attached buffer 81762306a36Sopenharmony_ci * heads are accessed only under the folio lock. If your filesystem cannot 81862306a36Sopenharmony_ci * provide this guarantee, buffer_migrate_folio_norefs() may be more 81962306a36Sopenharmony_ci * appropriate. 82062306a36Sopenharmony_ci * 82162306a36Sopenharmony_ci * Return: 0 on success or a negative errno on failure. 82262306a36Sopenharmony_ci */ 82362306a36Sopenharmony_ciint buffer_migrate_folio(struct address_space *mapping, 82462306a36Sopenharmony_ci struct folio *dst, struct folio *src, enum migrate_mode mode) 82562306a36Sopenharmony_ci{ 82662306a36Sopenharmony_ci return __buffer_migrate_folio(mapping, dst, src, mode, false); 82762306a36Sopenharmony_ci} 82862306a36Sopenharmony_ciEXPORT_SYMBOL(buffer_migrate_folio); 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci/** 83162306a36Sopenharmony_ci * buffer_migrate_folio_norefs() - Migration function for folios with buffers. 83262306a36Sopenharmony_ci * @mapping: The address space containing @src. 83362306a36Sopenharmony_ci * @dst: The folio to migrate to. 83462306a36Sopenharmony_ci * @src: The folio to migrate from. 83562306a36Sopenharmony_ci * @mode: How to migrate the folio. 83662306a36Sopenharmony_ci * 83762306a36Sopenharmony_ci * Like buffer_migrate_folio() except that this variant is more careful 83862306a36Sopenharmony_ci * and checks that there are also no buffer head references. This function 83962306a36Sopenharmony_ci * is the right one for mappings where buffer heads are directly looked 84062306a36Sopenharmony_ci * up and referenced (such as block device mappings). 84162306a36Sopenharmony_ci * 84262306a36Sopenharmony_ci * Return: 0 on success or a negative errno on failure. 84362306a36Sopenharmony_ci */ 84462306a36Sopenharmony_ciint buffer_migrate_folio_norefs(struct address_space *mapping, 84562306a36Sopenharmony_ci struct folio *dst, struct folio *src, enum migrate_mode mode) 84662306a36Sopenharmony_ci{ 84762306a36Sopenharmony_ci return __buffer_migrate_folio(mapping, dst, src, mode, true); 84862306a36Sopenharmony_ci} 84962306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); 85062306a36Sopenharmony_ci#endif /* CONFIG_BUFFER_HEAD */ 85162306a36Sopenharmony_ci 85262306a36Sopenharmony_ciint filemap_migrate_folio(struct address_space *mapping, 85362306a36Sopenharmony_ci struct folio *dst, struct folio *src, enum migrate_mode mode) 85462306a36Sopenharmony_ci{ 85562306a36Sopenharmony_ci int ret; 85662306a36Sopenharmony_ci 85762306a36Sopenharmony_ci ret = folio_migrate_mapping(mapping, dst, src, 0); 85862306a36Sopenharmony_ci if (ret != MIGRATEPAGE_SUCCESS) 85962306a36Sopenharmony_ci return ret; 86062306a36Sopenharmony_ci 86162306a36Sopenharmony_ci if (folio_get_private(src)) 86262306a36Sopenharmony_ci folio_attach_private(dst, folio_detach_private(src)); 86362306a36Sopenharmony_ci 86462306a36Sopenharmony_ci if (mode != MIGRATE_SYNC_NO_COPY) 86562306a36Sopenharmony_ci folio_migrate_copy(dst, src); 86662306a36Sopenharmony_ci else 86762306a36Sopenharmony_ci folio_migrate_flags(dst, src); 86862306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 86962306a36Sopenharmony_ci} 87062306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(filemap_migrate_folio); 87162306a36Sopenharmony_ci 87262306a36Sopenharmony_ci/* 87362306a36Sopenharmony_ci * Writeback a folio to clean the dirty state 87462306a36Sopenharmony_ci */ 87562306a36Sopenharmony_cistatic int writeout(struct address_space *mapping, struct folio *folio) 87662306a36Sopenharmony_ci{ 87762306a36Sopenharmony_ci struct writeback_control wbc = { 87862306a36Sopenharmony_ci .sync_mode = WB_SYNC_NONE, 87962306a36Sopenharmony_ci .nr_to_write = 1, 88062306a36Sopenharmony_ci .range_start = 0, 88162306a36Sopenharmony_ci .range_end = LLONG_MAX, 88262306a36Sopenharmony_ci .for_reclaim = 1 88362306a36Sopenharmony_ci }; 88462306a36Sopenharmony_ci int rc; 88562306a36Sopenharmony_ci 88662306a36Sopenharmony_ci if (!mapping->a_ops->writepage) 88762306a36Sopenharmony_ci /* No write method for the address space */ 88862306a36Sopenharmony_ci return -EINVAL; 88962306a36Sopenharmony_ci 89062306a36Sopenharmony_ci if (!folio_clear_dirty_for_io(folio)) 89162306a36Sopenharmony_ci /* Someone else already triggered a write */ 89262306a36Sopenharmony_ci return -EAGAIN; 89362306a36Sopenharmony_ci 89462306a36Sopenharmony_ci /* 89562306a36Sopenharmony_ci * A dirty folio may imply that the underlying filesystem has 89662306a36Sopenharmony_ci * the folio on some queue. So the folio must be clean for 89762306a36Sopenharmony_ci * migration. Writeout may mean we lose the lock and the 89862306a36Sopenharmony_ci * folio state is no longer what we checked for earlier. 89962306a36Sopenharmony_ci * At this point we know that the migration attempt cannot 90062306a36Sopenharmony_ci * be successful. 90162306a36Sopenharmony_ci */ 90262306a36Sopenharmony_ci remove_migration_ptes(folio, folio, false); 90362306a36Sopenharmony_ci 90462306a36Sopenharmony_ci rc = mapping->a_ops->writepage(&folio->page, &wbc); 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci if (rc != AOP_WRITEPAGE_ACTIVATE) 90762306a36Sopenharmony_ci /* unlocked. Relock */ 90862306a36Sopenharmony_ci folio_lock(folio); 90962306a36Sopenharmony_ci 91062306a36Sopenharmony_ci return (rc < 0) ? -EIO : -EAGAIN; 91162306a36Sopenharmony_ci} 91262306a36Sopenharmony_ci 91362306a36Sopenharmony_ci/* 91462306a36Sopenharmony_ci * Default handling if a filesystem does not provide a migration function. 91562306a36Sopenharmony_ci */ 91662306a36Sopenharmony_cistatic int fallback_migrate_folio(struct address_space *mapping, 91762306a36Sopenharmony_ci struct folio *dst, struct folio *src, enum migrate_mode mode) 91862306a36Sopenharmony_ci{ 91962306a36Sopenharmony_ci if (folio_test_dirty(src)) { 92062306a36Sopenharmony_ci /* Only writeback folios in full synchronous migration */ 92162306a36Sopenharmony_ci switch (mode) { 92262306a36Sopenharmony_ci case MIGRATE_SYNC: 92362306a36Sopenharmony_ci case MIGRATE_SYNC_NO_COPY: 92462306a36Sopenharmony_ci break; 92562306a36Sopenharmony_ci default: 92662306a36Sopenharmony_ci return -EBUSY; 92762306a36Sopenharmony_ci } 92862306a36Sopenharmony_ci return writeout(mapping, src); 92962306a36Sopenharmony_ci } 93062306a36Sopenharmony_ci 93162306a36Sopenharmony_ci /* 93262306a36Sopenharmony_ci * Buffers may be managed in a filesystem specific way. 93362306a36Sopenharmony_ci * We must have no buffers or drop them. 93462306a36Sopenharmony_ci */ 93562306a36Sopenharmony_ci if (!filemap_release_folio(src, GFP_KERNEL)) 93662306a36Sopenharmony_ci return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; 93762306a36Sopenharmony_ci 93862306a36Sopenharmony_ci return migrate_folio(mapping, dst, src, mode); 93962306a36Sopenharmony_ci} 94062306a36Sopenharmony_ci 94162306a36Sopenharmony_ci/* 94262306a36Sopenharmony_ci * Move a page to a newly allocated page 94362306a36Sopenharmony_ci * The page is locked and all ptes have been successfully removed. 94462306a36Sopenharmony_ci * 94562306a36Sopenharmony_ci * The new page will have replaced the old page if this function 94662306a36Sopenharmony_ci * is successful. 94762306a36Sopenharmony_ci * 94862306a36Sopenharmony_ci * Return value: 94962306a36Sopenharmony_ci * < 0 - error code 95062306a36Sopenharmony_ci * MIGRATEPAGE_SUCCESS - success 95162306a36Sopenharmony_ci */ 95262306a36Sopenharmony_cistatic int move_to_new_folio(struct folio *dst, struct folio *src, 95362306a36Sopenharmony_ci enum migrate_mode mode) 95462306a36Sopenharmony_ci{ 95562306a36Sopenharmony_ci int rc = -EAGAIN; 95662306a36Sopenharmony_ci bool is_lru = !__PageMovable(&src->page); 95762306a36Sopenharmony_ci 95862306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(src), src); 95962306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); 96062306a36Sopenharmony_ci 96162306a36Sopenharmony_ci if (likely(is_lru)) { 96262306a36Sopenharmony_ci struct address_space *mapping = folio_mapping(src); 96362306a36Sopenharmony_ci 96462306a36Sopenharmony_ci if (!mapping) 96562306a36Sopenharmony_ci rc = migrate_folio(mapping, dst, src, mode); 96662306a36Sopenharmony_ci else if (mapping->a_ops->migrate_folio) 96762306a36Sopenharmony_ci /* 96862306a36Sopenharmony_ci * Most folios have a mapping and most filesystems 96962306a36Sopenharmony_ci * provide a migrate_folio callback. Anonymous folios 97062306a36Sopenharmony_ci * are part of swap space which also has its own 97162306a36Sopenharmony_ci * migrate_folio callback. This is the most common path 97262306a36Sopenharmony_ci * for page migration. 97362306a36Sopenharmony_ci */ 97462306a36Sopenharmony_ci rc = mapping->a_ops->migrate_folio(mapping, dst, src, 97562306a36Sopenharmony_ci mode); 97662306a36Sopenharmony_ci else 97762306a36Sopenharmony_ci rc = fallback_migrate_folio(mapping, dst, src, mode); 97862306a36Sopenharmony_ci } else { 97962306a36Sopenharmony_ci const struct movable_operations *mops; 98062306a36Sopenharmony_ci 98162306a36Sopenharmony_ci /* 98262306a36Sopenharmony_ci * In case of non-lru page, it could be released after 98362306a36Sopenharmony_ci * isolation step. In that case, we shouldn't try migration. 98462306a36Sopenharmony_ci */ 98562306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); 98662306a36Sopenharmony_ci if (!folio_test_movable(src)) { 98762306a36Sopenharmony_ci rc = MIGRATEPAGE_SUCCESS; 98862306a36Sopenharmony_ci folio_clear_isolated(src); 98962306a36Sopenharmony_ci goto out; 99062306a36Sopenharmony_ci } 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci mops = folio_movable_ops(src); 99362306a36Sopenharmony_ci rc = mops->migrate_page(&dst->page, &src->page, mode); 99462306a36Sopenharmony_ci WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && 99562306a36Sopenharmony_ci !folio_test_isolated(src)); 99662306a36Sopenharmony_ci } 99762306a36Sopenharmony_ci 99862306a36Sopenharmony_ci /* 99962306a36Sopenharmony_ci * When successful, old pagecache src->mapping must be cleared before 100062306a36Sopenharmony_ci * src is freed; but stats require that PageAnon be left as PageAnon. 100162306a36Sopenharmony_ci */ 100262306a36Sopenharmony_ci if (rc == MIGRATEPAGE_SUCCESS) { 100362306a36Sopenharmony_ci if (__PageMovable(&src->page)) { 100462306a36Sopenharmony_ci VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); 100562306a36Sopenharmony_ci 100662306a36Sopenharmony_ci /* 100762306a36Sopenharmony_ci * We clear PG_movable under page_lock so any compactor 100862306a36Sopenharmony_ci * cannot try to migrate this page. 100962306a36Sopenharmony_ci */ 101062306a36Sopenharmony_ci folio_clear_isolated(src); 101162306a36Sopenharmony_ci } 101262306a36Sopenharmony_ci 101362306a36Sopenharmony_ci /* 101462306a36Sopenharmony_ci * Anonymous and movable src->mapping will be cleared by 101562306a36Sopenharmony_ci * free_pages_prepare so don't reset it here for keeping 101662306a36Sopenharmony_ci * the type to work PageAnon, for example. 101762306a36Sopenharmony_ci */ 101862306a36Sopenharmony_ci if (!folio_mapping_flags(src)) 101962306a36Sopenharmony_ci src->mapping = NULL; 102062306a36Sopenharmony_ci 102162306a36Sopenharmony_ci if (likely(!folio_is_zone_device(dst))) 102262306a36Sopenharmony_ci flush_dcache_folio(dst); 102362306a36Sopenharmony_ci } 102462306a36Sopenharmony_ciout: 102562306a36Sopenharmony_ci return rc; 102662306a36Sopenharmony_ci} 102762306a36Sopenharmony_ci 102862306a36Sopenharmony_ci/* 102962306a36Sopenharmony_ci * To record some information during migration, we use unused private 103062306a36Sopenharmony_ci * field of struct folio of the newly allocated destination folio. 103162306a36Sopenharmony_ci * This is safe because nobody is using it except us. 103262306a36Sopenharmony_ci */ 103362306a36Sopenharmony_cienum { 103462306a36Sopenharmony_ci PAGE_WAS_MAPPED = BIT(0), 103562306a36Sopenharmony_ci PAGE_WAS_MLOCKED = BIT(1), 103662306a36Sopenharmony_ci PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED, 103762306a36Sopenharmony_ci}; 103862306a36Sopenharmony_ci 103962306a36Sopenharmony_cistatic void __migrate_folio_record(struct folio *dst, 104062306a36Sopenharmony_ci int old_page_state, 104162306a36Sopenharmony_ci struct anon_vma *anon_vma) 104262306a36Sopenharmony_ci{ 104362306a36Sopenharmony_ci dst->private = (void *)anon_vma + old_page_state; 104462306a36Sopenharmony_ci} 104562306a36Sopenharmony_ci 104662306a36Sopenharmony_cistatic void __migrate_folio_extract(struct folio *dst, 104762306a36Sopenharmony_ci int *old_page_state, 104862306a36Sopenharmony_ci struct anon_vma **anon_vmap) 104962306a36Sopenharmony_ci{ 105062306a36Sopenharmony_ci unsigned long private = (unsigned long)dst->private; 105162306a36Sopenharmony_ci 105262306a36Sopenharmony_ci *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES); 105362306a36Sopenharmony_ci *old_page_state = private & PAGE_OLD_STATES; 105462306a36Sopenharmony_ci dst->private = NULL; 105562306a36Sopenharmony_ci} 105662306a36Sopenharmony_ci 105762306a36Sopenharmony_ci/* Restore the source folio to the original state upon failure */ 105862306a36Sopenharmony_cistatic void migrate_folio_undo_src(struct folio *src, 105962306a36Sopenharmony_ci int page_was_mapped, 106062306a36Sopenharmony_ci struct anon_vma *anon_vma, 106162306a36Sopenharmony_ci bool locked, 106262306a36Sopenharmony_ci struct list_head *ret) 106362306a36Sopenharmony_ci{ 106462306a36Sopenharmony_ci if (page_was_mapped) 106562306a36Sopenharmony_ci remove_migration_ptes(src, src, false); 106662306a36Sopenharmony_ci /* Drop an anon_vma reference if we took one */ 106762306a36Sopenharmony_ci if (anon_vma) 106862306a36Sopenharmony_ci put_anon_vma(anon_vma); 106962306a36Sopenharmony_ci if (locked) 107062306a36Sopenharmony_ci folio_unlock(src); 107162306a36Sopenharmony_ci if (ret) 107262306a36Sopenharmony_ci list_move_tail(&src->lru, ret); 107362306a36Sopenharmony_ci} 107462306a36Sopenharmony_ci 107562306a36Sopenharmony_ci/* Restore the destination folio to the original state upon failure */ 107662306a36Sopenharmony_cistatic void migrate_folio_undo_dst(struct folio *dst, bool locked, 107762306a36Sopenharmony_ci free_folio_t put_new_folio, unsigned long private) 107862306a36Sopenharmony_ci{ 107962306a36Sopenharmony_ci if (locked) 108062306a36Sopenharmony_ci folio_unlock(dst); 108162306a36Sopenharmony_ci if (put_new_folio) 108262306a36Sopenharmony_ci put_new_folio(dst, private); 108362306a36Sopenharmony_ci else 108462306a36Sopenharmony_ci folio_put(dst); 108562306a36Sopenharmony_ci} 108662306a36Sopenharmony_ci 108762306a36Sopenharmony_ci/* Cleanup src folio upon migration success */ 108862306a36Sopenharmony_cistatic void migrate_folio_done(struct folio *src, 108962306a36Sopenharmony_ci enum migrate_reason reason) 109062306a36Sopenharmony_ci{ 109162306a36Sopenharmony_ci /* 109262306a36Sopenharmony_ci * Compaction can migrate also non-LRU pages which are 109362306a36Sopenharmony_ci * not accounted to NR_ISOLATED_*. They can be recognized 109462306a36Sopenharmony_ci * as __PageMovable 109562306a36Sopenharmony_ci */ 109662306a36Sopenharmony_ci if (likely(!__folio_test_movable(src))) 109762306a36Sopenharmony_ci mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + 109862306a36Sopenharmony_ci folio_is_file_lru(src), -folio_nr_pages(src)); 109962306a36Sopenharmony_ci 110062306a36Sopenharmony_ci if (reason != MR_MEMORY_FAILURE) 110162306a36Sopenharmony_ci /* We release the page in page_handle_poison. */ 110262306a36Sopenharmony_ci folio_put(src); 110362306a36Sopenharmony_ci} 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ci/* Obtain the lock on page, remove all ptes. */ 110662306a36Sopenharmony_cistatic int migrate_folio_unmap(new_folio_t get_new_folio, 110762306a36Sopenharmony_ci free_folio_t put_new_folio, unsigned long private, 110862306a36Sopenharmony_ci struct folio *src, struct folio **dstp, enum migrate_mode mode, 110962306a36Sopenharmony_ci enum migrate_reason reason, struct list_head *ret) 111062306a36Sopenharmony_ci{ 111162306a36Sopenharmony_ci struct folio *dst; 111262306a36Sopenharmony_ci int rc = -EAGAIN; 111362306a36Sopenharmony_ci int old_page_state = 0; 111462306a36Sopenharmony_ci struct anon_vma *anon_vma = NULL; 111562306a36Sopenharmony_ci bool is_lru = !__PageMovable(&src->page); 111662306a36Sopenharmony_ci bool locked = false; 111762306a36Sopenharmony_ci bool dst_locked = false; 111862306a36Sopenharmony_ci 111962306a36Sopenharmony_ci if (folio_ref_count(src) == 1) { 112062306a36Sopenharmony_ci /* Folio was freed from under us. So we are done. */ 112162306a36Sopenharmony_ci folio_clear_active(src); 112262306a36Sopenharmony_ci folio_clear_unevictable(src); 112362306a36Sopenharmony_ci /* free_pages_prepare() will clear PG_isolated. */ 112462306a36Sopenharmony_ci list_del(&src->lru); 112562306a36Sopenharmony_ci migrate_folio_done(src, reason); 112662306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 112762306a36Sopenharmony_ci } 112862306a36Sopenharmony_ci 112962306a36Sopenharmony_ci dst = get_new_folio(src, private); 113062306a36Sopenharmony_ci if (!dst) 113162306a36Sopenharmony_ci return -ENOMEM; 113262306a36Sopenharmony_ci *dstp = dst; 113362306a36Sopenharmony_ci 113462306a36Sopenharmony_ci dst->private = NULL; 113562306a36Sopenharmony_ci 113662306a36Sopenharmony_ci if (!folio_trylock(src)) { 113762306a36Sopenharmony_ci if (mode == MIGRATE_ASYNC) 113862306a36Sopenharmony_ci goto out; 113962306a36Sopenharmony_ci 114062306a36Sopenharmony_ci /* 114162306a36Sopenharmony_ci * It's not safe for direct compaction to call lock_page. 114262306a36Sopenharmony_ci * For example, during page readahead pages are added locked 114362306a36Sopenharmony_ci * to the LRU. Later, when the IO completes the pages are 114462306a36Sopenharmony_ci * marked uptodate and unlocked. However, the queueing 114562306a36Sopenharmony_ci * could be merging multiple pages for one bio (e.g. 114662306a36Sopenharmony_ci * mpage_readahead). If an allocation happens for the 114762306a36Sopenharmony_ci * second or third page, the process can end up locking 114862306a36Sopenharmony_ci * the same page twice and deadlocking. Rather than 114962306a36Sopenharmony_ci * trying to be clever about what pages can be locked, 115062306a36Sopenharmony_ci * avoid the use of lock_page for direct compaction 115162306a36Sopenharmony_ci * altogether. 115262306a36Sopenharmony_ci */ 115362306a36Sopenharmony_ci if (current->flags & PF_MEMALLOC) 115462306a36Sopenharmony_ci goto out; 115562306a36Sopenharmony_ci 115662306a36Sopenharmony_ci /* 115762306a36Sopenharmony_ci * In "light" mode, we can wait for transient locks (eg 115862306a36Sopenharmony_ci * inserting a page into the page table), but it's not 115962306a36Sopenharmony_ci * worth waiting for I/O. 116062306a36Sopenharmony_ci */ 116162306a36Sopenharmony_ci if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src)) 116262306a36Sopenharmony_ci goto out; 116362306a36Sopenharmony_ci 116462306a36Sopenharmony_ci folio_lock(src); 116562306a36Sopenharmony_ci } 116662306a36Sopenharmony_ci locked = true; 116762306a36Sopenharmony_ci if (folio_test_mlocked(src)) 116862306a36Sopenharmony_ci old_page_state |= PAGE_WAS_MLOCKED; 116962306a36Sopenharmony_ci 117062306a36Sopenharmony_ci if (folio_test_writeback(src)) { 117162306a36Sopenharmony_ci /* 117262306a36Sopenharmony_ci * Only in the case of a full synchronous migration is it 117362306a36Sopenharmony_ci * necessary to wait for PageWriteback. In the async case, 117462306a36Sopenharmony_ci * the retry loop is too short and in the sync-light case, 117562306a36Sopenharmony_ci * the overhead of stalling is too much 117662306a36Sopenharmony_ci */ 117762306a36Sopenharmony_ci switch (mode) { 117862306a36Sopenharmony_ci case MIGRATE_SYNC: 117962306a36Sopenharmony_ci case MIGRATE_SYNC_NO_COPY: 118062306a36Sopenharmony_ci break; 118162306a36Sopenharmony_ci default: 118262306a36Sopenharmony_ci rc = -EBUSY; 118362306a36Sopenharmony_ci goto out; 118462306a36Sopenharmony_ci } 118562306a36Sopenharmony_ci folio_wait_writeback(src); 118662306a36Sopenharmony_ci } 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci /* 118962306a36Sopenharmony_ci * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, 119062306a36Sopenharmony_ci * we cannot notice that anon_vma is freed while we migrate a page. 119162306a36Sopenharmony_ci * This get_anon_vma() delays freeing anon_vma pointer until the end 119262306a36Sopenharmony_ci * of migration. File cache pages are no problem because of page_lock() 119362306a36Sopenharmony_ci * File Caches may use write_page() or lock_page() in migration, then, 119462306a36Sopenharmony_ci * just care Anon page here. 119562306a36Sopenharmony_ci * 119662306a36Sopenharmony_ci * Only folio_get_anon_vma() understands the subtleties of 119762306a36Sopenharmony_ci * getting a hold on an anon_vma from outside one of its mms. 119862306a36Sopenharmony_ci * But if we cannot get anon_vma, then we won't need it anyway, 119962306a36Sopenharmony_ci * because that implies that the anon page is no longer mapped 120062306a36Sopenharmony_ci * (and cannot be remapped so long as we hold the page lock). 120162306a36Sopenharmony_ci */ 120262306a36Sopenharmony_ci if (folio_test_anon(src) && !folio_test_ksm(src)) 120362306a36Sopenharmony_ci anon_vma = folio_get_anon_vma(src); 120462306a36Sopenharmony_ci 120562306a36Sopenharmony_ci /* 120662306a36Sopenharmony_ci * Block others from accessing the new page when we get around to 120762306a36Sopenharmony_ci * establishing additional references. We are usually the only one 120862306a36Sopenharmony_ci * holding a reference to dst at this point. We used to have a BUG 120962306a36Sopenharmony_ci * here if folio_trylock(dst) fails, but would like to allow for 121062306a36Sopenharmony_ci * cases where there might be a race with the previous use of dst. 121162306a36Sopenharmony_ci * This is much like races on refcount of oldpage: just don't BUG(). 121262306a36Sopenharmony_ci */ 121362306a36Sopenharmony_ci if (unlikely(!folio_trylock(dst))) 121462306a36Sopenharmony_ci goto out; 121562306a36Sopenharmony_ci dst_locked = true; 121662306a36Sopenharmony_ci 121762306a36Sopenharmony_ci if (unlikely(!is_lru)) { 121862306a36Sopenharmony_ci __migrate_folio_record(dst, old_page_state, anon_vma); 121962306a36Sopenharmony_ci return MIGRATEPAGE_UNMAP; 122062306a36Sopenharmony_ci } 122162306a36Sopenharmony_ci 122262306a36Sopenharmony_ci /* 122362306a36Sopenharmony_ci * Corner case handling: 122462306a36Sopenharmony_ci * 1. When a new swap-cache page is read into, it is added to the LRU 122562306a36Sopenharmony_ci * and treated as swapcache but it has no rmap yet. 122662306a36Sopenharmony_ci * Calling try_to_unmap() against a src->mapping==NULL page will 122762306a36Sopenharmony_ci * trigger a BUG. So handle it here. 122862306a36Sopenharmony_ci * 2. An orphaned page (see truncate_cleanup_page) might have 122962306a36Sopenharmony_ci * fs-private metadata. The page can be picked up due to memory 123062306a36Sopenharmony_ci * offlining. Everywhere else except page reclaim, the page is 123162306a36Sopenharmony_ci * invisible to the vm, so the page can not be migrated. So try to 123262306a36Sopenharmony_ci * free the metadata, so the page can be freed. 123362306a36Sopenharmony_ci */ 123462306a36Sopenharmony_ci if (!src->mapping) { 123562306a36Sopenharmony_ci if (folio_test_private(src)) { 123662306a36Sopenharmony_ci try_to_free_buffers(src); 123762306a36Sopenharmony_ci goto out; 123862306a36Sopenharmony_ci } 123962306a36Sopenharmony_ci } else if (folio_mapped(src)) { 124062306a36Sopenharmony_ci /* Establish migration ptes */ 124162306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_anon(src) && 124262306a36Sopenharmony_ci !folio_test_ksm(src) && !anon_vma, src); 124362306a36Sopenharmony_ci try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); 124462306a36Sopenharmony_ci old_page_state |= PAGE_WAS_MAPPED; 124562306a36Sopenharmony_ci } 124662306a36Sopenharmony_ci 124762306a36Sopenharmony_ci if (!folio_mapped(src)) { 124862306a36Sopenharmony_ci __migrate_folio_record(dst, old_page_state, anon_vma); 124962306a36Sopenharmony_ci return MIGRATEPAGE_UNMAP; 125062306a36Sopenharmony_ci } 125162306a36Sopenharmony_ci 125262306a36Sopenharmony_ciout: 125362306a36Sopenharmony_ci /* 125462306a36Sopenharmony_ci * A folio that has not been unmapped will be restored to 125562306a36Sopenharmony_ci * right list unless we want to retry. 125662306a36Sopenharmony_ci */ 125762306a36Sopenharmony_ci if (rc == -EAGAIN) 125862306a36Sopenharmony_ci ret = NULL; 125962306a36Sopenharmony_ci 126062306a36Sopenharmony_ci migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, 126162306a36Sopenharmony_ci anon_vma, locked, ret); 126262306a36Sopenharmony_ci migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); 126362306a36Sopenharmony_ci 126462306a36Sopenharmony_ci return rc; 126562306a36Sopenharmony_ci} 126662306a36Sopenharmony_ci 126762306a36Sopenharmony_ci/* Migrate the folio to the newly allocated folio in dst. */ 126862306a36Sopenharmony_cistatic int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, 126962306a36Sopenharmony_ci struct folio *src, struct folio *dst, 127062306a36Sopenharmony_ci enum migrate_mode mode, enum migrate_reason reason, 127162306a36Sopenharmony_ci struct list_head *ret) 127262306a36Sopenharmony_ci{ 127362306a36Sopenharmony_ci int rc; 127462306a36Sopenharmony_ci int old_page_state = 0; 127562306a36Sopenharmony_ci struct anon_vma *anon_vma = NULL; 127662306a36Sopenharmony_ci bool is_lru = !__PageMovable(&src->page); 127762306a36Sopenharmony_ci struct list_head *prev; 127862306a36Sopenharmony_ci 127962306a36Sopenharmony_ci __migrate_folio_extract(dst, &old_page_state, &anon_vma); 128062306a36Sopenharmony_ci prev = dst->lru.prev; 128162306a36Sopenharmony_ci list_del(&dst->lru); 128262306a36Sopenharmony_ci 128362306a36Sopenharmony_ci rc = move_to_new_folio(dst, src, mode); 128462306a36Sopenharmony_ci if (rc) 128562306a36Sopenharmony_ci goto out; 128662306a36Sopenharmony_ci 128762306a36Sopenharmony_ci if (unlikely(!is_lru)) 128862306a36Sopenharmony_ci goto out_unlock_both; 128962306a36Sopenharmony_ci 129062306a36Sopenharmony_ci /* 129162306a36Sopenharmony_ci * When successful, push dst to LRU immediately: so that if it 129262306a36Sopenharmony_ci * turns out to be an mlocked page, remove_migration_ptes() will 129362306a36Sopenharmony_ci * automatically build up the correct dst->mlock_count for it. 129462306a36Sopenharmony_ci * 129562306a36Sopenharmony_ci * We would like to do something similar for the old page, when 129662306a36Sopenharmony_ci * unsuccessful, and other cases when a page has been temporarily 129762306a36Sopenharmony_ci * isolated from the unevictable LRU: but this case is the easiest. 129862306a36Sopenharmony_ci */ 129962306a36Sopenharmony_ci folio_add_lru(dst); 130062306a36Sopenharmony_ci if (old_page_state & PAGE_WAS_MLOCKED) 130162306a36Sopenharmony_ci lru_add_drain(); 130262306a36Sopenharmony_ci 130362306a36Sopenharmony_ci if (old_page_state & PAGE_WAS_MAPPED) 130462306a36Sopenharmony_ci remove_migration_ptes(src, dst, false); 130562306a36Sopenharmony_ci 130662306a36Sopenharmony_ciout_unlock_both: 130762306a36Sopenharmony_ci folio_unlock(dst); 130862306a36Sopenharmony_ci set_page_owner_migrate_reason(&dst->page, reason); 130962306a36Sopenharmony_ci /* 131062306a36Sopenharmony_ci * If migration is successful, decrease refcount of dst, 131162306a36Sopenharmony_ci * which will not free the page because new page owner increased 131262306a36Sopenharmony_ci * refcounter. 131362306a36Sopenharmony_ci */ 131462306a36Sopenharmony_ci folio_put(dst); 131562306a36Sopenharmony_ci 131662306a36Sopenharmony_ci /* 131762306a36Sopenharmony_ci * A folio that has been migrated has all references removed 131862306a36Sopenharmony_ci * and will be freed. 131962306a36Sopenharmony_ci */ 132062306a36Sopenharmony_ci list_del(&src->lru); 132162306a36Sopenharmony_ci /* Drop an anon_vma reference if we took one */ 132262306a36Sopenharmony_ci if (anon_vma) 132362306a36Sopenharmony_ci put_anon_vma(anon_vma); 132462306a36Sopenharmony_ci folio_unlock(src); 132562306a36Sopenharmony_ci migrate_folio_done(src, reason); 132662306a36Sopenharmony_ci 132762306a36Sopenharmony_ci return rc; 132862306a36Sopenharmony_ciout: 132962306a36Sopenharmony_ci /* 133062306a36Sopenharmony_ci * A folio that has not been migrated will be restored to 133162306a36Sopenharmony_ci * right list unless we want to retry. 133262306a36Sopenharmony_ci */ 133362306a36Sopenharmony_ci if (rc == -EAGAIN) { 133462306a36Sopenharmony_ci list_add(&dst->lru, prev); 133562306a36Sopenharmony_ci __migrate_folio_record(dst, old_page_state, anon_vma); 133662306a36Sopenharmony_ci return rc; 133762306a36Sopenharmony_ci } 133862306a36Sopenharmony_ci 133962306a36Sopenharmony_ci migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, 134062306a36Sopenharmony_ci anon_vma, true, ret); 134162306a36Sopenharmony_ci migrate_folio_undo_dst(dst, true, put_new_folio, private); 134262306a36Sopenharmony_ci 134362306a36Sopenharmony_ci return rc; 134462306a36Sopenharmony_ci} 134562306a36Sopenharmony_ci 134662306a36Sopenharmony_ci/* 134762306a36Sopenharmony_ci * Counterpart of unmap_and_move_page() for hugepage migration. 134862306a36Sopenharmony_ci * 134962306a36Sopenharmony_ci * This function doesn't wait the completion of hugepage I/O 135062306a36Sopenharmony_ci * because there is no race between I/O and migration for hugepage. 135162306a36Sopenharmony_ci * Note that currently hugepage I/O occurs only in direct I/O 135262306a36Sopenharmony_ci * where no lock is held and PG_writeback is irrelevant, 135362306a36Sopenharmony_ci * and writeback status of all subpages are counted in the reference 135462306a36Sopenharmony_ci * count of the head page (i.e. if all subpages of a 2MB hugepage are 135562306a36Sopenharmony_ci * under direct I/O, the reference of the head page is 512 and a bit more.) 135662306a36Sopenharmony_ci * This means that when we try to migrate hugepage whose subpages are 135762306a36Sopenharmony_ci * doing direct I/O, some references remain after try_to_unmap() and 135862306a36Sopenharmony_ci * hugepage migration fails without data corruption. 135962306a36Sopenharmony_ci * 136062306a36Sopenharmony_ci * There is also no race when direct I/O is issued on the page under migration, 136162306a36Sopenharmony_ci * because then pte is replaced with migration swap entry and direct I/O code 136262306a36Sopenharmony_ci * will wait in the page fault for migration to complete. 136362306a36Sopenharmony_ci */ 136462306a36Sopenharmony_cistatic int unmap_and_move_huge_page(new_folio_t get_new_folio, 136562306a36Sopenharmony_ci free_folio_t put_new_folio, unsigned long private, 136662306a36Sopenharmony_ci struct folio *src, int force, enum migrate_mode mode, 136762306a36Sopenharmony_ci int reason, struct list_head *ret) 136862306a36Sopenharmony_ci{ 136962306a36Sopenharmony_ci struct folio *dst; 137062306a36Sopenharmony_ci int rc = -EAGAIN; 137162306a36Sopenharmony_ci int page_was_mapped = 0; 137262306a36Sopenharmony_ci struct anon_vma *anon_vma = NULL; 137362306a36Sopenharmony_ci struct address_space *mapping = NULL; 137462306a36Sopenharmony_ci 137562306a36Sopenharmony_ci if (folio_ref_count(src) == 1) { 137662306a36Sopenharmony_ci /* page was freed from under us. So we are done. */ 137762306a36Sopenharmony_ci folio_putback_active_hugetlb(src); 137862306a36Sopenharmony_ci return MIGRATEPAGE_SUCCESS; 137962306a36Sopenharmony_ci } 138062306a36Sopenharmony_ci 138162306a36Sopenharmony_ci dst = get_new_folio(src, private); 138262306a36Sopenharmony_ci if (!dst) 138362306a36Sopenharmony_ci return -ENOMEM; 138462306a36Sopenharmony_ci 138562306a36Sopenharmony_ci if (!folio_trylock(src)) { 138662306a36Sopenharmony_ci if (!force) 138762306a36Sopenharmony_ci goto out; 138862306a36Sopenharmony_ci switch (mode) { 138962306a36Sopenharmony_ci case MIGRATE_SYNC: 139062306a36Sopenharmony_ci case MIGRATE_SYNC_NO_COPY: 139162306a36Sopenharmony_ci break; 139262306a36Sopenharmony_ci default: 139362306a36Sopenharmony_ci goto out; 139462306a36Sopenharmony_ci } 139562306a36Sopenharmony_ci folio_lock(src); 139662306a36Sopenharmony_ci } 139762306a36Sopenharmony_ci 139862306a36Sopenharmony_ci /* 139962306a36Sopenharmony_ci * Check for pages which are in the process of being freed. Without 140062306a36Sopenharmony_ci * folio_mapping() set, hugetlbfs specific move page routine will not 140162306a36Sopenharmony_ci * be called and we could leak usage counts for subpools. 140262306a36Sopenharmony_ci */ 140362306a36Sopenharmony_ci if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { 140462306a36Sopenharmony_ci rc = -EBUSY; 140562306a36Sopenharmony_ci goto out_unlock; 140662306a36Sopenharmony_ci } 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci if (folio_test_anon(src)) 140962306a36Sopenharmony_ci anon_vma = folio_get_anon_vma(src); 141062306a36Sopenharmony_ci 141162306a36Sopenharmony_ci if (unlikely(!folio_trylock(dst))) 141262306a36Sopenharmony_ci goto put_anon; 141362306a36Sopenharmony_ci 141462306a36Sopenharmony_ci if (folio_mapped(src)) { 141562306a36Sopenharmony_ci enum ttu_flags ttu = 0; 141662306a36Sopenharmony_ci 141762306a36Sopenharmony_ci if (!folio_test_anon(src)) { 141862306a36Sopenharmony_ci /* 141962306a36Sopenharmony_ci * In shared mappings, try_to_unmap could potentially 142062306a36Sopenharmony_ci * call huge_pmd_unshare. Because of this, take 142162306a36Sopenharmony_ci * semaphore in write mode here and set TTU_RMAP_LOCKED 142262306a36Sopenharmony_ci * to let lower levels know we have taken the lock. 142362306a36Sopenharmony_ci */ 142462306a36Sopenharmony_ci mapping = hugetlb_page_mapping_lock_write(&src->page); 142562306a36Sopenharmony_ci if (unlikely(!mapping)) 142662306a36Sopenharmony_ci goto unlock_put_anon; 142762306a36Sopenharmony_ci 142862306a36Sopenharmony_ci ttu = TTU_RMAP_LOCKED; 142962306a36Sopenharmony_ci } 143062306a36Sopenharmony_ci 143162306a36Sopenharmony_ci try_to_migrate(src, ttu); 143262306a36Sopenharmony_ci page_was_mapped = 1; 143362306a36Sopenharmony_ci 143462306a36Sopenharmony_ci if (ttu & TTU_RMAP_LOCKED) 143562306a36Sopenharmony_ci i_mmap_unlock_write(mapping); 143662306a36Sopenharmony_ci } 143762306a36Sopenharmony_ci 143862306a36Sopenharmony_ci if (!folio_mapped(src)) 143962306a36Sopenharmony_ci rc = move_to_new_folio(dst, src, mode); 144062306a36Sopenharmony_ci 144162306a36Sopenharmony_ci if (page_was_mapped) 144262306a36Sopenharmony_ci remove_migration_ptes(src, 144362306a36Sopenharmony_ci rc == MIGRATEPAGE_SUCCESS ? dst : src, false); 144462306a36Sopenharmony_ci 144562306a36Sopenharmony_ciunlock_put_anon: 144662306a36Sopenharmony_ci folio_unlock(dst); 144762306a36Sopenharmony_ci 144862306a36Sopenharmony_ciput_anon: 144962306a36Sopenharmony_ci if (anon_vma) 145062306a36Sopenharmony_ci put_anon_vma(anon_vma); 145162306a36Sopenharmony_ci 145262306a36Sopenharmony_ci if (rc == MIGRATEPAGE_SUCCESS) { 145362306a36Sopenharmony_ci move_hugetlb_state(src, dst, reason); 145462306a36Sopenharmony_ci put_new_folio = NULL; 145562306a36Sopenharmony_ci } 145662306a36Sopenharmony_ci 145762306a36Sopenharmony_ciout_unlock: 145862306a36Sopenharmony_ci folio_unlock(src); 145962306a36Sopenharmony_ciout: 146062306a36Sopenharmony_ci if (rc == MIGRATEPAGE_SUCCESS) 146162306a36Sopenharmony_ci folio_putback_active_hugetlb(src); 146262306a36Sopenharmony_ci else if (rc != -EAGAIN) 146362306a36Sopenharmony_ci list_move_tail(&src->lru, ret); 146462306a36Sopenharmony_ci 146562306a36Sopenharmony_ci /* 146662306a36Sopenharmony_ci * If migration was not successful and there's a freeing callback, use 146762306a36Sopenharmony_ci * it. Otherwise, put_page() will drop the reference grabbed during 146862306a36Sopenharmony_ci * isolation. 146962306a36Sopenharmony_ci */ 147062306a36Sopenharmony_ci if (put_new_folio) 147162306a36Sopenharmony_ci put_new_folio(dst, private); 147262306a36Sopenharmony_ci else 147362306a36Sopenharmony_ci folio_putback_active_hugetlb(dst); 147462306a36Sopenharmony_ci 147562306a36Sopenharmony_ci return rc; 147662306a36Sopenharmony_ci} 147762306a36Sopenharmony_ci 147862306a36Sopenharmony_cistatic inline int try_split_folio(struct folio *folio, struct list_head *split_folios) 147962306a36Sopenharmony_ci{ 148062306a36Sopenharmony_ci int rc; 148162306a36Sopenharmony_ci 148262306a36Sopenharmony_ci folio_lock(folio); 148362306a36Sopenharmony_ci rc = split_folio_to_list(folio, split_folios); 148462306a36Sopenharmony_ci folio_unlock(folio); 148562306a36Sopenharmony_ci if (!rc) 148662306a36Sopenharmony_ci list_move_tail(&folio->lru, split_folios); 148762306a36Sopenharmony_ci 148862306a36Sopenharmony_ci return rc; 148962306a36Sopenharmony_ci} 149062306a36Sopenharmony_ci 149162306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 149262306a36Sopenharmony_ci#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR 149362306a36Sopenharmony_ci#else 149462306a36Sopenharmony_ci#define NR_MAX_BATCHED_MIGRATION 512 149562306a36Sopenharmony_ci#endif 149662306a36Sopenharmony_ci#define NR_MAX_MIGRATE_PAGES_RETRY 10 149762306a36Sopenharmony_ci#define NR_MAX_MIGRATE_ASYNC_RETRY 3 149862306a36Sopenharmony_ci#define NR_MAX_MIGRATE_SYNC_RETRY \ 149962306a36Sopenharmony_ci (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY) 150062306a36Sopenharmony_ci 150162306a36Sopenharmony_cistruct migrate_pages_stats { 150262306a36Sopenharmony_ci int nr_succeeded; /* Normal and large folios migrated successfully, in 150362306a36Sopenharmony_ci units of base pages */ 150462306a36Sopenharmony_ci int nr_failed_pages; /* Normal and large folios failed to be migrated, in 150562306a36Sopenharmony_ci units of base pages. Untried folios aren't counted */ 150662306a36Sopenharmony_ci int nr_thp_succeeded; /* THP migrated successfully */ 150762306a36Sopenharmony_ci int nr_thp_failed; /* THP failed to be migrated */ 150862306a36Sopenharmony_ci int nr_thp_split; /* THP split before migrating */ 150962306a36Sopenharmony_ci}; 151062306a36Sopenharmony_ci 151162306a36Sopenharmony_ci/* 151262306a36Sopenharmony_ci * Returns the number of hugetlb folios that were not migrated, or an error code 151362306a36Sopenharmony_ci * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable 151462306a36Sopenharmony_ci * any more because the list has become empty or no retryable hugetlb folios 151562306a36Sopenharmony_ci * exist any more. It is caller's responsibility to call putback_movable_pages() 151662306a36Sopenharmony_ci * only if ret != 0. 151762306a36Sopenharmony_ci */ 151862306a36Sopenharmony_cistatic int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, 151962306a36Sopenharmony_ci free_folio_t put_new_folio, unsigned long private, 152062306a36Sopenharmony_ci enum migrate_mode mode, int reason, 152162306a36Sopenharmony_ci struct migrate_pages_stats *stats, 152262306a36Sopenharmony_ci struct list_head *ret_folios) 152362306a36Sopenharmony_ci{ 152462306a36Sopenharmony_ci int retry = 1; 152562306a36Sopenharmony_ci int nr_failed = 0; 152662306a36Sopenharmony_ci int nr_retry_pages = 0; 152762306a36Sopenharmony_ci int pass = 0; 152862306a36Sopenharmony_ci struct folio *folio, *folio2; 152962306a36Sopenharmony_ci int rc, nr_pages; 153062306a36Sopenharmony_ci 153162306a36Sopenharmony_ci for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) { 153262306a36Sopenharmony_ci retry = 0; 153362306a36Sopenharmony_ci nr_retry_pages = 0; 153462306a36Sopenharmony_ci 153562306a36Sopenharmony_ci list_for_each_entry_safe(folio, folio2, from, lru) { 153662306a36Sopenharmony_ci if (!folio_test_hugetlb(folio)) 153762306a36Sopenharmony_ci continue; 153862306a36Sopenharmony_ci 153962306a36Sopenharmony_ci nr_pages = folio_nr_pages(folio); 154062306a36Sopenharmony_ci 154162306a36Sopenharmony_ci cond_resched(); 154262306a36Sopenharmony_ci 154362306a36Sopenharmony_ci /* 154462306a36Sopenharmony_ci * Migratability of hugepages depends on architectures and 154562306a36Sopenharmony_ci * their size. This check is necessary because some callers 154662306a36Sopenharmony_ci * of hugepage migration like soft offline and memory 154762306a36Sopenharmony_ci * hotremove don't walk through page tables or check whether 154862306a36Sopenharmony_ci * the hugepage is pmd-based or not before kicking migration. 154962306a36Sopenharmony_ci */ 155062306a36Sopenharmony_ci if (!hugepage_migration_supported(folio_hstate(folio))) { 155162306a36Sopenharmony_ci nr_failed++; 155262306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages; 155362306a36Sopenharmony_ci list_move_tail(&folio->lru, ret_folios); 155462306a36Sopenharmony_ci continue; 155562306a36Sopenharmony_ci } 155662306a36Sopenharmony_ci 155762306a36Sopenharmony_ci rc = unmap_and_move_huge_page(get_new_folio, 155862306a36Sopenharmony_ci put_new_folio, private, 155962306a36Sopenharmony_ci folio, pass > 2, mode, 156062306a36Sopenharmony_ci reason, ret_folios); 156162306a36Sopenharmony_ci /* 156262306a36Sopenharmony_ci * The rules are: 156362306a36Sopenharmony_ci * Success: hugetlb folio will be put back 156462306a36Sopenharmony_ci * -EAGAIN: stay on the from list 156562306a36Sopenharmony_ci * -ENOMEM: stay on the from list 156662306a36Sopenharmony_ci * Other errno: put on ret_folios list 156762306a36Sopenharmony_ci */ 156862306a36Sopenharmony_ci switch(rc) { 156962306a36Sopenharmony_ci case -ENOMEM: 157062306a36Sopenharmony_ci /* 157162306a36Sopenharmony_ci * When memory is low, don't bother to try to migrate 157262306a36Sopenharmony_ci * other folios, just exit. 157362306a36Sopenharmony_ci */ 157462306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages + nr_retry_pages; 157562306a36Sopenharmony_ci return -ENOMEM; 157662306a36Sopenharmony_ci case -EAGAIN: 157762306a36Sopenharmony_ci retry++; 157862306a36Sopenharmony_ci nr_retry_pages += nr_pages; 157962306a36Sopenharmony_ci break; 158062306a36Sopenharmony_ci case MIGRATEPAGE_SUCCESS: 158162306a36Sopenharmony_ci stats->nr_succeeded += nr_pages; 158262306a36Sopenharmony_ci break; 158362306a36Sopenharmony_ci default: 158462306a36Sopenharmony_ci /* 158562306a36Sopenharmony_ci * Permanent failure (-EBUSY, etc.): 158662306a36Sopenharmony_ci * unlike -EAGAIN case, the failed folio is 158762306a36Sopenharmony_ci * removed from migration folio list and not 158862306a36Sopenharmony_ci * retried in the next outer loop. 158962306a36Sopenharmony_ci */ 159062306a36Sopenharmony_ci nr_failed++; 159162306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages; 159262306a36Sopenharmony_ci break; 159362306a36Sopenharmony_ci } 159462306a36Sopenharmony_ci } 159562306a36Sopenharmony_ci } 159662306a36Sopenharmony_ci /* 159762306a36Sopenharmony_ci * nr_failed is number of hugetlb folios failed to be migrated. After 159862306a36Sopenharmony_ci * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb 159962306a36Sopenharmony_ci * folios as failed. 160062306a36Sopenharmony_ci */ 160162306a36Sopenharmony_ci nr_failed += retry; 160262306a36Sopenharmony_ci stats->nr_failed_pages += nr_retry_pages; 160362306a36Sopenharmony_ci 160462306a36Sopenharmony_ci return nr_failed; 160562306a36Sopenharmony_ci} 160662306a36Sopenharmony_ci 160762306a36Sopenharmony_ci/* 160862306a36Sopenharmony_ci * migrate_pages_batch() first unmaps folios in the from list as many as 160962306a36Sopenharmony_ci * possible, then move the unmapped folios. 161062306a36Sopenharmony_ci * 161162306a36Sopenharmony_ci * We only batch migration if mode == MIGRATE_ASYNC to avoid to wait a 161262306a36Sopenharmony_ci * lock or bit when we have locked more than one folio. Which may cause 161362306a36Sopenharmony_ci * deadlock (e.g., for loop device). So, if mode != MIGRATE_ASYNC, the 161462306a36Sopenharmony_ci * length of the from list must be <= 1. 161562306a36Sopenharmony_ci */ 161662306a36Sopenharmony_cistatic int migrate_pages_batch(struct list_head *from, 161762306a36Sopenharmony_ci new_folio_t get_new_folio, free_folio_t put_new_folio, 161862306a36Sopenharmony_ci unsigned long private, enum migrate_mode mode, int reason, 161962306a36Sopenharmony_ci struct list_head *ret_folios, struct list_head *split_folios, 162062306a36Sopenharmony_ci struct migrate_pages_stats *stats, int nr_pass) 162162306a36Sopenharmony_ci{ 162262306a36Sopenharmony_ci int retry = 1; 162362306a36Sopenharmony_ci int thp_retry = 1; 162462306a36Sopenharmony_ci int nr_failed = 0; 162562306a36Sopenharmony_ci int nr_retry_pages = 0; 162662306a36Sopenharmony_ci int pass = 0; 162762306a36Sopenharmony_ci bool is_thp = false; 162862306a36Sopenharmony_ci struct folio *folio, *folio2, *dst = NULL, *dst2; 162962306a36Sopenharmony_ci int rc, rc_saved = 0, nr_pages; 163062306a36Sopenharmony_ci LIST_HEAD(unmap_folios); 163162306a36Sopenharmony_ci LIST_HEAD(dst_folios); 163262306a36Sopenharmony_ci bool nosplit = (reason == MR_NUMA_MISPLACED); 163362306a36Sopenharmony_ci 163462306a36Sopenharmony_ci VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC && 163562306a36Sopenharmony_ci !list_empty(from) && !list_is_singular(from)); 163662306a36Sopenharmony_ci 163762306a36Sopenharmony_ci for (pass = 0; pass < nr_pass && retry; pass++) { 163862306a36Sopenharmony_ci retry = 0; 163962306a36Sopenharmony_ci thp_retry = 0; 164062306a36Sopenharmony_ci nr_retry_pages = 0; 164162306a36Sopenharmony_ci 164262306a36Sopenharmony_ci list_for_each_entry_safe(folio, folio2, from, lru) { 164362306a36Sopenharmony_ci is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); 164462306a36Sopenharmony_ci nr_pages = folio_nr_pages(folio); 164562306a36Sopenharmony_ci 164662306a36Sopenharmony_ci cond_resched(); 164762306a36Sopenharmony_ci 164862306a36Sopenharmony_ci /* 164962306a36Sopenharmony_ci * Large folio migration might be unsupported or 165062306a36Sopenharmony_ci * the allocation might be failed so we should retry 165162306a36Sopenharmony_ci * on the same folio with the large folio split 165262306a36Sopenharmony_ci * to normal folios. 165362306a36Sopenharmony_ci * 165462306a36Sopenharmony_ci * Split folios are put in split_folios, and 165562306a36Sopenharmony_ci * we will migrate them after the rest of the 165662306a36Sopenharmony_ci * list is processed. 165762306a36Sopenharmony_ci */ 165862306a36Sopenharmony_ci if (!thp_migration_supported() && is_thp) { 165962306a36Sopenharmony_ci nr_failed++; 166062306a36Sopenharmony_ci stats->nr_thp_failed++; 166162306a36Sopenharmony_ci if (!try_split_folio(folio, split_folios)) { 166262306a36Sopenharmony_ci stats->nr_thp_split++; 166362306a36Sopenharmony_ci continue; 166462306a36Sopenharmony_ci } 166562306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages; 166662306a36Sopenharmony_ci list_move_tail(&folio->lru, ret_folios); 166762306a36Sopenharmony_ci continue; 166862306a36Sopenharmony_ci } 166962306a36Sopenharmony_ci 167062306a36Sopenharmony_ci rc = migrate_folio_unmap(get_new_folio, put_new_folio, 167162306a36Sopenharmony_ci private, folio, &dst, mode, reason, 167262306a36Sopenharmony_ci ret_folios); 167362306a36Sopenharmony_ci /* 167462306a36Sopenharmony_ci * The rules are: 167562306a36Sopenharmony_ci * Success: folio will be freed 167662306a36Sopenharmony_ci * Unmap: folio will be put on unmap_folios list, 167762306a36Sopenharmony_ci * dst folio put on dst_folios list 167862306a36Sopenharmony_ci * -EAGAIN: stay on the from list 167962306a36Sopenharmony_ci * -ENOMEM: stay on the from list 168062306a36Sopenharmony_ci * Other errno: put on ret_folios list 168162306a36Sopenharmony_ci */ 168262306a36Sopenharmony_ci switch(rc) { 168362306a36Sopenharmony_ci case -ENOMEM: 168462306a36Sopenharmony_ci /* 168562306a36Sopenharmony_ci * When memory is low, don't bother to try to migrate 168662306a36Sopenharmony_ci * other folios, move unmapped folios, then exit. 168762306a36Sopenharmony_ci */ 168862306a36Sopenharmony_ci nr_failed++; 168962306a36Sopenharmony_ci stats->nr_thp_failed += is_thp; 169062306a36Sopenharmony_ci /* Large folio NUMA faulting doesn't split to retry. */ 169162306a36Sopenharmony_ci if (folio_test_large(folio) && !nosplit) { 169262306a36Sopenharmony_ci int ret = try_split_folio(folio, split_folios); 169362306a36Sopenharmony_ci 169462306a36Sopenharmony_ci if (!ret) { 169562306a36Sopenharmony_ci stats->nr_thp_split += is_thp; 169662306a36Sopenharmony_ci break; 169762306a36Sopenharmony_ci } else if (reason == MR_LONGTERM_PIN && 169862306a36Sopenharmony_ci ret == -EAGAIN) { 169962306a36Sopenharmony_ci /* 170062306a36Sopenharmony_ci * Try again to split large folio to 170162306a36Sopenharmony_ci * mitigate the failure of longterm pinning. 170262306a36Sopenharmony_ci */ 170362306a36Sopenharmony_ci retry++; 170462306a36Sopenharmony_ci thp_retry += is_thp; 170562306a36Sopenharmony_ci nr_retry_pages += nr_pages; 170662306a36Sopenharmony_ci /* Undo duplicated failure counting. */ 170762306a36Sopenharmony_ci nr_failed--; 170862306a36Sopenharmony_ci stats->nr_thp_failed -= is_thp; 170962306a36Sopenharmony_ci break; 171062306a36Sopenharmony_ci } 171162306a36Sopenharmony_ci } 171262306a36Sopenharmony_ci 171362306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages + nr_retry_pages; 171462306a36Sopenharmony_ci /* nr_failed isn't updated for not used */ 171562306a36Sopenharmony_ci stats->nr_thp_failed += thp_retry; 171662306a36Sopenharmony_ci rc_saved = rc; 171762306a36Sopenharmony_ci if (list_empty(&unmap_folios)) 171862306a36Sopenharmony_ci goto out; 171962306a36Sopenharmony_ci else 172062306a36Sopenharmony_ci goto move; 172162306a36Sopenharmony_ci case -EAGAIN: 172262306a36Sopenharmony_ci retry++; 172362306a36Sopenharmony_ci thp_retry += is_thp; 172462306a36Sopenharmony_ci nr_retry_pages += nr_pages; 172562306a36Sopenharmony_ci break; 172662306a36Sopenharmony_ci case MIGRATEPAGE_SUCCESS: 172762306a36Sopenharmony_ci stats->nr_succeeded += nr_pages; 172862306a36Sopenharmony_ci stats->nr_thp_succeeded += is_thp; 172962306a36Sopenharmony_ci break; 173062306a36Sopenharmony_ci case MIGRATEPAGE_UNMAP: 173162306a36Sopenharmony_ci list_move_tail(&folio->lru, &unmap_folios); 173262306a36Sopenharmony_ci list_add_tail(&dst->lru, &dst_folios); 173362306a36Sopenharmony_ci break; 173462306a36Sopenharmony_ci default: 173562306a36Sopenharmony_ci /* 173662306a36Sopenharmony_ci * Permanent failure (-EBUSY, etc.): 173762306a36Sopenharmony_ci * unlike -EAGAIN case, the failed folio is 173862306a36Sopenharmony_ci * removed from migration folio list and not 173962306a36Sopenharmony_ci * retried in the next outer loop. 174062306a36Sopenharmony_ci */ 174162306a36Sopenharmony_ci nr_failed++; 174262306a36Sopenharmony_ci stats->nr_thp_failed += is_thp; 174362306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages; 174462306a36Sopenharmony_ci break; 174562306a36Sopenharmony_ci } 174662306a36Sopenharmony_ci } 174762306a36Sopenharmony_ci } 174862306a36Sopenharmony_ci nr_failed += retry; 174962306a36Sopenharmony_ci stats->nr_thp_failed += thp_retry; 175062306a36Sopenharmony_ci stats->nr_failed_pages += nr_retry_pages; 175162306a36Sopenharmony_cimove: 175262306a36Sopenharmony_ci /* Flush TLBs for all unmapped folios */ 175362306a36Sopenharmony_ci try_to_unmap_flush(); 175462306a36Sopenharmony_ci 175562306a36Sopenharmony_ci retry = 1; 175662306a36Sopenharmony_ci for (pass = 0; pass < nr_pass && retry; pass++) { 175762306a36Sopenharmony_ci retry = 0; 175862306a36Sopenharmony_ci thp_retry = 0; 175962306a36Sopenharmony_ci nr_retry_pages = 0; 176062306a36Sopenharmony_ci 176162306a36Sopenharmony_ci dst = list_first_entry(&dst_folios, struct folio, lru); 176262306a36Sopenharmony_ci dst2 = list_next_entry(dst, lru); 176362306a36Sopenharmony_ci list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { 176462306a36Sopenharmony_ci is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); 176562306a36Sopenharmony_ci nr_pages = folio_nr_pages(folio); 176662306a36Sopenharmony_ci 176762306a36Sopenharmony_ci cond_resched(); 176862306a36Sopenharmony_ci 176962306a36Sopenharmony_ci rc = migrate_folio_move(put_new_folio, private, 177062306a36Sopenharmony_ci folio, dst, mode, 177162306a36Sopenharmony_ci reason, ret_folios); 177262306a36Sopenharmony_ci /* 177362306a36Sopenharmony_ci * The rules are: 177462306a36Sopenharmony_ci * Success: folio will be freed 177562306a36Sopenharmony_ci * -EAGAIN: stay on the unmap_folios list 177662306a36Sopenharmony_ci * Other errno: put on ret_folios list 177762306a36Sopenharmony_ci */ 177862306a36Sopenharmony_ci switch(rc) { 177962306a36Sopenharmony_ci case -EAGAIN: 178062306a36Sopenharmony_ci retry++; 178162306a36Sopenharmony_ci thp_retry += is_thp; 178262306a36Sopenharmony_ci nr_retry_pages += nr_pages; 178362306a36Sopenharmony_ci break; 178462306a36Sopenharmony_ci case MIGRATEPAGE_SUCCESS: 178562306a36Sopenharmony_ci stats->nr_succeeded += nr_pages; 178662306a36Sopenharmony_ci stats->nr_thp_succeeded += is_thp; 178762306a36Sopenharmony_ci break; 178862306a36Sopenharmony_ci default: 178962306a36Sopenharmony_ci nr_failed++; 179062306a36Sopenharmony_ci stats->nr_thp_failed += is_thp; 179162306a36Sopenharmony_ci stats->nr_failed_pages += nr_pages; 179262306a36Sopenharmony_ci break; 179362306a36Sopenharmony_ci } 179462306a36Sopenharmony_ci dst = dst2; 179562306a36Sopenharmony_ci dst2 = list_next_entry(dst, lru); 179662306a36Sopenharmony_ci } 179762306a36Sopenharmony_ci } 179862306a36Sopenharmony_ci nr_failed += retry; 179962306a36Sopenharmony_ci stats->nr_thp_failed += thp_retry; 180062306a36Sopenharmony_ci stats->nr_failed_pages += nr_retry_pages; 180162306a36Sopenharmony_ci 180262306a36Sopenharmony_ci rc = rc_saved ? : nr_failed; 180362306a36Sopenharmony_ciout: 180462306a36Sopenharmony_ci /* Cleanup remaining folios */ 180562306a36Sopenharmony_ci dst = list_first_entry(&dst_folios, struct folio, lru); 180662306a36Sopenharmony_ci dst2 = list_next_entry(dst, lru); 180762306a36Sopenharmony_ci list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { 180862306a36Sopenharmony_ci int old_page_state = 0; 180962306a36Sopenharmony_ci struct anon_vma *anon_vma = NULL; 181062306a36Sopenharmony_ci 181162306a36Sopenharmony_ci __migrate_folio_extract(dst, &old_page_state, &anon_vma); 181262306a36Sopenharmony_ci migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, 181362306a36Sopenharmony_ci anon_vma, true, ret_folios); 181462306a36Sopenharmony_ci list_del(&dst->lru); 181562306a36Sopenharmony_ci migrate_folio_undo_dst(dst, true, put_new_folio, private); 181662306a36Sopenharmony_ci dst = dst2; 181762306a36Sopenharmony_ci dst2 = list_next_entry(dst, lru); 181862306a36Sopenharmony_ci } 181962306a36Sopenharmony_ci 182062306a36Sopenharmony_ci return rc; 182162306a36Sopenharmony_ci} 182262306a36Sopenharmony_ci 182362306a36Sopenharmony_cistatic int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, 182462306a36Sopenharmony_ci free_folio_t put_new_folio, unsigned long private, 182562306a36Sopenharmony_ci enum migrate_mode mode, int reason, 182662306a36Sopenharmony_ci struct list_head *ret_folios, struct list_head *split_folios, 182762306a36Sopenharmony_ci struct migrate_pages_stats *stats) 182862306a36Sopenharmony_ci{ 182962306a36Sopenharmony_ci int rc, nr_failed = 0; 183062306a36Sopenharmony_ci LIST_HEAD(folios); 183162306a36Sopenharmony_ci struct migrate_pages_stats astats; 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci memset(&astats, 0, sizeof(astats)); 183462306a36Sopenharmony_ci /* Try to migrate in batch with MIGRATE_ASYNC mode firstly */ 183562306a36Sopenharmony_ci rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC, 183662306a36Sopenharmony_ci reason, &folios, split_folios, &astats, 183762306a36Sopenharmony_ci NR_MAX_MIGRATE_ASYNC_RETRY); 183862306a36Sopenharmony_ci stats->nr_succeeded += astats.nr_succeeded; 183962306a36Sopenharmony_ci stats->nr_thp_succeeded += astats.nr_thp_succeeded; 184062306a36Sopenharmony_ci stats->nr_thp_split += astats.nr_thp_split; 184162306a36Sopenharmony_ci if (rc < 0) { 184262306a36Sopenharmony_ci stats->nr_failed_pages += astats.nr_failed_pages; 184362306a36Sopenharmony_ci stats->nr_thp_failed += astats.nr_thp_failed; 184462306a36Sopenharmony_ci list_splice_tail(&folios, ret_folios); 184562306a36Sopenharmony_ci return rc; 184662306a36Sopenharmony_ci } 184762306a36Sopenharmony_ci stats->nr_thp_failed += astats.nr_thp_split; 184862306a36Sopenharmony_ci nr_failed += astats.nr_thp_split; 184962306a36Sopenharmony_ci /* 185062306a36Sopenharmony_ci * Fall back to migrate all failed folios one by one synchronously. All 185162306a36Sopenharmony_ci * failed folios except split THPs will be retried, so their failure 185262306a36Sopenharmony_ci * isn't counted 185362306a36Sopenharmony_ci */ 185462306a36Sopenharmony_ci list_splice_tail_init(&folios, from); 185562306a36Sopenharmony_ci while (!list_empty(from)) { 185662306a36Sopenharmony_ci list_move(from->next, &folios); 185762306a36Sopenharmony_ci rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, 185862306a36Sopenharmony_ci private, mode, reason, ret_folios, 185962306a36Sopenharmony_ci split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY); 186062306a36Sopenharmony_ci list_splice_tail_init(&folios, ret_folios); 186162306a36Sopenharmony_ci if (rc < 0) 186262306a36Sopenharmony_ci return rc; 186362306a36Sopenharmony_ci nr_failed += rc; 186462306a36Sopenharmony_ci } 186562306a36Sopenharmony_ci 186662306a36Sopenharmony_ci return nr_failed; 186762306a36Sopenharmony_ci} 186862306a36Sopenharmony_ci 186962306a36Sopenharmony_ci/* 187062306a36Sopenharmony_ci * migrate_pages - migrate the folios specified in a list, to the free folios 187162306a36Sopenharmony_ci * supplied as the target for the page migration 187262306a36Sopenharmony_ci * 187362306a36Sopenharmony_ci * @from: The list of folios to be migrated. 187462306a36Sopenharmony_ci * @get_new_folio: The function used to allocate free folios to be used 187562306a36Sopenharmony_ci * as the target of the folio migration. 187662306a36Sopenharmony_ci * @put_new_folio: The function used to free target folios if migration 187762306a36Sopenharmony_ci * fails, or NULL if no special handling is necessary. 187862306a36Sopenharmony_ci * @private: Private data to be passed on to get_new_folio() 187962306a36Sopenharmony_ci * @mode: The migration mode that specifies the constraints for 188062306a36Sopenharmony_ci * folio migration, if any. 188162306a36Sopenharmony_ci * @reason: The reason for folio migration. 188262306a36Sopenharmony_ci * @ret_succeeded: Set to the number of folios migrated successfully if 188362306a36Sopenharmony_ci * the caller passes a non-NULL pointer. 188462306a36Sopenharmony_ci * 188562306a36Sopenharmony_ci * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios 188662306a36Sopenharmony_ci * are movable any more because the list has become empty or no retryable folios 188762306a36Sopenharmony_ci * exist any more. It is caller's responsibility to call putback_movable_pages() 188862306a36Sopenharmony_ci * only if ret != 0. 188962306a36Sopenharmony_ci * 189062306a36Sopenharmony_ci * Returns the number of {normal folio, large folio, hugetlb} that were not 189162306a36Sopenharmony_ci * migrated, or an error code. The number of large folio splits will be 189262306a36Sopenharmony_ci * considered as the number of non-migrated large folio, no matter how many 189362306a36Sopenharmony_ci * split folios of the large folio are migrated successfully. 189462306a36Sopenharmony_ci */ 189562306a36Sopenharmony_ciint migrate_pages(struct list_head *from, new_folio_t get_new_folio, 189662306a36Sopenharmony_ci free_folio_t put_new_folio, unsigned long private, 189762306a36Sopenharmony_ci enum migrate_mode mode, int reason, unsigned int *ret_succeeded) 189862306a36Sopenharmony_ci{ 189962306a36Sopenharmony_ci int rc, rc_gather; 190062306a36Sopenharmony_ci int nr_pages; 190162306a36Sopenharmony_ci struct folio *folio, *folio2; 190262306a36Sopenharmony_ci LIST_HEAD(folios); 190362306a36Sopenharmony_ci LIST_HEAD(ret_folios); 190462306a36Sopenharmony_ci LIST_HEAD(split_folios); 190562306a36Sopenharmony_ci struct migrate_pages_stats stats; 190662306a36Sopenharmony_ci 190762306a36Sopenharmony_ci trace_mm_migrate_pages_start(mode, reason); 190862306a36Sopenharmony_ci 190962306a36Sopenharmony_ci memset(&stats, 0, sizeof(stats)); 191062306a36Sopenharmony_ci 191162306a36Sopenharmony_ci rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private, 191262306a36Sopenharmony_ci mode, reason, &stats, &ret_folios); 191362306a36Sopenharmony_ci if (rc_gather < 0) 191462306a36Sopenharmony_ci goto out; 191562306a36Sopenharmony_ci 191662306a36Sopenharmony_ciagain: 191762306a36Sopenharmony_ci nr_pages = 0; 191862306a36Sopenharmony_ci list_for_each_entry_safe(folio, folio2, from, lru) { 191962306a36Sopenharmony_ci /* Retried hugetlb folios will be kept in list */ 192062306a36Sopenharmony_ci if (folio_test_hugetlb(folio)) { 192162306a36Sopenharmony_ci list_move_tail(&folio->lru, &ret_folios); 192262306a36Sopenharmony_ci continue; 192362306a36Sopenharmony_ci } 192462306a36Sopenharmony_ci 192562306a36Sopenharmony_ci nr_pages += folio_nr_pages(folio); 192662306a36Sopenharmony_ci if (nr_pages >= NR_MAX_BATCHED_MIGRATION) 192762306a36Sopenharmony_ci break; 192862306a36Sopenharmony_ci } 192962306a36Sopenharmony_ci if (nr_pages >= NR_MAX_BATCHED_MIGRATION) 193062306a36Sopenharmony_ci list_cut_before(&folios, from, &folio2->lru); 193162306a36Sopenharmony_ci else 193262306a36Sopenharmony_ci list_splice_init(from, &folios); 193362306a36Sopenharmony_ci if (mode == MIGRATE_ASYNC) 193462306a36Sopenharmony_ci rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio, 193562306a36Sopenharmony_ci private, mode, reason, &ret_folios, 193662306a36Sopenharmony_ci &split_folios, &stats, 193762306a36Sopenharmony_ci NR_MAX_MIGRATE_PAGES_RETRY); 193862306a36Sopenharmony_ci else 193962306a36Sopenharmony_ci rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio, 194062306a36Sopenharmony_ci private, mode, reason, &ret_folios, 194162306a36Sopenharmony_ci &split_folios, &stats); 194262306a36Sopenharmony_ci list_splice_tail_init(&folios, &ret_folios); 194362306a36Sopenharmony_ci if (rc < 0) { 194462306a36Sopenharmony_ci rc_gather = rc; 194562306a36Sopenharmony_ci list_splice_tail(&split_folios, &ret_folios); 194662306a36Sopenharmony_ci goto out; 194762306a36Sopenharmony_ci } 194862306a36Sopenharmony_ci if (!list_empty(&split_folios)) { 194962306a36Sopenharmony_ci /* 195062306a36Sopenharmony_ci * Failure isn't counted since all split folios of a large folio 195162306a36Sopenharmony_ci * is counted as 1 failure already. And, we only try to migrate 195262306a36Sopenharmony_ci * with minimal effort, force MIGRATE_ASYNC mode and retry once. 195362306a36Sopenharmony_ci */ 195462306a36Sopenharmony_ci migrate_pages_batch(&split_folios, get_new_folio, 195562306a36Sopenharmony_ci put_new_folio, private, MIGRATE_ASYNC, reason, 195662306a36Sopenharmony_ci &ret_folios, NULL, &stats, 1); 195762306a36Sopenharmony_ci list_splice_tail_init(&split_folios, &ret_folios); 195862306a36Sopenharmony_ci } 195962306a36Sopenharmony_ci rc_gather += rc; 196062306a36Sopenharmony_ci if (!list_empty(from)) 196162306a36Sopenharmony_ci goto again; 196262306a36Sopenharmony_ciout: 196362306a36Sopenharmony_ci /* 196462306a36Sopenharmony_ci * Put the permanent failure folio back to migration list, they 196562306a36Sopenharmony_ci * will be put back to the right list by the caller. 196662306a36Sopenharmony_ci */ 196762306a36Sopenharmony_ci list_splice(&ret_folios, from); 196862306a36Sopenharmony_ci 196962306a36Sopenharmony_ci /* 197062306a36Sopenharmony_ci * Return 0 in case all split folios of fail-to-migrate large folios 197162306a36Sopenharmony_ci * are migrated successfully. 197262306a36Sopenharmony_ci */ 197362306a36Sopenharmony_ci if (list_empty(from)) 197462306a36Sopenharmony_ci rc_gather = 0; 197562306a36Sopenharmony_ci 197662306a36Sopenharmony_ci count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); 197762306a36Sopenharmony_ci count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); 197862306a36Sopenharmony_ci count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); 197962306a36Sopenharmony_ci count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); 198062306a36Sopenharmony_ci count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); 198162306a36Sopenharmony_ci trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, 198262306a36Sopenharmony_ci stats.nr_thp_succeeded, stats.nr_thp_failed, 198362306a36Sopenharmony_ci stats.nr_thp_split, mode, reason); 198462306a36Sopenharmony_ci 198562306a36Sopenharmony_ci if (ret_succeeded) 198662306a36Sopenharmony_ci *ret_succeeded = stats.nr_succeeded; 198762306a36Sopenharmony_ci 198862306a36Sopenharmony_ci return rc_gather; 198962306a36Sopenharmony_ci} 199062306a36Sopenharmony_ci 199162306a36Sopenharmony_cistruct folio *alloc_migration_target(struct folio *src, unsigned long private) 199262306a36Sopenharmony_ci{ 199362306a36Sopenharmony_ci struct migration_target_control *mtc; 199462306a36Sopenharmony_ci gfp_t gfp_mask; 199562306a36Sopenharmony_ci unsigned int order = 0; 199662306a36Sopenharmony_ci int nid; 199762306a36Sopenharmony_ci int zidx; 199862306a36Sopenharmony_ci 199962306a36Sopenharmony_ci mtc = (struct migration_target_control *)private; 200062306a36Sopenharmony_ci gfp_mask = mtc->gfp_mask; 200162306a36Sopenharmony_ci nid = mtc->nid; 200262306a36Sopenharmony_ci if (nid == NUMA_NO_NODE) 200362306a36Sopenharmony_ci nid = folio_nid(src); 200462306a36Sopenharmony_ci 200562306a36Sopenharmony_ci if (folio_test_hugetlb(src)) { 200662306a36Sopenharmony_ci struct hstate *h = folio_hstate(src); 200762306a36Sopenharmony_ci 200862306a36Sopenharmony_ci gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); 200962306a36Sopenharmony_ci return alloc_hugetlb_folio_nodemask(h, nid, 201062306a36Sopenharmony_ci mtc->nmask, gfp_mask); 201162306a36Sopenharmony_ci } 201262306a36Sopenharmony_ci 201362306a36Sopenharmony_ci if (folio_test_large(src)) { 201462306a36Sopenharmony_ci /* 201562306a36Sopenharmony_ci * clear __GFP_RECLAIM to make the migration callback 201662306a36Sopenharmony_ci * consistent with regular THP allocations. 201762306a36Sopenharmony_ci */ 201862306a36Sopenharmony_ci gfp_mask &= ~__GFP_RECLAIM; 201962306a36Sopenharmony_ci gfp_mask |= GFP_TRANSHUGE; 202062306a36Sopenharmony_ci order = folio_order(src); 202162306a36Sopenharmony_ci } 202262306a36Sopenharmony_ci zidx = zone_idx(folio_zone(src)); 202362306a36Sopenharmony_ci if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE) 202462306a36Sopenharmony_ci gfp_mask |= __GFP_HIGHMEM; 202562306a36Sopenharmony_ci 202662306a36Sopenharmony_ci return __folio_alloc(gfp_mask, order, nid, mtc->nmask); 202762306a36Sopenharmony_ci} 202862306a36Sopenharmony_ci 202962306a36Sopenharmony_ci#ifdef CONFIG_NUMA 203062306a36Sopenharmony_ci 203162306a36Sopenharmony_cistatic int store_status(int __user *status, int start, int value, int nr) 203262306a36Sopenharmony_ci{ 203362306a36Sopenharmony_ci while (nr-- > 0) { 203462306a36Sopenharmony_ci if (put_user(value, status + start)) 203562306a36Sopenharmony_ci return -EFAULT; 203662306a36Sopenharmony_ci start++; 203762306a36Sopenharmony_ci } 203862306a36Sopenharmony_ci 203962306a36Sopenharmony_ci return 0; 204062306a36Sopenharmony_ci} 204162306a36Sopenharmony_ci 204262306a36Sopenharmony_cistatic int do_move_pages_to_node(struct mm_struct *mm, 204362306a36Sopenharmony_ci struct list_head *pagelist, int node) 204462306a36Sopenharmony_ci{ 204562306a36Sopenharmony_ci int err; 204662306a36Sopenharmony_ci struct migration_target_control mtc = { 204762306a36Sopenharmony_ci .nid = node, 204862306a36Sopenharmony_ci .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 204962306a36Sopenharmony_ci }; 205062306a36Sopenharmony_ci 205162306a36Sopenharmony_ci err = migrate_pages(pagelist, alloc_migration_target, NULL, 205262306a36Sopenharmony_ci (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); 205362306a36Sopenharmony_ci if (err) 205462306a36Sopenharmony_ci putback_movable_pages(pagelist); 205562306a36Sopenharmony_ci return err; 205662306a36Sopenharmony_ci} 205762306a36Sopenharmony_ci 205862306a36Sopenharmony_ci/* 205962306a36Sopenharmony_ci * Resolves the given address to a struct page, isolates it from the LRU and 206062306a36Sopenharmony_ci * puts it to the given pagelist. 206162306a36Sopenharmony_ci * Returns: 206262306a36Sopenharmony_ci * errno - if the page cannot be found/isolated 206362306a36Sopenharmony_ci * 0 - when it doesn't have to be migrated because it is already on the 206462306a36Sopenharmony_ci * target node 206562306a36Sopenharmony_ci * 1 - when it has been queued 206662306a36Sopenharmony_ci */ 206762306a36Sopenharmony_cistatic int add_page_for_migration(struct mm_struct *mm, const void __user *p, 206862306a36Sopenharmony_ci int node, struct list_head *pagelist, bool migrate_all) 206962306a36Sopenharmony_ci{ 207062306a36Sopenharmony_ci struct vm_area_struct *vma; 207162306a36Sopenharmony_ci unsigned long addr; 207262306a36Sopenharmony_ci struct page *page; 207362306a36Sopenharmony_ci int err; 207462306a36Sopenharmony_ci bool isolated; 207562306a36Sopenharmony_ci 207662306a36Sopenharmony_ci mmap_read_lock(mm); 207762306a36Sopenharmony_ci addr = (unsigned long)untagged_addr_remote(mm, p); 207862306a36Sopenharmony_ci 207962306a36Sopenharmony_ci err = -EFAULT; 208062306a36Sopenharmony_ci vma = vma_lookup(mm, addr); 208162306a36Sopenharmony_ci if (!vma || !vma_migratable(vma)) 208262306a36Sopenharmony_ci goto out; 208362306a36Sopenharmony_ci 208462306a36Sopenharmony_ci /* FOLL_DUMP to ignore special (like zero) pages */ 208562306a36Sopenharmony_ci page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 208662306a36Sopenharmony_ci 208762306a36Sopenharmony_ci err = PTR_ERR(page); 208862306a36Sopenharmony_ci if (IS_ERR(page)) 208962306a36Sopenharmony_ci goto out; 209062306a36Sopenharmony_ci 209162306a36Sopenharmony_ci err = -ENOENT; 209262306a36Sopenharmony_ci if (!page) 209362306a36Sopenharmony_ci goto out; 209462306a36Sopenharmony_ci 209562306a36Sopenharmony_ci if (is_zone_device_page(page)) 209662306a36Sopenharmony_ci goto out_putpage; 209762306a36Sopenharmony_ci 209862306a36Sopenharmony_ci err = 0; 209962306a36Sopenharmony_ci if (page_to_nid(page) == node) 210062306a36Sopenharmony_ci goto out_putpage; 210162306a36Sopenharmony_ci 210262306a36Sopenharmony_ci err = -EACCES; 210362306a36Sopenharmony_ci if (page_mapcount(page) > 1 && !migrate_all) 210462306a36Sopenharmony_ci goto out_putpage; 210562306a36Sopenharmony_ci 210662306a36Sopenharmony_ci if (PageHuge(page)) { 210762306a36Sopenharmony_ci if (PageHead(page)) { 210862306a36Sopenharmony_ci isolated = isolate_hugetlb(page_folio(page), pagelist); 210962306a36Sopenharmony_ci err = isolated ? 1 : -EBUSY; 211062306a36Sopenharmony_ci } 211162306a36Sopenharmony_ci } else { 211262306a36Sopenharmony_ci struct page *head; 211362306a36Sopenharmony_ci 211462306a36Sopenharmony_ci head = compound_head(page); 211562306a36Sopenharmony_ci isolated = isolate_lru_page(head); 211662306a36Sopenharmony_ci if (!isolated) { 211762306a36Sopenharmony_ci err = -EBUSY; 211862306a36Sopenharmony_ci goto out_putpage; 211962306a36Sopenharmony_ci } 212062306a36Sopenharmony_ci 212162306a36Sopenharmony_ci err = 1; 212262306a36Sopenharmony_ci list_add_tail(&head->lru, pagelist); 212362306a36Sopenharmony_ci mod_node_page_state(page_pgdat(head), 212462306a36Sopenharmony_ci NR_ISOLATED_ANON + page_is_file_lru(head), 212562306a36Sopenharmony_ci thp_nr_pages(head)); 212662306a36Sopenharmony_ci } 212762306a36Sopenharmony_ciout_putpage: 212862306a36Sopenharmony_ci /* 212962306a36Sopenharmony_ci * Either remove the duplicate refcount from 213062306a36Sopenharmony_ci * isolate_lru_page() or drop the page ref if it was 213162306a36Sopenharmony_ci * not isolated. 213262306a36Sopenharmony_ci */ 213362306a36Sopenharmony_ci put_page(page); 213462306a36Sopenharmony_ciout: 213562306a36Sopenharmony_ci mmap_read_unlock(mm); 213662306a36Sopenharmony_ci return err; 213762306a36Sopenharmony_ci} 213862306a36Sopenharmony_ci 213962306a36Sopenharmony_cistatic int move_pages_and_store_status(struct mm_struct *mm, int node, 214062306a36Sopenharmony_ci struct list_head *pagelist, int __user *status, 214162306a36Sopenharmony_ci int start, int i, unsigned long nr_pages) 214262306a36Sopenharmony_ci{ 214362306a36Sopenharmony_ci int err; 214462306a36Sopenharmony_ci 214562306a36Sopenharmony_ci if (list_empty(pagelist)) 214662306a36Sopenharmony_ci return 0; 214762306a36Sopenharmony_ci 214862306a36Sopenharmony_ci err = do_move_pages_to_node(mm, pagelist, node); 214962306a36Sopenharmony_ci if (err) { 215062306a36Sopenharmony_ci /* 215162306a36Sopenharmony_ci * Positive err means the number of failed 215262306a36Sopenharmony_ci * pages to migrate. Since we are going to 215362306a36Sopenharmony_ci * abort and return the number of non-migrated 215462306a36Sopenharmony_ci * pages, so need to include the rest of the 215562306a36Sopenharmony_ci * nr_pages that have not been attempted as 215662306a36Sopenharmony_ci * well. 215762306a36Sopenharmony_ci */ 215862306a36Sopenharmony_ci if (err > 0) 215962306a36Sopenharmony_ci err += nr_pages - i; 216062306a36Sopenharmony_ci return err; 216162306a36Sopenharmony_ci } 216262306a36Sopenharmony_ci return store_status(status, start, node, i - start); 216362306a36Sopenharmony_ci} 216462306a36Sopenharmony_ci 216562306a36Sopenharmony_ci/* 216662306a36Sopenharmony_ci * Migrate an array of page address onto an array of nodes and fill 216762306a36Sopenharmony_ci * the corresponding array of status. 216862306a36Sopenharmony_ci */ 216962306a36Sopenharmony_cistatic int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, 217062306a36Sopenharmony_ci unsigned long nr_pages, 217162306a36Sopenharmony_ci const void __user * __user *pages, 217262306a36Sopenharmony_ci const int __user *nodes, 217362306a36Sopenharmony_ci int __user *status, int flags) 217462306a36Sopenharmony_ci{ 217562306a36Sopenharmony_ci compat_uptr_t __user *compat_pages = (void __user *)pages; 217662306a36Sopenharmony_ci int current_node = NUMA_NO_NODE; 217762306a36Sopenharmony_ci LIST_HEAD(pagelist); 217862306a36Sopenharmony_ci int start, i; 217962306a36Sopenharmony_ci int err = 0, err1; 218062306a36Sopenharmony_ci 218162306a36Sopenharmony_ci lru_cache_disable(); 218262306a36Sopenharmony_ci 218362306a36Sopenharmony_ci for (i = start = 0; i < nr_pages; i++) { 218462306a36Sopenharmony_ci const void __user *p; 218562306a36Sopenharmony_ci int node; 218662306a36Sopenharmony_ci 218762306a36Sopenharmony_ci err = -EFAULT; 218862306a36Sopenharmony_ci if (in_compat_syscall()) { 218962306a36Sopenharmony_ci compat_uptr_t cp; 219062306a36Sopenharmony_ci 219162306a36Sopenharmony_ci if (get_user(cp, compat_pages + i)) 219262306a36Sopenharmony_ci goto out_flush; 219362306a36Sopenharmony_ci 219462306a36Sopenharmony_ci p = compat_ptr(cp); 219562306a36Sopenharmony_ci } else { 219662306a36Sopenharmony_ci if (get_user(p, pages + i)) 219762306a36Sopenharmony_ci goto out_flush; 219862306a36Sopenharmony_ci } 219962306a36Sopenharmony_ci if (get_user(node, nodes + i)) 220062306a36Sopenharmony_ci goto out_flush; 220162306a36Sopenharmony_ci 220262306a36Sopenharmony_ci err = -ENODEV; 220362306a36Sopenharmony_ci if (node < 0 || node >= MAX_NUMNODES) 220462306a36Sopenharmony_ci goto out_flush; 220562306a36Sopenharmony_ci if (!node_state(node, N_MEMORY)) 220662306a36Sopenharmony_ci goto out_flush; 220762306a36Sopenharmony_ci 220862306a36Sopenharmony_ci err = -EACCES; 220962306a36Sopenharmony_ci if (!node_isset(node, task_nodes)) 221062306a36Sopenharmony_ci goto out_flush; 221162306a36Sopenharmony_ci 221262306a36Sopenharmony_ci if (current_node == NUMA_NO_NODE) { 221362306a36Sopenharmony_ci current_node = node; 221462306a36Sopenharmony_ci start = i; 221562306a36Sopenharmony_ci } else if (node != current_node) { 221662306a36Sopenharmony_ci err = move_pages_and_store_status(mm, current_node, 221762306a36Sopenharmony_ci &pagelist, status, start, i, nr_pages); 221862306a36Sopenharmony_ci if (err) 221962306a36Sopenharmony_ci goto out; 222062306a36Sopenharmony_ci start = i; 222162306a36Sopenharmony_ci current_node = node; 222262306a36Sopenharmony_ci } 222362306a36Sopenharmony_ci 222462306a36Sopenharmony_ci /* 222562306a36Sopenharmony_ci * Errors in the page lookup or isolation are not fatal and we simply 222662306a36Sopenharmony_ci * report them via status 222762306a36Sopenharmony_ci */ 222862306a36Sopenharmony_ci err = add_page_for_migration(mm, p, current_node, &pagelist, 222962306a36Sopenharmony_ci flags & MPOL_MF_MOVE_ALL); 223062306a36Sopenharmony_ci 223162306a36Sopenharmony_ci if (err > 0) { 223262306a36Sopenharmony_ci /* The page is successfully queued for migration */ 223362306a36Sopenharmony_ci continue; 223462306a36Sopenharmony_ci } 223562306a36Sopenharmony_ci 223662306a36Sopenharmony_ci /* 223762306a36Sopenharmony_ci * The move_pages() man page does not have an -EEXIST choice, so 223862306a36Sopenharmony_ci * use -EFAULT instead. 223962306a36Sopenharmony_ci */ 224062306a36Sopenharmony_ci if (err == -EEXIST) 224162306a36Sopenharmony_ci err = -EFAULT; 224262306a36Sopenharmony_ci 224362306a36Sopenharmony_ci /* 224462306a36Sopenharmony_ci * If the page is already on the target node (!err), store the 224562306a36Sopenharmony_ci * node, otherwise, store the err. 224662306a36Sopenharmony_ci */ 224762306a36Sopenharmony_ci err = store_status(status, i, err ? : current_node, 1); 224862306a36Sopenharmony_ci if (err) 224962306a36Sopenharmony_ci goto out_flush; 225062306a36Sopenharmony_ci 225162306a36Sopenharmony_ci err = move_pages_and_store_status(mm, current_node, &pagelist, 225262306a36Sopenharmony_ci status, start, i, nr_pages); 225362306a36Sopenharmony_ci if (err) { 225462306a36Sopenharmony_ci /* We have accounted for page i */ 225562306a36Sopenharmony_ci if (err > 0) 225662306a36Sopenharmony_ci err--; 225762306a36Sopenharmony_ci goto out; 225862306a36Sopenharmony_ci } 225962306a36Sopenharmony_ci current_node = NUMA_NO_NODE; 226062306a36Sopenharmony_ci } 226162306a36Sopenharmony_ciout_flush: 226262306a36Sopenharmony_ci /* Make sure we do not overwrite the existing error */ 226362306a36Sopenharmony_ci err1 = move_pages_and_store_status(mm, current_node, &pagelist, 226462306a36Sopenharmony_ci status, start, i, nr_pages); 226562306a36Sopenharmony_ci if (err >= 0) 226662306a36Sopenharmony_ci err = err1; 226762306a36Sopenharmony_ciout: 226862306a36Sopenharmony_ci lru_cache_enable(); 226962306a36Sopenharmony_ci return err; 227062306a36Sopenharmony_ci} 227162306a36Sopenharmony_ci 227262306a36Sopenharmony_ci/* 227362306a36Sopenharmony_ci * Determine the nodes of an array of pages and store it in an array of status. 227462306a36Sopenharmony_ci */ 227562306a36Sopenharmony_cistatic void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, 227662306a36Sopenharmony_ci const void __user **pages, int *status) 227762306a36Sopenharmony_ci{ 227862306a36Sopenharmony_ci unsigned long i; 227962306a36Sopenharmony_ci 228062306a36Sopenharmony_ci mmap_read_lock(mm); 228162306a36Sopenharmony_ci 228262306a36Sopenharmony_ci for (i = 0; i < nr_pages; i++) { 228362306a36Sopenharmony_ci unsigned long addr = (unsigned long)(*pages); 228462306a36Sopenharmony_ci struct vm_area_struct *vma; 228562306a36Sopenharmony_ci struct page *page; 228662306a36Sopenharmony_ci int err = -EFAULT; 228762306a36Sopenharmony_ci 228862306a36Sopenharmony_ci vma = vma_lookup(mm, addr); 228962306a36Sopenharmony_ci if (!vma) 229062306a36Sopenharmony_ci goto set_status; 229162306a36Sopenharmony_ci 229262306a36Sopenharmony_ci /* FOLL_DUMP to ignore special (like zero) pages */ 229362306a36Sopenharmony_ci page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 229462306a36Sopenharmony_ci 229562306a36Sopenharmony_ci err = PTR_ERR(page); 229662306a36Sopenharmony_ci if (IS_ERR(page)) 229762306a36Sopenharmony_ci goto set_status; 229862306a36Sopenharmony_ci 229962306a36Sopenharmony_ci err = -ENOENT; 230062306a36Sopenharmony_ci if (!page) 230162306a36Sopenharmony_ci goto set_status; 230262306a36Sopenharmony_ci 230362306a36Sopenharmony_ci if (!is_zone_device_page(page)) 230462306a36Sopenharmony_ci err = page_to_nid(page); 230562306a36Sopenharmony_ci 230662306a36Sopenharmony_ci put_page(page); 230762306a36Sopenharmony_ciset_status: 230862306a36Sopenharmony_ci *status = err; 230962306a36Sopenharmony_ci 231062306a36Sopenharmony_ci pages++; 231162306a36Sopenharmony_ci status++; 231262306a36Sopenharmony_ci } 231362306a36Sopenharmony_ci 231462306a36Sopenharmony_ci mmap_read_unlock(mm); 231562306a36Sopenharmony_ci} 231662306a36Sopenharmony_ci 231762306a36Sopenharmony_cistatic int get_compat_pages_array(const void __user *chunk_pages[], 231862306a36Sopenharmony_ci const void __user * __user *pages, 231962306a36Sopenharmony_ci unsigned long chunk_nr) 232062306a36Sopenharmony_ci{ 232162306a36Sopenharmony_ci compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages; 232262306a36Sopenharmony_ci compat_uptr_t p; 232362306a36Sopenharmony_ci int i; 232462306a36Sopenharmony_ci 232562306a36Sopenharmony_ci for (i = 0; i < chunk_nr; i++) { 232662306a36Sopenharmony_ci if (get_user(p, pages32 + i)) 232762306a36Sopenharmony_ci return -EFAULT; 232862306a36Sopenharmony_ci chunk_pages[i] = compat_ptr(p); 232962306a36Sopenharmony_ci } 233062306a36Sopenharmony_ci 233162306a36Sopenharmony_ci return 0; 233262306a36Sopenharmony_ci} 233362306a36Sopenharmony_ci 233462306a36Sopenharmony_ci/* 233562306a36Sopenharmony_ci * Determine the nodes of a user array of pages and store it in 233662306a36Sopenharmony_ci * a user array of status. 233762306a36Sopenharmony_ci */ 233862306a36Sopenharmony_cistatic int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, 233962306a36Sopenharmony_ci const void __user * __user *pages, 234062306a36Sopenharmony_ci int __user *status) 234162306a36Sopenharmony_ci{ 234262306a36Sopenharmony_ci#define DO_PAGES_STAT_CHUNK_NR 16UL 234362306a36Sopenharmony_ci const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; 234462306a36Sopenharmony_ci int chunk_status[DO_PAGES_STAT_CHUNK_NR]; 234562306a36Sopenharmony_ci 234662306a36Sopenharmony_ci while (nr_pages) { 234762306a36Sopenharmony_ci unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR); 234862306a36Sopenharmony_ci 234962306a36Sopenharmony_ci if (in_compat_syscall()) { 235062306a36Sopenharmony_ci if (get_compat_pages_array(chunk_pages, pages, 235162306a36Sopenharmony_ci chunk_nr)) 235262306a36Sopenharmony_ci break; 235362306a36Sopenharmony_ci } else { 235462306a36Sopenharmony_ci if (copy_from_user(chunk_pages, pages, 235562306a36Sopenharmony_ci chunk_nr * sizeof(*chunk_pages))) 235662306a36Sopenharmony_ci break; 235762306a36Sopenharmony_ci } 235862306a36Sopenharmony_ci 235962306a36Sopenharmony_ci do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); 236062306a36Sopenharmony_ci 236162306a36Sopenharmony_ci if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) 236262306a36Sopenharmony_ci break; 236362306a36Sopenharmony_ci 236462306a36Sopenharmony_ci pages += chunk_nr; 236562306a36Sopenharmony_ci status += chunk_nr; 236662306a36Sopenharmony_ci nr_pages -= chunk_nr; 236762306a36Sopenharmony_ci } 236862306a36Sopenharmony_ci return nr_pages ? -EFAULT : 0; 236962306a36Sopenharmony_ci} 237062306a36Sopenharmony_ci 237162306a36Sopenharmony_cistatic struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes) 237262306a36Sopenharmony_ci{ 237362306a36Sopenharmony_ci struct task_struct *task; 237462306a36Sopenharmony_ci struct mm_struct *mm; 237562306a36Sopenharmony_ci 237662306a36Sopenharmony_ci /* 237762306a36Sopenharmony_ci * There is no need to check if current process has the right to modify 237862306a36Sopenharmony_ci * the specified process when they are same. 237962306a36Sopenharmony_ci */ 238062306a36Sopenharmony_ci if (!pid) { 238162306a36Sopenharmony_ci mmget(current->mm); 238262306a36Sopenharmony_ci *mem_nodes = cpuset_mems_allowed(current); 238362306a36Sopenharmony_ci return current->mm; 238462306a36Sopenharmony_ci } 238562306a36Sopenharmony_ci 238662306a36Sopenharmony_ci /* Find the mm_struct */ 238762306a36Sopenharmony_ci rcu_read_lock(); 238862306a36Sopenharmony_ci task = find_task_by_vpid(pid); 238962306a36Sopenharmony_ci if (!task) { 239062306a36Sopenharmony_ci rcu_read_unlock(); 239162306a36Sopenharmony_ci return ERR_PTR(-ESRCH); 239262306a36Sopenharmony_ci } 239362306a36Sopenharmony_ci get_task_struct(task); 239462306a36Sopenharmony_ci 239562306a36Sopenharmony_ci /* 239662306a36Sopenharmony_ci * Check if this process has the right to modify the specified 239762306a36Sopenharmony_ci * process. Use the regular "ptrace_may_access()" checks. 239862306a36Sopenharmony_ci */ 239962306a36Sopenharmony_ci if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { 240062306a36Sopenharmony_ci rcu_read_unlock(); 240162306a36Sopenharmony_ci mm = ERR_PTR(-EPERM); 240262306a36Sopenharmony_ci goto out; 240362306a36Sopenharmony_ci } 240462306a36Sopenharmony_ci rcu_read_unlock(); 240562306a36Sopenharmony_ci 240662306a36Sopenharmony_ci mm = ERR_PTR(security_task_movememory(task)); 240762306a36Sopenharmony_ci if (IS_ERR(mm)) 240862306a36Sopenharmony_ci goto out; 240962306a36Sopenharmony_ci *mem_nodes = cpuset_mems_allowed(task); 241062306a36Sopenharmony_ci mm = get_task_mm(task); 241162306a36Sopenharmony_ciout: 241262306a36Sopenharmony_ci put_task_struct(task); 241362306a36Sopenharmony_ci if (!mm) 241462306a36Sopenharmony_ci mm = ERR_PTR(-EINVAL); 241562306a36Sopenharmony_ci return mm; 241662306a36Sopenharmony_ci} 241762306a36Sopenharmony_ci 241862306a36Sopenharmony_ci/* 241962306a36Sopenharmony_ci * Move a list of pages in the address space of the currently executing 242062306a36Sopenharmony_ci * process. 242162306a36Sopenharmony_ci */ 242262306a36Sopenharmony_cistatic int kernel_move_pages(pid_t pid, unsigned long nr_pages, 242362306a36Sopenharmony_ci const void __user * __user *pages, 242462306a36Sopenharmony_ci const int __user *nodes, 242562306a36Sopenharmony_ci int __user *status, int flags) 242662306a36Sopenharmony_ci{ 242762306a36Sopenharmony_ci struct mm_struct *mm; 242862306a36Sopenharmony_ci int err; 242962306a36Sopenharmony_ci nodemask_t task_nodes; 243062306a36Sopenharmony_ci 243162306a36Sopenharmony_ci /* Check flags */ 243262306a36Sopenharmony_ci if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 243362306a36Sopenharmony_ci return -EINVAL; 243462306a36Sopenharmony_ci 243562306a36Sopenharmony_ci if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 243662306a36Sopenharmony_ci return -EPERM; 243762306a36Sopenharmony_ci 243862306a36Sopenharmony_ci mm = find_mm_struct(pid, &task_nodes); 243962306a36Sopenharmony_ci if (IS_ERR(mm)) 244062306a36Sopenharmony_ci return PTR_ERR(mm); 244162306a36Sopenharmony_ci 244262306a36Sopenharmony_ci if (nodes) 244362306a36Sopenharmony_ci err = do_pages_move(mm, task_nodes, nr_pages, pages, 244462306a36Sopenharmony_ci nodes, status, flags); 244562306a36Sopenharmony_ci else 244662306a36Sopenharmony_ci err = do_pages_stat(mm, nr_pages, pages, status); 244762306a36Sopenharmony_ci 244862306a36Sopenharmony_ci mmput(mm); 244962306a36Sopenharmony_ci return err; 245062306a36Sopenharmony_ci} 245162306a36Sopenharmony_ci 245262306a36Sopenharmony_ciSYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, 245362306a36Sopenharmony_ci const void __user * __user *, pages, 245462306a36Sopenharmony_ci const int __user *, nodes, 245562306a36Sopenharmony_ci int __user *, status, int, flags) 245662306a36Sopenharmony_ci{ 245762306a36Sopenharmony_ci return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags); 245862306a36Sopenharmony_ci} 245962306a36Sopenharmony_ci 246062306a36Sopenharmony_ci#ifdef CONFIG_NUMA_BALANCING 246162306a36Sopenharmony_ci/* 246262306a36Sopenharmony_ci * Returns true if this is a safe migration target node for misplaced NUMA 246362306a36Sopenharmony_ci * pages. Currently it only checks the watermarks which is crude. 246462306a36Sopenharmony_ci */ 246562306a36Sopenharmony_cistatic bool migrate_balanced_pgdat(struct pglist_data *pgdat, 246662306a36Sopenharmony_ci unsigned long nr_migrate_pages) 246762306a36Sopenharmony_ci{ 246862306a36Sopenharmony_ci int z; 246962306a36Sopenharmony_ci 247062306a36Sopenharmony_ci for (z = pgdat->nr_zones - 1; z >= 0; z--) { 247162306a36Sopenharmony_ci struct zone *zone = pgdat->node_zones + z; 247262306a36Sopenharmony_ci 247362306a36Sopenharmony_ci if (!managed_zone(zone)) 247462306a36Sopenharmony_ci continue; 247562306a36Sopenharmony_ci 247662306a36Sopenharmony_ci /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 247762306a36Sopenharmony_ci if (!zone_watermark_ok(zone, 0, 247862306a36Sopenharmony_ci high_wmark_pages(zone) + 247962306a36Sopenharmony_ci nr_migrate_pages, 248062306a36Sopenharmony_ci ZONE_MOVABLE, 0)) 248162306a36Sopenharmony_ci continue; 248262306a36Sopenharmony_ci return true; 248362306a36Sopenharmony_ci } 248462306a36Sopenharmony_ci return false; 248562306a36Sopenharmony_ci} 248662306a36Sopenharmony_ci 248762306a36Sopenharmony_cistatic struct folio *alloc_misplaced_dst_folio(struct folio *src, 248862306a36Sopenharmony_ci unsigned long data) 248962306a36Sopenharmony_ci{ 249062306a36Sopenharmony_ci int nid = (int) data; 249162306a36Sopenharmony_ci int order = folio_order(src); 249262306a36Sopenharmony_ci gfp_t gfp = __GFP_THISNODE; 249362306a36Sopenharmony_ci 249462306a36Sopenharmony_ci if (order > 0) 249562306a36Sopenharmony_ci gfp |= GFP_TRANSHUGE_LIGHT; 249662306a36Sopenharmony_ci else { 249762306a36Sopenharmony_ci gfp |= GFP_HIGHUSER_MOVABLE | __GFP_NOMEMALLOC | __GFP_NORETRY | 249862306a36Sopenharmony_ci __GFP_NOWARN; 249962306a36Sopenharmony_ci gfp &= ~__GFP_RECLAIM; 250062306a36Sopenharmony_ci } 250162306a36Sopenharmony_ci return __folio_alloc_node(gfp, order, nid); 250262306a36Sopenharmony_ci} 250362306a36Sopenharmony_ci 250462306a36Sopenharmony_cistatic int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 250562306a36Sopenharmony_ci{ 250662306a36Sopenharmony_ci int nr_pages = thp_nr_pages(page); 250762306a36Sopenharmony_ci int order = compound_order(page); 250862306a36Sopenharmony_ci 250962306a36Sopenharmony_ci VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); 251062306a36Sopenharmony_ci 251162306a36Sopenharmony_ci /* Do not migrate THP mapped by multiple processes */ 251262306a36Sopenharmony_ci if (PageTransHuge(page) && total_mapcount(page) > 1) 251362306a36Sopenharmony_ci return 0; 251462306a36Sopenharmony_ci 251562306a36Sopenharmony_ci /* Avoid migrating to a node that is nearly full */ 251662306a36Sopenharmony_ci if (!migrate_balanced_pgdat(pgdat, nr_pages)) { 251762306a36Sopenharmony_ci int z; 251862306a36Sopenharmony_ci 251962306a36Sopenharmony_ci if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)) 252062306a36Sopenharmony_ci return 0; 252162306a36Sopenharmony_ci for (z = pgdat->nr_zones - 1; z >= 0; z--) { 252262306a36Sopenharmony_ci if (managed_zone(pgdat->node_zones + z)) 252362306a36Sopenharmony_ci break; 252462306a36Sopenharmony_ci } 252562306a36Sopenharmony_ci 252662306a36Sopenharmony_ci /* 252762306a36Sopenharmony_ci * If there are no managed zones, it should not proceed 252862306a36Sopenharmony_ci * further. 252962306a36Sopenharmony_ci */ 253062306a36Sopenharmony_ci if (z < 0) 253162306a36Sopenharmony_ci return 0; 253262306a36Sopenharmony_ci 253362306a36Sopenharmony_ci wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); 253462306a36Sopenharmony_ci return 0; 253562306a36Sopenharmony_ci } 253662306a36Sopenharmony_ci 253762306a36Sopenharmony_ci if (!isolate_lru_page(page)) 253862306a36Sopenharmony_ci return 0; 253962306a36Sopenharmony_ci 254062306a36Sopenharmony_ci mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), 254162306a36Sopenharmony_ci nr_pages); 254262306a36Sopenharmony_ci 254362306a36Sopenharmony_ci /* 254462306a36Sopenharmony_ci * Isolating the page has taken another reference, so the 254562306a36Sopenharmony_ci * caller's reference can be safely dropped without the page 254662306a36Sopenharmony_ci * disappearing underneath us during migration. 254762306a36Sopenharmony_ci */ 254862306a36Sopenharmony_ci put_page(page); 254962306a36Sopenharmony_ci return 1; 255062306a36Sopenharmony_ci} 255162306a36Sopenharmony_ci 255262306a36Sopenharmony_ci/* 255362306a36Sopenharmony_ci * Attempt to migrate a misplaced page to the specified destination 255462306a36Sopenharmony_ci * node. Caller is expected to have an elevated reference count on 255562306a36Sopenharmony_ci * the page that will be dropped by this function before returning. 255662306a36Sopenharmony_ci */ 255762306a36Sopenharmony_ciint migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, 255862306a36Sopenharmony_ci int node) 255962306a36Sopenharmony_ci{ 256062306a36Sopenharmony_ci pg_data_t *pgdat = NODE_DATA(node); 256162306a36Sopenharmony_ci int isolated; 256262306a36Sopenharmony_ci int nr_remaining; 256362306a36Sopenharmony_ci unsigned int nr_succeeded; 256462306a36Sopenharmony_ci LIST_HEAD(migratepages); 256562306a36Sopenharmony_ci int nr_pages = thp_nr_pages(page); 256662306a36Sopenharmony_ci 256762306a36Sopenharmony_ci /* 256862306a36Sopenharmony_ci * Don't migrate file pages that are mapped in multiple processes 256962306a36Sopenharmony_ci * with execute permissions as they are probably shared libraries. 257062306a36Sopenharmony_ci */ 257162306a36Sopenharmony_ci if (page_mapcount(page) != 1 && page_is_file_lru(page) && 257262306a36Sopenharmony_ci (vma->vm_flags & VM_EXEC)) 257362306a36Sopenharmony_ci goto out; 257462306a36Sopenharmony_ci 257562306a36Sopenharmony_ci /* 257662306a36Sopenharmony_ci * Also do not migrate dirty pages as not all filesystems can move 257762306a36Sopenharmony_ci * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. 257862306a36Sopenharmony_ci */ 257962306a36Sopenharmony_ci if (page_is_file_lru(page) && PageDirty(page)) 258062306a36Sopenharmony_ci goto out; 258162306a36Sopenharmony_ci 258262306a36Sopenharmony_ci isolated = numamigrate_isolate_page(pgdat, page); 258362306a36Sopenharmony_ci if (!isolated) 258462306a36Sopenharmony_ci goto out; 258562306a36Sopenharmony_ci 258662306a36Sopenharmony_ci list_add(&page->lru, &migratepages); 258762306a36Sopenharmony_ci nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, 258862306a36Sopenharmony_ci NULL, node, MIGRATE_ASYNC, 258962306a36Sopenharmony_ci MR_NUMA_MISPLACED, &nr_succeeded); 259062306a36Sopenharmony_ci if (nr_remaining) { 259162306a36Sopenharmony_ci if (!list_empty(&migratepages)) { 259262306a36Sopenharmony_ci list_del(&page->lru); 259362306a36Sopenharmony_ci mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + 259462306a36Sopenharmony_ci page_is_file_lru(page), -nr_pages); 259562306a36Sopenharmony_ci putback_lru_page(page); 259662306a36Sopenharmony_ci } 259762306a36Sopenharmony_ci isolated = 0; 259862306a36Sopenharmony_ci } 259962306a36Sopenharmony_ci if (nr_succeeded) { 260062306a36Sopenharmony_ci count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); 260162306a36Sopenharmony_ci if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) 260262306a36Sopenharmony_ci mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, 260362306a36Sopenharmony_ci nr_succeeded); 260462306a36Sopenharmony_ci } 260562306a36Sopenharmony_ci BUG_ON(!list_empty(&migratepages)); 260662306a36Sopenharmony_ci return isolated; 260762306a36Sopenharmony_ci 260862306a36Sopenharmony_ciout: 260962306a36Sopenharmony_ci put_page(page); 261062306a36Sopenharmony_ci return 0; 261162306a36Sopenharmony_ci} 261262306a36Sopenharmony_ci#endif /* CONFIG_NUMA_BALANCING */ 261362306a36Sopenharmony_ci#endif /* CONFIG_NUMA */ 2614