162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * linux/mm/madvise.c 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright (C) 1999 Linus Torvalds 662306a36Sopenharmony_ci * Copyright (C) 2002 Christoph Hellwig 762306a36Sopenharmony_ci */ 862306a36Sopenharmony_ci 962306a36Sopenharmony_ci#include <linux/mman.h> 1062306a36Sopenharmony_ci#include <linux/pagemap.h> 1162306a36Sopenharmony_ci#include <linux/syscalls.h> 1262306a36Sopenharmony_ci#include <linux/mempolicy.h> 1362306a36Sopenharmony_ci#include <linux/page-isolation.h> 1462306a36Sopenharmony_ci#include <linux/page_idle.h> 1562306a36Sopenharmony_ci#include <linux/userfaultfd_k.h> 1662306a36Sopenharmony_ci#include <linux/hugetlb.h> 1762306a36Sopenharmony_ci#include <linux/falloc.h> 1862306a36Sopenharmony_ci#include <linux/fadvise.h> 1962306a36Sopenharmony_ci#include <linux/sched.h> 2062306a36Sopenharmony_ci#include <linux/sched/mm.h> 2162306a36Sopenharmony_ci#include <linux/mm_inline.h> 2262306a36Sopenharmony_ci#include <linux/string.h> 2362306a36Sopenharmony_ci#include <linux/uio.h> 2462306a36Sopenharmony_ci#include <linux/ksm.h> 2562306a36Sopenharmony_ci#include <linux/fs.h> 2662306a36Sopenharmony_ci#include <linux/file.h> 2762306a36Sopenharmony_ci#include <linux/blkdev.h> 2862306a36Sopenharmony_ci#include <linux/backing-dev.h> 2962306a36Sopenharmony_ci#include <linux/pagewalk.h> 3062306a36Sopenharmony_ci#include <linux/swap.h> 3162306a36Sopenharmony_ci#include <linux/swapops.h> 3262306a36Sopenharmony_ci#include <linux/shmem_fs.h> 3362306a36Sopenharmony_ci#include <linux/mmu_notifier.h> 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci#include <asm/tlb.h> 3662306a36Sopenharmony_ci 3762306a36Sopenharmony_ci#include "internal.h" 3862306a36Sopenharmony_ci#include "swap.h" 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_cistruct madvise_walk_private { 4162306a36Sopenharmony_ci struct mmu_gather *tlb; 4262306a36Sopenharmony_ci bool pageout; 4362306a36Sopenharmony_ci}; 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ci/* 4662306a36Sopenharmony_ci * Any behaviour which results in changes to the vma->vm_flags needs to 4762306a36Sopenharmony_ci * take mmap_lock for writing. Others, which simply traverse vmas, need 4862306a36Sopenharmony_ci * to only take it for reading. 4962306a36Sopenharmony_ci */ 5062306a36Sopenharmony_cistatic int madvise_need_mmap_write(int behavior) 5162306a36Sopenharmony_ci{ 5262306a36Sopenharmony_ci switch (behavior) { 5362306a36Sopenharmony_ci case MADV_REMOVE: 5462306a36Sopenharmony_ci case MADV_WILLNEED: 5562306a36Sopenharmony_ci case MADV_DONTNEED: 5662306a36Sopenharmony_ci case MADV_DONTNEED_LOCKED: 5762306a36Sopenharmony_ci case MADV_COLD: 5862306a36Sopenharmony_ci case MADV_PAGEOUT: 5962306a36Sopenharmony_ci case MADV_FREE: 6062306a36Sopenharmony_ci case MADV_POPULATE_READ: 6162306a36Sopenharmony_ci case MADV_POPULATE_WRITE: 6262306a36Sopenharmony_ci case MADV_COLLAPSE: 6362306a36Sopenharmony_ci return 0; 6462306a36Sopenharmony_ci default: 6562306a36Sopenharmony_ci /* be safe, default to 1. list exceptions explicitly */ 6662306a36Sopenharmony_ci return 1; 6762306a36Sopenharmony_ci } 6862306a36Sopenharmony_ci} 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_ci#ifdef CONFIG_ANON_VMA_NAME 7162306a36Sopenharmony_cistruct anon_vma_name *anon_vma_name_alloc(const char *name) 7262306a36Sopenharmony_ci{ 7362306a36Sopenharmony_ci struct anon_vma_name *anon_name; 7462306a36Sopenharmony_ci size_t count; 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_ci /* Add 1 for NUL terminator at the end of the anon_name->name */ 7762306a36Sopenharmony_ci count = strlen(name) + 1; 7862306a36Sopenharmony_ci anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); 7962306a36Sopenharmony_ci if (anon_name) { 8062306a36Sopenharmony_ci kref_init(&anon_name->kref); 8162306a36Sopenharmony_ci memcpy(anon_name->name, name, count); 8262306a36Sopenharmony_ci } 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci return anon_name; 8562306a36Sopenharmony_ci} 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_civoid anon_vma_name_free(struct kref *kref) 8862306a36Sopenharmony_ci{ 8962306a36Sopenharmony_ci struct anon_vma_name *anon_name = 9062306a36Sopenharmony_ci container_of(kref, struct anon_vma_name, kref); 9162306a36Sopenharmony_ci kfree(anon_name); 9262306a36Sopenharmony_ci} 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_cistruct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) 9562306a36Sopenharmony_ci{ 9662306a36Sopenharmony_ci mmap_assert_locked(vma->vm_mm); 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci return vma->anon_name; 9962306a36Sopenharmony_ci} 10062306a36Sopenharmony_ci 10162306a36Sopenharmony_ci/* mmap_lock should be write-locked */ 10262306a36Sopenharmony_cistatic int replace_anon_vma_name(struct vm_area_struct *vma, 10362306a36Sopenharmony_ci struct anon_vma_name *anon_name) 10462306a36Sopenharmony_ci{ 10562306a36Sopenharmony_ci struct anon_vma_name *orig_name = anon_vma_name(vma); 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci if (!anon_name) { 10862306a36Sopenharmony_ci vma->anon_name = NULL; 10962306a36Sopenharmony_ci anon_vma_name_put(orig_name); 11062306a36Sopenharmony_ci return 0; 11162306a36Sopenharmony_ci } 11262306a36Sopenharmony_ci 11362306a36Sopenharmony_ci if (anon_vma_name_eq(orig_name, anon_name)) 11462306a36Sopenharmony_ci return 0; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci vma->anon_name = anon_vma_name_reuse(anon_name); 11762306a36Sopenharmony_ci anon_vma_name_put(orig_name); 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci return 0; 12062306a36Sopenharmony_ci} 12162306a36Sopenharmony_ci#else /* CONFIG_ANON_VMA_NAME */ 12262306a36Sopenharmony_cistatic int replace_anon_vma_name(struct vm_area_struct *vma, 12362306a36Sopenharmony_ci struct anon_vma_name *anon_name) 12462306a36Sopenharmony_ci{ 12562306a36Sopenharmony_ci if (anon_name) 12662306a36Sopenharmony_ci return -EINVAL; 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci return 0; 12962306a36Sopenharmony_ci} 13062306a36Sopenharmony_ci#endif /* CONFIG_ANON_VMA_NAME */ 13162306a36Sopenharmony_ci/* 13262306a36Sopenharmony_ci * Update the vm_flags on region of a vma, splitting it or merging it as 13362306a36Sopenharmony_ci * necessary. Must be called with mmap_lock held for writing; 13462306a36Sopenharmony_ci * Caller should ensure anon_name stability by raising its refcount even when 13562306a36Sopenharmony_ci * anon_name belongs to a valid vma because this function might free that vma. 13662306a36Sopenharmony_ci */ 13762306a36Sopenharmony_cistatic int madvise_update_vma(struct vm_area_struct *vma, 13862306a36Sopenharmony_ci struct vm_area_struct **prev, unsigned long start, 13962306a36Sopenharmony_ci unsigned long end, unsigned long new_flags, 14062306a36Sopenharmony_ci struct anon_vma_name *anon_name) 14162306a36Sopenharmony_ci{ 14262306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 14362306a36Sopenharmony_ci int error; 14462306a36Sopenharmony_ci pgoff_t pgoff; 14562306a36Sopenharmony_ci VMA_ITERATOR(vmi, mm, start); 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { 14862306a36Sopenharmony_ci *prev = vma; 14962306a36Sopenharmony_ci return 0; 15062306a36Sopenharmony_ci } 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 15362306a36Sopenharmony_ci *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, 15462306a36Sopenharmony_ci vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), 15562306a36Sopenharmony_ci vma->vm_userfaultfd_ctx, anon_name); 15662306a36Sopenharmony_ci if (*prev) { 15762306a36Sopenharmony_ci vma = *prev; 15862306a36Sopenharmony_ci goto success; 15962306a36Sopenharmony_ci } 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci *prev = vma; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci if (start != vma->vm_start) { 16462306a36Sopenharmony_ci error = split_vma(&vmi, vma, start, 1); 16562306a36Sopenharmony_ci if (error) 16662306a36Sopenharmony_ci return error; 16762306a36Sopenharmony_ci } 16862306a36Sopenharmony_ci 16962306a36Sopenharmony_ci if (end != vma->vm_end) { 17062306a36Sopenharmony_ci error = split_vma(&vmi, vma, end, 0); 17162306a36Sopenharmony_ci if (error) 17262306a36Sopenharmony_ci return error; 17362306a36Sopenharmony_ci } 17462306a36Sopenharmony_ci 17562306a36Sopenharmony_cisuccess: 17662306a36Sopenharmony_ci /* vm_flags is protected by the mmap_lock held in write mode. */ 17762306a36Sopenharmony_ci vma_start_write(vma); 17862306a36Sopenharmony_ci vm_flags_reset(vma, new_flags); 17962306a36Sopenharmony_ci if (!vma->vm_file || vma_is_anon_shmem(vma)) { 18062306a36Sopenharmony_ci error = replace_anon_vma_name(vma, anon_name); 18162306a36Sopenharmony_ci if (error) 18262306a36Sopenharmony_ci return error; 18362306a36Sopenharmony_ci } 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci return 0; 18662306a36Sopenharmony_ci} 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci#ifdef CONFIG_SWAP 18962306a36Sopenharmony_cistatic int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, 19062306a36Sopenharmony_ci unsigned long end, struct mm_walk *walk) 19162306a36Sopenharmony_ci{ 19262306a36Sopenharmony_ci struct vm_area_struct *vma = walk->private; 19362306a36Sopenharmony_ci struct swap_iocb *splug = NULL; 19462306a36Sopenharmony_ci pte_t *ptep = NULL; 19562306a36Sopenharmony_ci spinlock_t *ptl; 19662306a36Sopenharmony_ci unsigned long addr; 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci for (addr = start; addr < end; addr += PAGE_SIZE) { 19962306a36Sopenharmony_ci pte_t pte; 20062306a36Sopenharmony_ci swp_entry_t entry; 20162306a36Sopenharmony_ci struct page *page; 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci if (!ptep++) { 20462306a36Sopenharmony_ci ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 20562306a36Sopenharmony_ci if (!ptep) 20662306a36Sopenharmony_ci break; 20762306a36Sopenharmony_ci } 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci pte = ptep_get(ptep); 21062306a36Sopenharmony_ci if (!is_swap_pte(pte)) 21162306a36Sopenharmony_ci continue; 21262306a36Sopenharmony_ci entry = pte_to_swp_entry(pte); 21362306a36Sopenharmony_ci if (unlikely(non_swap_entry(entry))) 21462306a36Sopenharmony_ci continue; 21562306a36Sopenharmony_ci 21662306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 21762306a36Sopenharmony_ci ptep = NULL; 21862306a36Sopenharmony_ci 21962306a36Sopenharmony_ci page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, 22062306a36Sopenharmony_ci vma, addr, &splug); 22162306a36Sopenharmony_ci if (page) 22262306a36Sopenharmony_ci put_page(page); 22362306a36Sopenharmony_ci } 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci if (ptep) 22662306a36Sopenharmony_ci pte_unmap_unlock(ptep, ptl); 22762306a36Sopenharmony_ci swap_read_unplug(splug); 22862306a36Sopenharmony_ci cond_resched(); 22962306a36Sopenharmony_ci 23062306a36Sopenharmony_ci return 0; 23162306a36Sopenharmony_ci} 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_cistatic const struct mm_walk_ops swapin_walk_ops = { 23462306a36Sopenharmony_ci .pmd_entry = swapin_walk_pmd_entry, 23562306a36Sopenharmony_ci .walk_lock = PGWALK_RDLOCK, 23662306a36Sopenharmony_ci}; 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_cistatic void shmem_swapin_range(struct vm_area_struct *vma, 23962306a36Sopenharmony_ci unsigned long start, unsigned long end, 24062306a36Sopenharmony_ci struct address_space *mapping) 24162306a36Sopenharmony_ci{ 24262306a36Sopenharmony_ci XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); 24362306a36Sopenharmony_ci pgoff_t end_index = linear_page_index(vma, end) - 1; 24462306a36Sopenharmony_ci struct page *page; 24562306a36Sopenharmony_ci struct swap_iocb *splug = NULL; 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci rcu_read_lock(); 24862306a36Sopenharmony_ci xas_for_each(&xas, page, end_index) { 24962306a36Sopenharmony_ci unsigned long addr; 25062306a36Sopenharmony_ci swp_entry_t entry; 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci if (!xa_is_value(page)) 25362306a36Sopenharmony_ci continue; 25462306a36Sopenharmony_ci entry = radix_to_swp_entry(page); 25562306a36Sopenharmony_ci /* There might be swapin error entries in shmem mapping. */ 25662306a36Sopenharmony_ci if (non_swap_entry(entry)) 25762306a36Sopenharmony_ci continue; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci addr = vma->vm_start + 26062306a36Sopenharmony_ci ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); 26162306a36Sopenharmony_ci xas_pause(&xas); 26262306a36Sopenharmony_ci rcu_read_unlock(); 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci page = read_swap_cache_async(entry, mapping_gfp_mask(mapping), 26562306a36Sopenharmony_ci vma, addr, &splug); 26662306a36Sopenharmony_ci if (page) 26762306a36Sopenharmony_ci put_page(page); 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci rcu_read_lock(); 27062306a36Sopenharmony_ci } 27162306a36Sopenharmony_ci rcu_read_unlock(); 27262306a36Sopenharmony_ci swap_read_unplug(splug); 27362306a36Sopenharmony_ci} 27462306a36Sopenharmony_ci#endif /* CONFIG_SWAP */ 27562306a36Sopenharmony_ci 27662306a36Sopenharmony_ci/* 27762306a36Sopenharmony_ci * Schedule all required I/O operations. Do not wait for completion. 27862306a36Sopenharmony_ci */ 27962306a36Sopenharmony_cistatic long madvise_willneed(struct vm_area_struct *vma, 28062306a36Sopenharmony_ci struct vm_area_struct **prev, 28162306a36Sopenharmony_ci unsigned long start, unsigned long end) 28262306a36Sopenharmony_ci{ 28362306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 28462306a36Sopenharmony_ci struct file *file = vma->vm_file; 28562306a36Sopenharmony_ci loff_t offset; 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci *prev = vma; 28862306a36Sopenharmony_ci#ifdef CONFIG_SWAP 28962306a36Sopenharmony_ci if (!file) { 29062306a36Sopenharmony_ci walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); 29162306a36Sopenharmony_ci lru_add_drain(); /* Push any new pages onto the LRU now */ 29262306a36Sopenharmony_ci return 0; 29362306a36Sopenharmony_ci } 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci if (shmem_mapping(file->f_mapping)) { 29662306a36Sopenharmony_ci shmem_swapin_range(vma, start, end, file->f_mapping); 29762306a36Sopenharmony_ci lru_add_drain(); /* Push any new pages onto the LRU now */ 29862306a36Sopenharmony_ci return 0; 29962306a36Sopenharmony_ci } 30062306a36Sopenharmony_ci#else 30162306a36Sopenharmony_ci if (!file) 30262306a36Sopenharmony_ci return -EBADF; 30362306a36Sopenharmony_ci#endif 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci if (IS_DAX(file_inode(file))) { 30662306a36Sopenharmony_ci /* no bad return value, but ignore advice */ 30762306a36Sopenharmony_ci return 0; 30862306a36Sopenharmony_ci } 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_ci /* 31162306a36Sopenharmony_ci * Filesystem's fadvise may need to take various locks. We need to 31262306a36Sopenharmony_ci * explicitly grab a reference because the vma (and hence the 31362306a36Sopenharmony_ci * vma's reference to the file) can go away as soon as we drop 31462306a36Sopenharmony_ci * mmap_lock. 31562306a36Sopenharmony_ci */ 31662306a36Sopenharmony_ci *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 31762306a36Sopenharmony_ci get_file(file); 31862306a36Sopenharmony_ci offset = (loff_t)(start - vma->vm_start) 31962306a36Sopenharmony_ci + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 32062306a36Sopenharmony_ci mmap_read_unlock(mm); 32162306a36Sopenharmony_ci vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); 32262306a36Sopenharmony_ci fput(file); 32362306a36Sopenharmony_ci mmap_read_lock(mm); 32462306a36Sopenharmony_ci return 0; 32562306a36Sopenharmony_ci} 32662306a36Sopenharmony_ci 32762306a36Sopenharmony_cistatic inline bool can_do_file_pageout(struct vm_area_struct *vma) 32862306a36Sopenharmony_ci{ 32962306a36Sopenharmony_ci if (!vma->vm_file) 33062306a36Sopenharmony_ci return false; 33162306a36Sopenharmony_ci /* 33262306a36Sopenharmony_ci * paging out pagecache only for non-anonymous mappings that correspond 33362306a36Sopenharmony_ci * to the files the calling process could (if tried) open for writing; 33462306a36Sopenharmony_ci * otherwise we'd be including shared non-exclusive mappings, which 33562306a36Sopenharmony_ci * opens a side channel. 33662306a36Sopenharmony_ci */ 33762306a36Sopenharmony_ci return inode_owner_or_capable(&nop_mnt_idmap, 33862306a36Sopenharmony_ci file_inode(vma->vm_file)) || 33962306a36Sopenharmony_ci file_permission(vma->vm_file, MAY_WRITE) == 0; 34062306a36Sopenharmony_ci} 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_cistatic int madvise_cold_or_pageout_pte_range(pmd_t *pmd, 34362306a36Sopenharmony_ci unsigned long addr, unsigned long end, 34462306a36Sopenharmony_ci struct mm_walk *walk) 34562306a36Sopenharmony_ci{ 34662306a36Sopenharmony_ci struct madvise_walk_private *private = walk->private; 34762306a36Sopenharmony_ci struct mmu_gather *tlb = private->tlb; 34862306a36Sopenharmony_ci bool pageout = private->pageout; 34962306a36Sopenharmony_ci struct mm_struct *mm = tlb->mm; 35062306a36Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 35162306a36Sopenharmony_ci pte_t *start_pte, *pte, ptent; 35262306a36Sopenharmony_ci spinlock_t *ptl; 35362306a36Sopenharmony_ci struct folio *folio = NULL; 35462306a36Sopenharmony_ci LIST_HEAD(folio_list); 35562306a36Sopenharmony_ci bool pageout_anon_only_filter; 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci if (fatal_signal_pending(current)) 35862306a36Sopenharmony_ci return -EINTR; 35962306a36Sopenharmony_ci 36062306a36Sopenharmony_ci pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && 36162306a36Sopenharmony_ci !can_do_file_pageout(vma); 36262306a36Sopenharmony_ci 36362306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 36462306a36Sopenharmony_ci if (pmd_trans_huge(*pmd)) { 36562306a36Sopenharmony_ci pmd_t orig_pmd; 36662306a36Sopenharmony_ci unsigned long next = pmd_addr_end(addr, end); 36762306a36Sopenharmony_ci 36862306a36Sopenharmony_ci tlb_change_page_size(tlb, HPAGE_PMD_SIZE); 36962306a36Sopenharmony_ci ptl = pmd_trans_huge_lock(pmd, vma); 37062306a36Sopenharmony_ci if (!ptl) 37162306a36Sopenharmony_ci return 0; 37262306a36Sopenharmony_ci 37362306a36Sopenharmony_ci orig_pmd = *pmd; 37462306a36Sopenharmony_ci if (is_huge_zero_pmd(orig_pmd)) 37562306a36Sopenharmony_ci goto huge_unlock; 37662306a36Sopenharmony_ci 37762306a36Sopenharmony_ci if (unlikely(!pmd_present(orig_pmd))) { 37862306a36Sopenharmony_ci VM_BUG_ON(thp_migration_supported() && 37962306a36Sopenharmony_ci !is_pmd_migration_entry(orig_pmd)); 38062306a36Sopenharmony_ci goto huge_unlock; 38162306a36Sopenharmony_ci } 38262306a36Sopenharmony_ci 38362306a36Sopenharmony_ci folio = pfn_folio(pmd_pfn(orig_pmd)); 38462306a36Sopenharmony_ci 38562306a36Sopenharmony_ci /* Do not interfere with other mappings of this folio */ 38662306a36Sopenharmony_ci if (folio_estimated_sharers(folio) != 1) 38762306a36Sopenharmony_ci goto huge_unlock; 38862306a36Sopenharmony_ci 38962306a36Sopenharmony_ci if (pageout_anon_only_filter && !folio_test_anon(folio)) 39062306a36Sopenharmony_ci goto huge_unlock; 39162306a36Sopenharmony_ci 39262306a36Sopenharmony_ci if (next - addr != HPAGE_PMD_SIZE) { 39362306a36Sopenharmony_ci int err; 39462306a36Sopenharmony_ci 39562306a36Sopenharmony_ci folio_get(folio); 39662306a36Sopenharmony_ci spin_unlock(ptl); 39762306a36Sopenharmony_ci folio_lock(folio); 39862306a36Sopenharmony_ci err = split_folio(folio); 39962306a36Sopenharmony_ci folio_unlock(folio); 40062306a36Sopenharmony_ci folio_put(folio); 40162306a36Sopenharmony_ci if (!err) 40262306a36Sopenharmony_ci goto regular_folio; 40362306a36Sopenharmony_ci return 0; 40462306a36Sopenharmony_ci } 40562306a36Sopenharmony_ci 40662306a36Sopenharmony_ci if (pmd_young(orig_pmd)) { 40762306a36Sopenharmony_ci pmdp_invalidate(vma, addr, pmd); 40862306a36Sopenharmony_ci orig_pmd = pmd_mkold(orig_pmd); 40962306a36Sopenharmony_ci 41062306a36Sopenharmony_ci set_pmd_at(mm, addr, pmd, orig_pmd); 41162306a36Sopenharmony_ci tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 41262306a36Sopenharmony_ci } 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci folio_clear_referenced(folio); 41562306a36Sopenharmony_ci folio_test_clear_young(folio); 41662306a36Sopenharmony_ci if (folio_test_active(folio)) 41762306a36Sopenharmony_ci folio_set_workingset(folio); 41862306a36Sopenharmony_ci if (pageout) { 41962306a36Sopenharmony_ci if (folio_isolate_lru(folio)) { 42062306a36Sopenharmony_ci if (folio_test_unevictable(folio)) 42162306a36Sopenharmony_ci folio_putback_lru(folio); 42262306a36Sopenharmony_ci else 42362306a36Sopenharmony_ci list_add(&folio->lru, &folio_list); 42462306a36Sopenharmony_ci } 42562306a36Sopenharmony_ci } else 42662306a36Sopenharmony_ci folio_deactivate(folio); 42762306a36Sopenharmony_cihuge_unlock: 42862306a36Sopenharmony_ci spin_unlock(ptl); 42962306a36Sopenharmony_ci if (pageout) 43062306a36Sopenharmony_ci reclaim_pages(&folio_list); 43162306a36Sopenharmony_ci return 0; 43262306a36Sopenharmony_ci } 43362306a36Sopenharmony_ci 43462306a36Sopenharmony_ciregular_folio: 43562306a36Sopenharmony_ci#endif 43662306a36Sopenharmony_ci tlb_change_page_size(tlb, PAGE_SIZE); 43762306a36Sopenharmony_ci start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 43862306a36Sopenharmony_ci if (!start_pte) 43962306a36Sopenharmony_ci return 0; 44062306a36Sopenharmony_ci flush_tlb_batched_pending(mm); 44162306a36Sopenharmony_ci arch_enter_lazy_mmu_mode(); 44262306a36Sopenharmony_ci for (; addr < end; pte++, addr += PAGE_SIZE) { 44362306a36Sopenharmony_ci ptent = ptep_get(pte); 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_ci if (pte_none(ptent)) 44662306a36Sopenharmony_ci continue; 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci if (!pte_present(ptent)) 44962306a36Sopenharmony_ci continue; 45062306a36Sopenharmony_ci 45162306a36Sopenharmony_ci folio = vm_normal_folio(vma, addr, ptent); 45262306a36Sopenharmony_ci if (!folio || folio_is_zone_device(folio)) 45362306a36Sopenharmony_ci continue; 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_ci /* 45662306a36Sopenharmony_ci * Creating a THP page is expensive so split it only if we 45762306a36Sopenharmony_ci * are sure it's worth. Split it if we are only owner. 45862306a36Sopenharmony_ci */ 45962306a36Sopenharmony_ci if (folio_test_large(folio)) { 46062306a36Sopenharmony_ci int err; 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_ci if (folio_estimated_sharers(folio) != 1) 46362306a36Sopenharmony_ci break; 46462306a36Sopenharmony_ci if (pageout_anon_only_filter && !folio_test_anon(folio)) 46562306a36Sopenharmony_ci break; 46662306a36Sopenharmony_ci if (!folio_trylock(folio)) 46762306a36Sopenharmony_ci break; 46862306a36Sopenharmony_ci folio_get(folio); 46962306a36Sopenharmony_ci arch_leave_lazy_mmu_mode(); 47062306a36Sopenharmony_ci pte_unmap_unlock(start_pte, ptl); 47162306a36Sopenharmony_ci start_pte = NULL; 47262306a36Sopenharmony_ci err = split_folio(folio); 47362306a36Sopenharmony_ci folio_unlock(folio); 47462306a36Sopenharmony_ci folio_put(folio); 47562306a36Sopenharmony_ci if (err) 47662306a36Sopenharmony_ci break; 47762306a36Sopenharmony_ci start_pte = pte = 47862306a36Sopenharmony_ci pte_offset_map_lock(mm, pmd, addr, &ptl); 47962306a36Sopenharmony_ci if (!start_pte) 48062306a36Sopenharmony_ci break; 48162306a36Sopenharmony_ci arch_enter_lazy_mmu_mode(); 48262306a36Sopenharmony_ci pte--; 48362306a36Sopenharmony_ci addr -= PAGE_SIZE; 48462306a36Sopenharmony_ci continue; 48562306a36Sopenharmony_ci } 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci /* 48862306a36Sopenharmony_ci * Do not interfere with other mappings of this folio and 48962306a36Sopenharmony_ci * non-LRU folio. 49062306a36Sopenharmony_ci */ 49162306a36Sopenharmony_ci if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) 49262306a36Sopenharmony_ci continue; 49362306a36Sopenharmony_ci 49462306a36Sopenharmony_ci if (pageout_anon_only_filter && !folio_test_anon(folio)) 49562306a36Sopenharmony_ci continue; 49662306a36Sopenharmony_ci 49762306a36Sopenharmony_ci VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 49862306a36Sopenharmony_ci 49962306a36Sopenharmony_ci if (pte_young(ptent)) { 50062306a36Sopenharmony_ci ptent = ptep_get_and_clear_full(mm, addr, pte, 50162306a36Sopenharmony_ci tlb->fullmm); 50262306a36Sopenharmony_ci ptent = pte_mkold(ptent); 50362306a36Sopenharmony_ci set_pte_at(mm, addr, pte, ptent); 50462306a36Sopenharmony_ci tlb_remove_tlb_entry(tlb, pte, addr); 50562306a36Sopenharmony_ci } 50662306a36Sopenharmony_ci 50762306a36Sopenharmony_ci /* 50862306a36Sopenharmony_ci * We are deactivating a folio for accelerating reclaiming. 50962306a36Sopenharmony_ci * VM couldn't reclaim the folio unless we clear PG_young. 51062306a36Sopenharmony_ci * As a side effect, it makes confuse idle-page tracking 51162306a36Sopenharmony_ci * because they will miss recent referenced history. 51262306a36Sopenharmony_ci */ 51362306a36Sopenharmony_ci folio_clear_referenced(folio); 51462306a36Sopenharmony_ci folio_test_clear_young(folio); 51562306a36Sopenharmony_ci if (folio_test_active(folio)) 51662306a36Sopenharmony_ci folio_set_workingset(folio); 51762306a36Sopenharmony_ci if (pageout) { 51862306a36Sopenharmony_ci if (folio_isolate_lru(folio)) { 51962306a36Sopenharmony_ci if (folio_test_unevictable(folio)) 52062306a36Sopenharmony_ci folio_putback_lru(folio); 52162306a36Sopenharmony_ci else 52262306a36Sopenharmony_ci list_add(&folio->lru, &folio_list); 52362306a36Sopenharmony_ci } 52462306a36Sopenharmony_ci } else 52562306a36Sopenharmony_ci folio_deactivate(folio); 52662306a36Sopenharmony_ci } 52762306a36Sopenharmony_ci 52862306a36Sopenharmony_ci if (start_pte) { 52962306a36Sopenharmony_ci arch_leave_lazy_mmu_mode(); 53062306a36Sopenharmony_ci pte_unmap_unlock(start_pte, ptl); 53162306a36Sopenharmony_ci } 53262306a36Sopenharmony_ci if (pageout) 53362306a36Sopenharmony_ci reclaim_pages(&folio_list); 53462306a36Sopenharmony_ci cond_resched(); 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci return 0; 53762306a36Sopenharmony_ci} 53862306a36Sopenharmony_ci 53962306a36Sopenharmony_cistatic const struct mm_walk_ops cold_walk_ops = { 54062306a36Sopenharmony_ci .pmd_entry = madvise_cold_or_pageout_pte_range, 54162306a36Sopenharmony_ci .walk_lock = PGWALK_RDLOCK, 54262306a36Sopenharmony_ci}; 54362306a36Sopenharmony_ci 54462306a36Sopenharmony_cistatic void madvise_cold_page_range(struct mmu_gather *tlb, 54562306a36Sopenharmony_ci struct vm_area_struct *vma, 54662306a36Sopenharmony_ci unsigned long addr, unsigned long end) 54762306a36Sopenharmony_ci{ 54862306a36Sopenharmony_ci struct madvise_walk_private walk_private = { 54962306a36Sopenharmony_ci .pageout = false, 55062306a36Sopenharmony_ci .tlb = tlb, 55162306a36Sopenharmony_ci }; 55262306a36Sopenharmony_ci 55362306a36Sopenharmony_ci tlb_start_vma(tlb, vma); 55462306a36Sopenharmony_ci walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 55562306a36Sopenharmony_ci tlb_end_vma(tlb, vma); 55662306a36Sopenharmony_ci} 55762306a36Sopenharmony_ci 55862306a36Sopenharmony_cistatic inline bool can_madv_lru_vma(struct vm_area_struct *vma) 55962306a36Sopenharmony_ci{ 56062306a36Sopenharmony_ci return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); 56162306a36Sopenharmony_ci} 56262306a36Sopenharmony_ci 56362306a36Sopenharmony_cistatic long madvise_cold(struct vm_area_struct *vma, 56462306a36Sopenharmony_ci struct vm_area_struct **prev, 56562306a36Sopenharmony_ci unsigned long start_addr, unsigned long end_addr) 56662306a36Sopenharmony_ci{ 56762306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 56862306a36Sopenharmony_ci struct mmu_gather tlb; 56962306a36Sopenharmony_ci 57062306a36Sopenharmony_ci *prev = vma; 57162306a36Sopenharmony_ci if (!can_madv_lru_vma(vma)) 57262306a36Sopenharmony_ci return -EINVAL; 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_ci lru_add_drain(); 57562306a36Sopenharmony_ci tlb_gather_mmu(&tlb, mm); 57662306a36Sopenharmony_ci madvise_cold_page_range(&tlb, vma, start_addr, end_addr); 57762306a36Sopenharmony_ci tlb_finish_mmu(&tlb); 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci return 0; 58062306a36Sopenharmony_ci} 58162306a36Sopenharmony_ci 58262306a36Sopenharmony_cistatic void madvise_pageout_page_range(struct mmu_gather *tlb, 58362306a36Sopenharmony_ci struct vm_area_struct *vma, 58462306a36Sopenharmony_ci unsigned long addr, unsigned long end) 58562306a36Sopenharmony_ci{ 58662306a36Sopenharmony_ci struct madvise_walk_private walk_private = { 58762306a36Sopenharmony_ci .pageout = true, 58862306a36Sopenharmony_ci .tlb = tlb, 58962306a36Sopenharmony_ci }; 59062306a36Sopenharmony_ci 59162306a36Sopenharmony_ci tlb_start_vma(tlb, vma); 59262306a36Sopenharmony_ci walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); 59362306a36Sopenharmony_ci tlb_end_vma(tlb, vma); 59462306a36Sopenharmony_ci} 59562306a36Sopenharmony_ci 59662306a36Sopenharmony_cistatic long madvise_pageout(struct vm_area_struct *vma, 59762306a36Sopenharmony_ci struct vm_area_struct **prev, 59862306a36Sopenharmony_ci unsigned long start_addr, unsigned long end_addr) 59962306a36Sopenharmony_ci{ 60062306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 60162306a36Sopenharmony_ci struct mmu_gather tlb; 60262306a36Sopenharmony_ci 60362306a36Sopenharmony_ci *prev = vma; 60462306a36Sopenharmony_ci if (!can_madv_lru_vma(vma)) 60562306a36Sopenharmony_ci return -EINVAL; 60662306a36Sopenharmony_ci 60762306a36Sopenharmony_ci /* 60862306a36Sopenharmony_ci * If the VMA belongs to a private file mapping, there can be private 60962306a36Sopenharmony_ci * dirty pages which can be paged out if even this process is neither 61062306a36Sopenharmony_ci * owner nor write capable of the file. We allow private file mappings 61162306a36Sopenharmony_ci * further to pageout dirty anon pages. 61262306a36Sopenharmony_ci */ 61362306a36Sopenharmony_ci if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && 61462306a36Sopenharmony_ci (vma->vm_flags & VM_MAYSHARE))) 61562306a36Sopenharmony_ci return 0; 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci lru_add_drain(); 61862306a36Sopenharmony_ci tlb_gather_mmu(&tlb, mm); 61962306a36Sopenharmony_ci madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); 62062306a36Sopenharmony_ci tlb_finish_mmu(&tlb); 62162306a36Sopenharmony_ci 62262306a36Sopenharmony_ci return 0; 62362306a36Sopenharmony_ci} 62462306a36Sopenharmony_ci 62562306a36Sopenharmony_cistatic int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, 62662306a36Sopenharmony_ci unsigned long end, struct mm_walk *walk) 62762306a36Sopenharmony_ci 62862306a36Sopenharmony_ci{ 62962306a36Sopenharmony_ci struct mmu_gather *tlb = walk->private; 63062306a36Sopenharmony_ci struct mm_struct *mm = tlb->mm; 63162306a36Sopenharmony_ci struct vm_area_struct *vma = walk->vma; 63262306a36Sopenharmony_ci spinlock_t *ptl; 63362306a36Sopenharmony_ci pte_t *start_pte, *pte, ptent; 63462306a36Sopenharmony_ci struct folio *folio; 63562306a36Sopenharmony_ci int nr_swap = 0; 63662306a36Sopenharmony_ci unsigned long next; 63762306a36Sopenharmony_ci 63862306a36Sopenharmony_ci next = pmd_addr_end(addr, end); 63962306a36Sopenharmony_ci if (pmd_trans_huge(*pmd)) 64062306a36Sopenharmony_ci if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) 64162306a36Sopenharmony_ci return 0; 64262306a36Sopenharmony_ci 64362306a36Sopenharmony_ci tlb_change_page_size(tlb, PAGE_SIZE); 64462306a36Sopenharmony_ci start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 64562306a36Sopenharmony_ci if (!start_pte) 64662306a36Sopenharmony_ci return 0; 64762306a36Sopenharmony_ci flush_tlb_batched_pending(mm); 64862306a36Sopenharmony_ci arch_enter_lazy_mmu_mode(); 64962306a36Sopenharmony_ci for (; addr != end; pte++, addr += PAGE_SIZE) { 65062306a36Sopenharmony_ci ptent = ptep_get(pte); 65162306a36Sopenharmony_ci 65262306a36Sopenharmony_ci if (pte_none(ptent)) 65362306a36Sopenharmony_ci continue; 65462306a36Sopenharmony_ci /* 65562306a36Sopenharmony_ci * If the pte has swp_entry, just clear page table to 65662306a36Sopenharmony_ci * prevent swap-in which is more expensive rather than 65762306a36Sopenharmony_ci * (page allocation + zeroing). 65862306a36Sopenharmony_ci */ 65962306a36Sopenharmony_ci if (!pte_present(ptent)) { 66062306a36Sopenharmony_ci swp_entry_t entry; 66162306a36Sopenharmony_ci 66262306a36Sopenharmony_ci entry = pte_to_swp_entry(ptent); 66362306a36Sopenharmony_ci if (!non_swap_entry(entry)) { 66462306a36Sopenharmony_ci nr_swap--; 66562306a36Sopenharmony_ci free_swap_and_cache(entry); 66662306a36Sopenharmony_ci pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 66762306a36Sopenharmony_ci } else if (is_hwpoison_entry(entry) || 66862306a36Sopenharmony_ci is_poisoned_swp_entry(entry)) { 66962306a36Sopenharmony_ci pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 67062306a36Sopenharmony_ci } 67162306a36Sopenharmony_ci continue; 67262306a36Sopenharmony_ci } 67362306a36Sopenharmony_ci 67462306a36Sopenharmony_ci folio = vm_normal_folio(vma, addr, ptent); 67562306a36Sopenharmony_ci if (!folio || folio_is_zone_device(folio)) 67662306a36Sopenharmony_ci continue; 67762306a36Sopenharmony_ci 67862306a36Sopenharmony_ci /* 67962306a36Sopenharmony_ci * If pmd isn't transhuge but the folio is large and 68062306a36Sopenharmony_ci * is owned by only this process, split it and 68162306a36Sopenharmony_ci * deactivate all pages. 68262306a36Sopenharmony_ci */ 68362306a36Sopenharmony_ci if (folio_test_large(folio)) { 68462306a36Sopenharmony_ci int err; 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci if (folio_estimated_sharers(folio) != 1) 68762306a36Sopenharmony_ci break; 68862306a36Sopenharmony_ci if (!folio_trylock(folio)) 68962306a36Sopenharmony_ci break; 69062306a36Sopenharmony_ci folio_get(folio); 69162306a36Sopenharmony_ci arch_leave_lazy_mmu_mode(); 69262306a36Sopenharmony_ci pte_unmap_unlock(start_pte, ptl); 69362306a36Sopenharmony_ci start_pte = NULL; 69462306a36Sopenharmony_ci err = split_folio(folio); 69562306a36Sopenharmony_ci folio_unlock(folio); 69662306a36Sopenharmony_ci folio_put(folio); 69762306a36Sopenharmony_ci if (err) 69862306a36Sopenharmony_ci break; 69962306a36Sopenharmony_ci start_pte = pte = 70062306a36Sopenharmony_ci pte_offset_map_lock(mm, pmd, addr, &ptl); 70162306a36Sopenharmony_ci if (!start_pte) 70262306a36Sopenharmony_ci break; 70362306a36Sopenharmony_ci arch_enter_lazy_mmu_mode(); 70462306a36Sopenharmony_ci pte--; 70562306a36Sopenharmony_ci addr -= PAGE_SIZE; 70662306a36Sopenharmony_ci continue; 70762306a36Sopenharmony_ci } 70862306a36Sopenharmony_ci 70962306a36Sopenharmony_ci if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { 71062306a36Sopenharmony_ci if (!folio_trylock(folio)) 71162306a36Sopenharmony_ci continue; 71262306a36Sopenharmony_ci /* 71362306a36Sopenharmony_ci * If folio is shared with others, we mustn't clear 71462306a36Sopenharmony_ci * the folio's dirty flag. 71562306a36Sopenharmony_ci */ 71662306a36Sopenharmony_ci if (folio_mapcount(folio) != 1) { 71762306a36Sopenharmony_ci folio_unlock(folio); 71862306a36Sopenharmony_ci continue; 71962306a36Sopenharmony_ci } 72062306a36Sopenharmony_ci 72162306a36Sopenharmony_ci if (folio_test_swapcache(folio) && 72262306a36Sopenharmony_ci !folio_free_swap(folio)) { 72362306a36Sopenharmony_ci folio_unlock(folio); 72462306a36Sopenharmony_ci continue; 72562306a36Sopenharmony_ci } 72662306a36Sopenharmony_ci 72762306a36Sopenharmony_ci folio_clear_dirty(folio); 72862306a36Sopenharmony_ci folio_unlock(folio); 72962306a36Sopenharmony_ci } 73062306a36Sopenharmony_ci 73162306a36Sopenharmony_ci if (pte_young(ptent) || pte_dirty(ptent)) { 73262306a36Sopenharmony_ci /* 73362306a36Sopenharmony_ci * Some of architecture(ex, PPC) don't update TLB 73462306a36Sopenharmony_ci * with set_pte_at and tlb_remove_tlb_entry so for 73562306a36Sopenharmony_ci * the portability, remap the pte with old|clean 73662306a36Sopenharmony_ci * after pte clearing. 73762306a36Sopenharmony_ci */ 73862306a36Sopenharmony_ci ptent = ptep_get_and_clear_full(mm, addr, pte, 73962306a36Sopenharmony_ci tlb->fullmm); 74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ci ptent = pte_mkold(ptent); 74262306a36Sopenharmony_ci ptent = pte_mkclean(ptent); 74362306a36Sopenharmony_ci set_pte_at(mm, addr, pte, ptent); 74462306a36Sopenharmony_ci tlb_remove_tlb_entry(tlb, pte, addr); 74562306a36Sopenharmony_ci } 74662306a36Sopenharmony_ci folio_mark_lazyfree(folio); 74762306a36Sopenharmony_ci } 74862306a36Sopenharmony_ci 74962306a36Sopenharmony_ci if (nr_swap) { 75062306a36Sopenharmony_ci if (current->mm == mm) 75162306a36Sopenharmony_ci sync_mm_rss(mm); 75262306a36Sopenharmony_ci add_mm_counter(mm, MM_SWAPENTS, nr_swap); 75362306a36Sopenharmony_ci } 75462306a36Sopenharmony_ci if (start_pte) { 75562306a36Sopenharmony_ci arch_leave_lazy_mmu_mode(); 75662306a36Sopenharmony_ci pte_unmap_unlock(start_pte, ptl); 75762306a36Sopenharmony_ci } 75862306a36Sopenharmony_ci cond_resched(); 75962306a36Sopenharmony_ci 76062306a36Sopenharmony_ci return 0; 76162306a36Sopenharmony_ci} 76262306a36Sopenharmony_ci 76362306a36Sopenharmony_cistatic const struct mm_walk_ops madvise_free_walk_ops = { 76462306a36Sopenharmony_ci .pmd_entry = madvise_free_pte_range, 76562306a36Sopenharmony_ci .walk_lock = PGWALK_RDLOCK, 76662306a36Sopenharmony_ci}; 76762306a36Sopenharmony_ci 76862306a36Sopenharmony_cistatic int madvise_free_single_vma(struct vm_area_struct *vma, 76962306a36Sopenharmony_ci unsigned long start_addr, unsigned long end_addr) 77062306a36Sopenharmony_ci{ 77162306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 77262306a36Sopenharmony_ci struct mmu_notifier_range range; 77362306a36Sopenharmony_ci struct mmu_gather tlb; 77462306a36Sopenharmony_ci 77562306a36Sopenharmony_ci /* MADV_FREE works for only anon vma at the moment */ 77662306a36Sopenharmony_ci if (!vma_is_anonymous(vma)) 77762306a36Sopenharmony_ci return -EINVAL; 77862306a36Sopenharmony_ci 77962306a36Sopenharmony_ci range.start = max(vma->vm_start, start_addr); 78062306a36Sopenharmony_ci if (range.start >= vma->vm_end) 78162306a36Sopenharmony_ci return -EINVAL; 78262306a36Sopenharmony_ci range.end = min(vma->vm_end, end_addr); 78362306a36Sopenharmony_ci if (range.end <= vma->vm_start) 78462306a36Sopenharmony_ci return -EINVAL; 78562306a36Sopenharmony_ci mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, 78662306a36Sopenharmony_ci range.start, range.end); 78762306a36Sopenharmony_ci 78862306a36Sopenharmony_ci lru_add_drain(); 78962306a36Sopenharmony_ci tlb_gather_mmu(&tlb, mm); 79062306a36Sopenharmony_ci update_hiwater_rss(mm); 79162306a36Sopenharmony_ci 79262306a36Sopenharmony_ci mmu_notifier_invalidate_range_start(&range); 79362306a36Sopenharmony_ci tlb_start_vma(&tlb, vma); 79462306a36Sopenharmony_ci walk_page_range(vma->vm_mm, range.start, range.end, 79562306a36Sopenharmony_ci &madvise_free_walk_ops, &tlb); 79662306a36Sopenharmony_ci tlb_end_vma(&tlb, vma); 79762306a36Sopenharmony_ci mmu_notifier_invalidate_range_end(&range); 79862306a36Sopenharmony_ci tlb_finish_mmu(&tlb); 79962306a36Sopenharmony_ci 80062306a36Sopenharmony_ci return 0; 80162306a36Sopenharmony_ci} 80262306a36Sopenharmony_ci 80362306a36Sopenharmony_ci/* 80462306a36Sopenharmony_ci * Application no longer needs these pages. If the pages are dirty, 80562306a36Sopenharmony_ci * it's OK to just throw them away. The app will be more careful about 80662306a36Sopenharmony_ci * data it wants to keep. Be sure to free swap resources too. The 80762306a36Sopenharmony_ci * zap_page_range_single call sets things up for shrink_active_list to actually 80862306a36Sopenharmony_ci * free these pages later if no one else has touched them in the meantime, 80962306a36Sopenharmony_ci * although we could add these pages to a global reuse list for 81062306a36Sopenharmony_ci * shrink_active_list to pick up before reclaiming other pages. 81162306a36Sopenharmony_ci * 81262306a36Sopenharmony_ci * NB: This interface discards data rather than pushes it out to swap, 81362306a36Sopenharmony_ci * as some implementations do. This has performance implications for 81462306a36Sopenharmony_ci * applications like large transactional databases which want to discard 81562306a36Sopenharmony_ci * pages in anonymous maps after committing to backing store the data 81662306a36Sopenharmony_ci * that was kept in them. There is no reason to write this data out to 81762306a36Sopenharmony_ci * the swap area if the application is discarding it. 81862306a36Sopenharmony_ci * 81962306a36Sopenharmony_ci * An interface that causes the system to free clean pages and flush 82062306a36Sopenharmony_ci * dirty pages is already available as msync(MS_INVALIDATE). 82162306a36Sopenharmony_ci */ 82262306a36Sopenharmony_cistatic long madvise_dontneed_single_vma(struct vm_area_struct *vma, 82362306a36Sopenharmony_ci unsigned long start, unsigned long end) 82462306a36Sopenharmony_ci{ 82562306a36Sopenharmony_ci zap_page_range_single(vma, start, end - start, NULL); 82662306a36Sopenharmony_ci return 0; 82762306a36Sopenharmony_ci} 82862306a36Sopenharmony_ci 82962306a36Sopenharmony_cistatic bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, 83062306a36Sopenharmony_ci unsigned long start, 83162306a36Sopenharmony_ci unsigned long *end, 83262306a36Sopenharmony_ci int behavior) 83362306a36Sopenharmony_ci{ 83462306a36Sopenharmony_ci if (!is_vm_hugetlb_page(vma)) { 83562306a36Sopenharmony_ci unsigned int forbidden = VM_PFNMAP; 83662306a36Sopenharmony_ci 83762306a36Sopenharmony_ci if (behavior != MADV_DONTNEED_LOCKED) 83862306a36Sopenharmony_ci forbidden |= VM_LOCKED; 83962306a36Sopenharmony_ci 84062306a36Sopenharmony_ci return !(vma->vm_flags & forbidden); 84162306a36Sopenharmony_ci } 84262306a36Sopenharmony_ci 84362306a36Sopenharmony_ci if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) 84462306a36Sopenharmony_ci return false; 84562306a36Sopenharmony_ci if (start & ~huge_page_mask(hstate_vma(vma))) 84662306a36Sopenharmony_ci return false; 84762306a36Sopenharmony_ci 84862306a36Sopenharmony_ci /* 84962306a36Sopenharmony_ci * Madvise callers expect the length to be rounded up to PAGE_SIZE 85062306a36Sopenharmony_ci * boundaries, and may be unaware that this VMA uses huge pages. 85162306a36Sopenharmony_ci * Avoid unexpected data loss by rounding down the number of 85262306a36Sopenharmony_ci * huge pages freed. 85362306a36Sopenharmony_ci */ 85462306a36Sopenharmony_ci *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); 85562306a36Sopenharmony_ci 85662306a36Sopenharmony_ci return true; 85762306a36Sopenharmony_ci} 85862306a36Sopenharmony_ci 85962306a36Sopenharmony_cistatic long madvise_dontneed_free(struct vm_area_struct *vma, 86062306a36Sopenharmony_ci struct vm_area_struct **prev, 86162306a36Sopenharmony_ci unsigned long start, unsigned long end, 86262306a36Sopenharmony_ci int behavior) 86362306a36Sopenharmony_ci{ 86462306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci *prev = vma; 86762306a36Sopenharmony_ci if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) 86862306a36Sopenharmony_ci return -EINVAL; 86962306a36Sopenharmony_ci 87062306a36Sopenharmony_ci if (start == end) 87162306a36Sopenharmony_ci return 0; 87262306a36Sopenharmony_ci 87362306a36Sopenharmony_ci if (!userfaultfd_remove(vma, start, end)) { 87462306a36Sopenharmony_ci *prev = NULL; /* mmap_lock has been dropped, prev is stale */ 87562306a36Sopenharmony_ci 87662306a36Sopenharmony_ci mmap_read_lock(mm); 87762306a36Sopenharmony_ci vma = vma_lookup(mm, start); 87862306a36Sopenharmony_ci if (!vma) 87962306a36Sopenharmony_ci return -ENOMEM; 88062306a36Sopenharmony_ci /* 88162306a36Sopenharmony_ci * Potential end adjustment for hugetlb vma is OK as 88262306a36Sopenharmony_ci * the check below keeps end within vma. 88362306a36Sopenharmony_ci */ 88462306a36Sopenharmony_ci if (!madvise_dontneed_free_valid_vma(vma, start, &end, 88562306a36Sopenharmony_ci behavior)) 88662306a36Sopenharmony_ci return -EINVAL; 88762306a36Sopenharmony_ci if (end > vma->vm_end) { 88862306a36Sopenharmony_ci /* 88962306a36Sopenharmony_ci * Don't fail if end > vma->vm_end. If the old 89062306a36Sopenharmony_ci * vma was split while the mmap_lock was 89162306a36Sopenharmony_ci * released the effect of the concurrent 89262306a36Sopenharmony_ci * operation may not cause madvise() to 89362306a36Sopenharmony_ci * have an undefined result. There may be an 89462306a36Sopenharmony_ci * adjacent next vma that we'll walk 89562306a36Sopenharmony_ci * next. userfaultfd_remove() will generate an 89662306a36Sopenharmony_ci * UFFD_EVENT_REMOVE repetition on the 89762306a36Sopenharmony_ci * end-vma->vm_end range, but the manager can 89862306a36Sopenharmony_ci * handle a repetition fine. 89962306a36Sopenharmony_ci */ 90062306a36Sopenharmony_ci end = vma->vm_end; 90162306a36Sopenharmony_ci } 90262306a36Sopenharmony_ci VM_WARN_ON(start >= end); 90362306a36Sopenharmony_ci } 90462306a36Sopenharmony_ci 90562306a36Sopenharmony_ci if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) 90662306a36Sopenharmony_ci return madvise_dontneed_single_vma(vma, start, end); 90762306a36Sopenharmony_ci else if (behavior == MADV_FREE) 90862306a36Sopenharmony_ci return madvise_free_single_vma(vma, start, end); 90962306a36Sopenharmony_ci else 91062306a36Sopenharmony_ci return -EINVAL; 91162306a36Sopenharmony_ci} 91262306a36Sopenharmony_ci 91362306a36Sopenharmony_cistatic long madvise_populate(struct vm_area_struct *vma, 91462306a36Sopenharmony_ci struct vm_area_struct **prev, 91562306a36Sopenharmony_ci unsigned long start, unsigned long end, 91662306a36Sopenharmony_ci int behavior) 91762306a36Sopenharmony_ci{ 91862306a36Sopenharmony_ci const bool write = behavior == MADV_POPULATE_WRITE; 91962306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 92062306a36Sopenharmony_ci unsigned long tmp_end; 92162306a36Sopenharmony_ci int locked = 1; 92262306a36Sopenharmony_ci long pages; 92362306a36Sopenharmony_ci 92462306a36Sopenharmony_ci *prev = vma; 92562306a36Sopenharmony_ci 92662306a36Sopenharmony_ci while (start < end) { 92762306a36Sopenharmony_ci /* 92862306a36Sopenharmony_ci * We might have temporarily dropped the lock. For example, 92962306a36Sopenharmony_ci * our VMA might have been split. 93062306a36Sopenharmony_ci */ 93162306a36Sopenharmony_ci if (!vma || start >= vma->vm_end) { 93262306a36Sopenharmony_ci vma = vma_lookup(mm, start); 93362306a36Sopenharmony_ci if (!vma) 93462306a36Sopenharmony_ci return -ENOMEM; 93562306a36Sopenharmony_ci } 93662306a36Sopenharmony_ci 93762306a36Sopenharmony_ci tmp_end = min_t(unsigned long, end, vma->vm_end); 93862306a36Sopenharmony_ci /* Populate (prefault) page tables readable/writable. */ 93962306a36Sopenharmony_ci pages = faultin_vma_page_range(vma, start, tmp_end, write, 94062306a36Sopenharmony_ci &locked); 94162306a36Sopenharmony_ci if (!locked) { 94262306a36Sopenharmony_ci mmap_read_lock(mm); 94362306a36Sopenharmony_ci locked = 1; 94462306a36Sopenharmony_ci *prev = NULL; 94562306a36Sopenharmony_ci vma = NULL; 94662306a36Sopenharmony_ci } 94762306a36Sopenharmony_ci if (pages < 0) { 94862306a36Sopenharmony_ci switch (pages) { 94962306a36Sopenharmony_ci case -EINTR: 95062306a36Sopenharmony_ci return -EINTR; 95162306a36Sopenharmony_ci case -EINVAL: /* Incompatible mappings / permissions. */ 95262306a36Sopenharmony_ci return -EINVAL; 95362306a36Sopenharmony_ci case -EHWPOISON: 95462306a36Sopenharmony_ci return -EHWPOISON; 95562306a36Sopenharmony_ci case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ 95662306a36Sopenharmony_ci return -EFAULT; 95762306a36Sopenharmony_ci default: 95862306a36Sopenharmony_ci pr_warn_once("%s: unhandled return value: %ld\n", 95962306a36Sopenharmony_ci __func__, pages); 96062306a36Sopenharmony_ci fallthrough; 96162306a36Sopenharmony_ci case -ENOMEM: 96262306a36Sopenharmony_ci return -ENOMEM; 96362306a36Sopenharmony_ci } 96462306a36Sopenharmony_ci } 96562306a36Sopenharmony_ci start += pages * PAGE_SIZE; 96662306a36Sopenharmony_ci } 96762306a36Sopenharmony_ci return 0; 96862306a36Sopenharmony_ci} 96962306a36Sopenharmony_ci 97062306a36Sopenharmony_ci/* 97162306a36Sopenharmony_ci * Application wants to free up the pages and associated backing store. 97262306a36Sopenharmony_ci * This is effectively punching a hole into the middle of a file. 97362306a36Sopenharmony_ci */ 97462306a36Sopenharmony_cistatic long madvise_remove(struct vm_area_struct *vma, 97562306a36Sopenharmony_ci struct vm_area_struct **prev, 97662306a36Sopenharmony_ci unsigned long start, unsigned long end) 97762306a36Sopenharmony_ci{ 97862306a36Sopenharmony_ci loff_t offset; 97962306a36Sopenharmony_ci int error; 98062306a36Sopenharmony_ci struct file *f; 98162306a36Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 98262306a36Sopenharmony_ci 98362306a36Sopenharmony_ci *prev = NULL; /* tell sys_madvise we drop mmap_lock */ 98462306a36Sopenharmony_ci 98562306a36Sopenharmony_ci if (vma->vm_flags & VM_LOCKED) 98662306a36Sopenharmony_ci return -EINVAL; 98762306a36Sopenharmony_ci 98862306a36Sopenharmony_ci f = vma->vm_file; 98962306a36Sopenharmony_ci 99062306a36Sopenharmony_ci if (!f || !f->f_mapping || !f->f_mapping->host) { 99162306a36Sopenharmony_ci return -EINVAL; 99262306a36Sopenharmony_ci } 99362306a36Sopenharmony_ci 99462306a36Sopenharmony_ci if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 99562306a36Sopenharmony_ci return -EACCES; 99662306a36Sopenharmony_ci 99762306a36Sopenharmony_ci offset = (loff_t)(start - vma->vm_start) 99862306a36Sopenharmony_ci + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 99962306a36Sopenharmony_ci 100062306a36Sopenharmony_ci /* 100162306a36Sopenharmony_ci * Filesystem's fallocate may need to take i_rwsem. We need to 100262306a36Sopenharmony_ci * explicitly grab a reference because the vma (and hence the 100362306a36Sopenharmony_ci * vma's reference to the file) can go away as soon as we drop 100462306a36Sopenharmony_ci * mmap_lock. 100562306a36Sopenharmony_ci */ 100662306a36Sopenharmony_ci get_file(f); 100762306a36Sopenharmony_ci if (userfaultfd_remove(vma, start, end)) { 100862306a36Sopenharmony_ci /* mmap_lock was not released by userfaultfd_remove() */ 100962306a36Sopenharmony_ci mmap_read_unlock(mm); 101062306a36Sopenharmony_ci } 101162306a36Sopenharmony_ci error = vfs_fallocate(f, 101262306a36Sopenharmony_ci FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 101362306a36Sopenharmony_ci offset, end - start); 101462306a36Sopenharmony_ci fput(f); 101562306a36Sopenharmony_ci mmap_read_lock(mm); 101662306a36Sopenharmony_ci return error; 101762306a36Sopenharmony_ci} 101862306a36Sopenharmony_ci 101962306a36Sopenharmony_ci/* 102062306a36Sopenharmony_ci * Apply an madvise behavior to a region of a vma. madvise_update_vma 102162306a36Sopenharmony_ci * will handle splitting a vm area into separate areas, each area with its own 102262306a36Sopenharmony_ci * behavior. 102362306a36Sopenharmony_ci */ 102462306a36Sopenharmony_cistatic int madvise_vma_behavior(struct vm_area_struct *vma, 102562306a36Sopenharmony_ci struct vm_area_struct **prev, 102662306a36Sopenharmony_ci unsigned long start, unsigned long end, 102762306a36Sopenharmony_ci unsigned long behavior) 102862306a36Sopenharmony_ci{ 102962306a36Sopenharmony_ci int error; 103062306a36Sopenharmony_ci struct anon_vma_name *anon_name; 103162306a36Sopenharmony_ci unsigned long new_flags = vma->vm_flags; 103262306a36Sopenharmony_ci 103362306a36Sopenharmony_ci switch (behavior) { 103462306a36Sopenharmony_ci case MADV_REMOVE: 103562306a36Sopenharmony_ci return madvise_remove(vma, prev, start, end); 103662306a36Sopenharmony_ci case MADV_WILLNEED: 103762306a36Sopenharmony_ci return madvise_willneed(vma, prev, start, end); 103862306a36Sopenharmony_ci case MADV_COLD: 103962306a36Sopenharmony_ci return madvise_cold(vma, prev, start, end); 104062306a36Sopenharmony_ci case MADV_PAGEOUT: 104162306a36Sopenharmony_ci return madvise_pageout(vma, prev, start, end); 104262306a36Sopenharmony_ci case MADV_FREE: 104362306a36Sopenharmony_ci case MADV_DONTNEED: 104462306a36Sopenharmony_ci case MADV_DONTNEED_LOCKED: 104562306a36Sopenharmony_ci return madvise_dontneed_free(vma, prev, start, end, behavior); 104662306a36Sopenharmony_ci case MADV_POPULATE_READ: 104762306a36Sopenharmony_ci case MADV_POPULATE_WRITE: 104862306a36Sopenharmony_ci return madvise_populate(vma, prev, start, end, behavior); 104962306a36Sopenharmony_ci case MADV_NORMAL: 105062306a36Sopenharmony_ci new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; 105162306a36Sopenharmony_ci break; 105262306a36Sopenharmony_ci case MADV_SEQUENTIAL: 105362306a36Sopenharmony_ci new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; 105462306a36Sopenharmony_ci break; 105562306a36Sopenharmony_ci case MADV_RANDOM: 105662306a36Sopenharmony_ci new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; 105762306a36Sopenharmony_ci break; 105862306a36Sopenharmony_ci case MADV_DONTFORK: 105962306a36Sopenharmony_ci new_flags |= VM_DONTCOPY; 106062306a36Sopenharmony_ci break; 106162306a36Sopenharmony_ci case MADV_DOFORK: 106262306a36Sopenharmony_ci if (vma->vm_flags & VM_IO) 106362306a36Sopenharmony_ci return -EINVAL; 106462306a36Sopenharmony_ci new_flags &= ~VM_DONTCOPY; 106562306a36Sopenharmony_ci break; 106662306a36Sopenharmony_ci case MADV_WIPEONFORK: 106762306a36Sopenharmony_ci /* MADV_WIPEONFORK is only supported on anonymous memory. */ 106862306a36Sopenharmony_ci if (vma->vm_file || vma->vm_flags & VM_SHARED) 106962306a36Sopenharmony_ci return -EINVAL; 107062306a36Sopenharmony_ci new_flags |= VM_WIPEONFORK; 107162306a36Sopenharmony_ci break; 107262306a36Sopenharmony_ci case MADV_KEEPONFORK: 107362306a36Sopenharmony_ci new_flags &= ~VM_WIPEONFORK; 107462306a36Sopenharmony_ci break; 107562306a36Sopenharmony_ci case MADV_DONTDUMP: 107662306a36Sopenharmony_ci new_flags |= VM_DONTDUMP; 107762306a36Sopenharmony_ci break; 107862306a36Sopenharmony_ci case MADV_DODUMP: 107962306a36Sopenharmony_ci if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) 108062306a36Sopenharmony_ci return -EINVAL; 108162306a36Sopenharmony_ci new_flags &= ~VM_DONTDUMP; 108262306a36Sopenharmony_ci break; 108362306a36Sopenharmony_ci case MADV_MERGEABLE: 108462306a36Sopenharmony_ci case MADV_UNMERGEABLE: 108562306a36Sopenharmony_ci error = ksm_madvise(vma, start, end, behavior, &new_flags); 108662306a36Sopenharmony_ci if (error) 108762306a36Sopenharmony_ci goto out; 108862306a36Sopenharmony_ci break; 108962306a36Sopenharmony_ci case MADV_HUGEPAGE: 109062306a36Sopenharmony_ci case MADV_NOHUGEPAGE: 109162306a36Sopenharmony_ci error = hugepage_madvise(vma, &new_flags, behavior); 109262306a36Sopenharmony_ci if (error) 109362306a36Sopenharmony_ci goto out; 109462306a36Sopenharmony_ci break; 109562306a36Sopenharmony_ci case MADV_COLLAPSE: 109662306a36Sopenharmony_ci return madvise_collapse(vma, prev, start, end); 109762306a36Sopenharmony_ci } 109862306a36Sopenharmony_ci 109962306a36Sopenharmony_ci anon_name = anon_vma_name(vma); 110062306a36Sopenharmony_ci anon_vma_name_get(anon_name); 110162306a36Sopenharmony_ci error = madvise_update_vma(vma, prev, start, end, new_flags, 110262306a36Sopenharmony_ci anon_name); 110362306a36Sopenharmony_ci anon_vma_name_put(anon_name); 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ciout: 110662306a36Sopenharmony_ci /* 110762306a36Sopenharmony_ci * madvise() returns EAGAIN if kernel resources, such as 110862306a36Sopenharmony_ci * slab, are temporarily unavailable. 110962306a36Sopenharmony_ci */ 111062306a36Sopenharmony_ci if (error == -ENOMEM) 111162306a36Sopenharmony_ci error = -EAGAIN; 111262306a36Sopenharmony_ci return error; 111362306a36Sopenharmony_ci} 111462306a36Sopenharmony_ci 111562306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE 111662306a36Sopenharmony_ci/* 111762306a36Sopenharmony_ci * Error injection support for memory error handling. 111862306a36Sopenharmony_ci */ 111962306a36Sopenharmony_cistatic int madvise_inject_error(int behavior, 112062306a36Sopenharmony_ci unsigned long start, unsigned long end) 112162306a36Sopenharmony_ci{ 112262306a36Sopenharmony_ci unsigned long size; 112362306a36Sopenharmony_ci 112462306a36Sopenharmony_ci if (!capable(CAP_SYS_ADMIN)) 112562306a36Sopenharmony_ci return -EPERM; 112662306a36Sopenharmony_ci 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_ci for (; start < end; start += size) { 112962306a36Sopenharmony_ci unsigned long pfn; 113062306a36Sopenharmony_ci struct page *page; 113162306a36Sopenharmony_ci int ret; 113262306a36Sopenharmony_ci 113362306a36Sopenharmony_ci ret = get_user_pages_fast(start, 1, 0, &page); 113462306a36Sopenharmony_ci if (ret != 1) 113562306a36Sopenharmony_ci return ret; 113662306a36Sopenharmony_ci pfn = page_to_pfn(page); 113762306a36Sopenharmony_ci 113862306a36Sopenharmony_ci /* 113962306a36Sopenharmony_ci * When soft offlining hugepages, after migrating the page 114062306a36Sopenharmony_ci * we dissolve it, therefore in the second loop "page" will 114162306a36Sopenharmony_ci * no longer be a compound page. 114262306a36Sopenharmony_ci */ 114362306a36Sopenharmony_ci size = page_size(compound_head(page)); 114462306a36Sopenharmony_ci 114562306a36Sopenharmony_ci if (behavior == MADV_SOFT_OFFLINE) { 114662306a36Sopenharmony_ci pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", 114762306a36Sopenharmony_ci pfn, start); 114862306a36Sopenharmony_ci ret = soft_offline_page(pfn, MF_COUNT_INCREASED); 114962306a36Sopenharmony_ci } else { 115062306a36Sopenharmony_ci pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", 115162306a36Sopenharmony_ci pfn, start); 115262306a36Sopenharmony_ci ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); 115362306a36Sopenharmony_ci if (ret == -EOPNOTSUPP) 115462306a36Sopenharmony_ci ret = 0; 115562306a36Sopenharmony_ci } 115662306a36Sopenharmony_ci 115762306a36Sopenharmony_ci if (ret) 115862306a36Sopenharmony_ci return ret; 115962306a36Sopenharmony_ci } 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_ci return 0; 116262306a36Sopenharmony_ci} 116362306a36Sopenharmony_ci#endif 116462306a36Sopenharmony_ci 116562306a36Sopenharmony_cistatic bool 116662306a36Sopenharmony_cimadvise_behavior_valid(int behavior) 116762306a36Sopenharmony_ci{ 116862306a36Sopenharmony_ci switch (behavior) { 116962306a36Sopenharmony_ci case MADV_DOFORK: 117062306a36Sopenharmony_ci case MADV_DONTFORK: 117162306a36Sopenharmony_ci case MADV_NORMAL: 117262306a36Sopenharmony_ci case MADV_SEQUENTIAL: 117362306a36Sopenharmony_ci case MADV_RANDOM: 117462306a36Sopenharmony_ci case MADV_REMOVE: 117562306a36Sopenharmony_ci case MADV_WILLNEED: 117662306a36Sopenharmony_ci case MADV_DONTNEED: 117762306a36Sopenharmony_ci case MADV_DONTNEED_LOCKED: 117862306a36Sopenharmony_ci case MADV_FREE: 117962306a36Sopenharmony_ci case MADV_COLD: 118062306a36Sopenharmony_ci case MADV_PAGEOUT: 118162306a36Sopenharmony_ci case MADV_POPULATE_READ: 118262306a36Sopenharmony_ci case MADV_POPULATE_WRITE: 118362306a36Sopenharmony_ci#ifdef CONFIG_KSM 118462306a36Sopenharmony_ci case MADV_MERGEABLE: 118562306a36Sopenharmony_ci case MADV_UNMERGEABLE: 118662306a36Sopenharmony_ci#endif 118762306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE 118862306a36Sopenharmony_ci case MADV_HUGEPAGE: 118962306a36Sopenharmony_ci case MADV_NOHUGEPAGE: 119062306a36Sopenharmony_ci case MADV_COLLAPSE: 119162306a36Sopenharmony_ci#endif 119262306a36Sopenharmony_ci case MADV_DONTDUMP: 119362306a36Sopenharmony_ci case MADV_DODUMP: 119462306a36Sopenharmony_ci case MADV_WIPEONFORK: 119562306a36Sopenharmony_ci case MADV_KEEPONFORK: 119662306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE 119762306a36Sopenharmony_ci case MADV_SOFT_OFFLINE: 119862306a36Sopenharmony_ci case MADV_HWPOISON: 119962306a36Sopenharmony_ci#endif 120062306a36Sopenharmony_ci return true; 120162306a36Sopenharmony_ci 120262306a36Sopenharmony_ci default: 120362306a36Sopenharmony_ci return false; 120462306a36Sopenharmony_ci } 120562306a36Sopenharmony_ci} 120662306a36Sopenharmony_ci 120762306a36Sopenharmony_cistatic bool process_madvise_behavior_valid(int behavior) 120862306a36Sopenharmony_ci{ 120962306a36Sopenharmony_ci switch (behavior) { 121062306a36Sopenharmony_ci case MADV_COLD: 121162306a36Sopenharmony_ci case MADV_PAGEOUT: 121262306a36Sopenharmony_ci case MADV_WILLNEED: 121362306a36Sopenharmony_ci case MADV_COLLAPSE: 121462306a36Sopenharmony_ci return true; 121562306a36Sopenharmony_ci default: 121662306a36Sopenharmony_ci return false; 121762306a36Sopenharmony_ci } 121862306a36Sopenharmony_ci} 121962306a36Sopenharmony_ci 122062306a36Sopenharmony_ci/* 122162306a36Sopenharmony_ci * Walk the vmas in range [start,end), and call the visit function on each one. 122262306a36Sopenharmony_ci * The visit function will get start and end parameters that cover the overlap 122362306a36Sopenharmony_ci * between the current vma and the original range. Any unmapped regions in the 122462306a36Sopenharmony_ci * original range will result in this function returning -ENOMEM while still 122562306a36Sopenharmony_ci * calling the visit function on all of the existing vmas in the range. 122662306a36Sopenharmony_ci * Must be called with the mmap_lock held for reading or writing. 122762306a36Sopenharmony_ci */ 122862306a36Sopenharmony_cistatic 122962306a36Sopenharmony_ciint madvise_walk_vmas(struct mm_struct *mm, unsigned long start, 123062306a36Sopenharmony_ci unsigned long end, unsigned long arg, 123162306a36Sopenharmony_ci int (*visit)(struct vm_area_struct *vma, 123262306a36Sopenharmony_ci struct vm_area_struct **prev, unsigned long start, 123362306a36Sopenharmony_ci unsigned long end, unsigned long arg)) 123462306a36Sopenharmony_ci{ 123562306a36Sopenharmony_ci struct vm_area_struct *vma; 123662306a36Sopenharmony_ci struct vm_area_struct *prev; 123762306a36Sopenharmony_ci unsigned long tmp; 123862306a36Sopenharmony_ci int unmapped_error = 0; 123962306a36Sopenharmony_ci 124062306a36Sopenharmony_ci /* 124162306a36Sopenharmony_ci * If the interval [start,end) covers some unmapped address 124262306a36Sopenharmony_ci * ranges, just ignore them, but return -ENOMEM at the end. 124362306a36Sopenharmony_ci * - different from the way of handling in mlock etc. 124462306a36Sopenharmony_ci */ 124562306a36Sopenharmony_ci vma = find_vma_prev(mm, start, &prev); 124662306a36Sopenharmony_ci if (vma && start > vma->vm_start) 124762306a36Sopenharmony_ci prev = vma; 124862306a36Sopenharmony_ci 124962306a36Sopenharmony_ci for (;;) { 125062306a36Sopenharmony_ci int error; 125162306a36Sopenharmony_ci 125262306a36Sopenharmony_ci /* Still start < end. */ 125362306a36Sopenharmony_ci if (!vma) 125462306a36Sopenharmony_ci return -ENOMEM; 125562306a36Sopenharmony_ci 125662306a36Sopenharmony_ci /* Here start < (end|vma->vm_end). */ 125762306a36Sopenharmony_ci if (start < vma->vm_start) { 125862306a36Sopenharmony_ci unmapped_error = -ENOMEM; 125962306a36Sopenharmony_ci start = vma->vm_start; 126062306a36Sopenharmony_ci if (start >= end) 126162306a36Sopenharmony_ci break; 126262306a36Sopenharmony_ci } 126362306a36Sopenharmony_ci 126462306a36Sopenharmony_ci /* Here vma->vm_start <= start < (end|vma->vm_end) */ 126562306a36Sopenharmony_ci tmp = vma->vm_end; 126662306a36Sopenharmony_ci if (end < tmp) 126762306a36Sopenharmony_ci tmp = end; 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_ci /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 127062306a36Sopenharmony_ci error = visit(vma, &prev, start, tmp, arg); 127162306a36Sopenharmony_ci if (error) 127262306a36Sopenharmony_ci return error; 127362306a36Sopenharmony_ci start = tmp; 127462306a36Sopenharmony_ci if (prev && start < prev->vm_end) 127562306a36Sopenharmony_ci start = prev->vm_end; 127662306a36Sopenharmony_ci if (start >= end) 127762306a36Sopenharmony_ci break; 127862306a36Sopenharmony_ci if (prev) 127962306a36Sopenharmony_ci vma = find_vma(mm, prev->vm_end); 128062306a36Sopenharmony_ci else /* madvise_remove dropped mmap_lock */ 128162306a36Sopenharmony_ci vma = find_vma(mm, start); 128262306a36Sopenharmony_ci } 128362306a36Sopenharmony_ci 128462306a36Sopenharmony_ci return unmapped_error; 128562306a36Sopenharmony_ci} 128662306a36Sopenharmony_ci 128762306a36Sopenharmony_ci#ifdef CONFIG_ANON_VMA_NAME 128862306a36Sopenharmony_cistatic int madvise_vma_anon_name(struct vm_area_struct *vma, 128962306a36Sopenharmony_ci struct vm_area_struct **prev, 129062306a36Sopenharmony_ci unsigned long start, unsigned long end, 129162306a36Sopenharmony_ci unsigned long anon_name) 129262306a36Sopenharmony_ci{ 129362306a36Sopenharmony_ci int error; 129462306a36Sopenharmony_ci 129562306a36Sopenharmony_ci /* Only anonymous mappings can be named */ 129662306a36Sopenharmony_ci if (vma->vm_file && !vma_is_anon_shmem(vma)) 129762306a36Sopenharmony_ci return -EBADF; 129862306a36Sopenharmony_ci 129962306a36Sopenharmony_ci error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, 130062306a36Sopenharmony_ci (struct anon_vma_name *)anon_name); 130162306a36Sopenharmony_ci 130262306a36Sopenharmony_ci /* 130362306a36Sopenharmony_ci * madvise() returns EAGAIN if kernel resources, such as 130462306a36Sopenharmony_ci * slab, are temporarily unavailable. 130562306a36Sopenharmony_ci */ 130662306a36Sopenharmony_ci if (error == -ENOMEM) 130762306a36Sopenharmony_ci error = -EAGAIN; 130862306a36Sopenharmony_ci return error; 130962306a36Sopenharmony_ci} 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_ciint madvise_set_anon_name(struct mm_struct *mm, unsigned long start, 131262306a36Sopenharmony_ci unsigned long len_in, struct anon_vma_name *anon_name) 131362306a36Sopenharmony_ci{ 131462306a36Sopenharmony_ci unsigned long end; 131562306a36Sopenharmony_ci unsigned long len; 131662306a36Sopenharmony_ci 131762306a36Sopenharmony_ci if (start & ~PAGE_MASK) 131862306a36Sopenharmony_ci return -EINVAL; 131962306a36Sopenharmony_ci len = (len_in + ~PAGE_MASK) & PAGE_MASK; 132062306a36Sopenharmony_ci 132162306a36Sopenharmony_ci /* Check to see whether len was rounded up from small -ve to zero */ 132262306a36Sopenharmony_ci if (len_in && !len) 132362306a36Sopenharmony_ci return -EINVAL; 132462306a36Sopenharmony_ci 132562306a36Sopenharmony_ci end = start + len; 132662306a36Sopenharmony_ci if (end < start) 132762306a36Sopenharmony_ci return -EINVAL; 132862306a36Sopenharmony_ci 132962306a36Sopenharmony_ci if (end == start) 133062306a36Sopenharmony_ci return 0; 133162306a36Sopenharmony_ci 133262306a36Sopenharmony_ci return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, 133362306a36Sopenharmony_ci madvise_vma_anon_name); 133462306a36Sopenharmony_ci} 133562306a36Sopenharmony_ci#endif /* CONFIG_ANON_VMA_NAME */ 133662306a36Sopenharmony_ci/* 133762306a36Sopenharmony_ci * The madvise(2) system call. 133862306a36Sopenharmony_ci * 133962306a36Sopenharmony_ci * Applications can use madvise() to advise the kernel how it should 134062306a36Sopenharmony_ci * handle paging I/O in this VM area. The idea is to help the kernel 134162306a36Sopenharmony_ci * use appropriate read-ahead and caching techniques. The information 134262306a36Sopenharmony_ci * provided is advisory only, and can be safely disregarded by the 134362306a36Sopenharmony_ci * kernel without affecting the correct operation of the application. 134462306a36Sopenharmony_ci * 134562306a36Sopenharmony_ci * behavior values: 134662306a36Sopenharmony_ci * MADV_NORMAL - the default behavior is to read clusters. This 134762306a36Sopenharmony_ci * results in some read-ahead and read-behind. 134862306a36Sopenharmony_ci * MADV_RANDOM - the system should read the minimum amount of data 134962306a36Sopenharmony_ci * on any access, since it is unlikely that the appli- 135062306a36Sopenharmony_ci * cation will need more than what it asks for. 135162306a36Sopenharmony_ci * MADV_SEQUENTIAL - pages in the given range will probably be accessed 135262306a36Sopenharmony_ci * once, so they can be aggressively read ahead, and 135362306a36Sopenharmony_ci * can be freed soon after they are accessed. 135462306a36Sopenharmony_ci * MADV_WILLNEED - the application is notifying the system to read 135562306a36Sopenharmony_ci * some pages ahead. 135662306a36Sopenharmony_ci * MADV_DONTNEED - the application is finished with the given range, 135762306a36Sopenharmony_ci * so the kernel can free resources associated with it. 135862306a36Sopenharmony_ci * MADV_FREE - the application marks pages in the given range as lazy free, 135962306a36Sopenharmony_ci * where actual purges are postponed until memory pressure happens. 136062306a36Sopenharmony_ci * MADV_REMOVE - the application wants to free up the given range of 136162306a36Sopenharmony_ci * pages and associated backing store. 136262306a36Sopenharmony_ci * MADV_DONTFORK - omit this area from child's address space when forking: 136362306a36Sopenharmony_ci * typically, to avoid COWing pages pinned by get_user_pages(). 136462306a36Sopenharmony_ci * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. 136562306a36Sopenharmony_ci * MADV_WIPEONFORK - present the child process with zero-filled memory in this 136662306a36Sopenharmony_ci * range after a fork. 136762306a36Sopenharmony_ci * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK 136862306a36Sopenharmony_ci * MADV_HWPOISON - trigger memory error handler as if the given memory range 136962306a36Sopenharmony_ci * were corrupted by unrecoverable hardware memory failure. 137062306a36Sopenharmony_ci * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. 137162306a36Sopenharmony_ci * MADV_MERGEABLE - the application recommends that KSM try to merge pages in 137262306a36Sopenharmony_ci * this area with pages of identical content from other such areas. 137362306a36Sopenharmony_ci * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. 137462306a36Sopenharmony_ci * MADV_HUGEPAGE - the application wants to back the given range by transparent 137562306a36Sopenharmony_ci * huge pages in the future. Existing pages might be coalesced and 137662306a36Sopenharmony_ci * new pages might be allocated as THP. 137762306a36Sopenharmony_ci * MADV_NOHUGEPAGE - mark the given range as not worth being backed by 137862306a36Sopenharmony_ci * transparent huge pages so the existing pages will not be 137962306a36Sopenharmony_ci * coalesced into THP and new pages will not be allocated as THP. 138062306a36Sopenharmony_ci * MADV_COLLAPSE - synchronously coalesce pages into new THP. 138162306a36Sopenharmony_ci * MADV_DONTDUMP - the application wants to prevent pages in the given range 138262306a36Sopenharmony_ci * from being included in its core dump. 138362306a36Sopenharmony_ci * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 138462306a36Sopenharmony_ci * MADV_COLD - the application is not expected to use this memory soon, 138562306a36Sopenharmony_ci * deactivate pages in this range so that they can be reclaimed 138662306a36Sopenharmony_ci * easily if memory pressure happens. 138762306a36Sopenharmony_ci * MADV_PAGEOUT - the application is not expected to use this memory soon, 138862306a36Sopenharmony_ci * page out the pages in this range immediately. 138962306a36Sopenharmony_ci * MADV_POPULATE_READ - populate (prefault) page tables readable by 139062306a36Sopenharmony_ci * triggering read faults if required 139162306a36Sopenharmony_ci * MADV_POPULATE_WRITE - populate (prefault) page tables writable by 139262306a36Sopenharmony_ci * triggering write faults if required 139362306a36Sopenharmony_ci * 139462306a36Sopenharmony_ci * return values: 139562306a36Sopenharmony_ci * zero - success 139662306a36Sopenharmony_ci * -EINVAL - start + len < 0, start is not page-aligned, 139762306a36Sopenharmony_ci * "behavior" is not a valid value, or application 139862306a36Sopenharmony_ci * is attempting to release locked or shared pages, 139962306a36Sopenharmony_ci * or the specified address range includes file, Huge TLB, 140062306a36Sopenharmony_ci * MAP_SHARED or VMPFNMAP range. 140162306a36Sopenharmony_ci * -ENOMEM - addresses in the specified range are not currently 140262306a36Sopenharmony_ci * mapped, or are outside the AS of the process. 140362306a36Sopenharmony_ci * -EIO - an I/O error occurred while paging in data. 140462306a36Sopenharmony_ci * -EBADF - map exists, but area maps something that isn't a file. 140562306a36Sopenharmony_ci * -EAGAIN - a kernel resource was temporarily unavailable. 140662306a36Sopenharmony_ci */ 140762306a36Sopenharmony_ciint do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) 140862306a36Sopenharmony_ci{ 140962306a36Sopenharmony_ci unsigned long end; 141062306a36Sopenharmony_ci int error; 141162306a36Sopenharmony_ci int write; 141262306a36Sopenharmony_ci size_t len; 141362306a36Sopenharmony_ci struct blk_plug plug; 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci if (!madvise_behavior_valid(behavior)) 141662306a36Sopenharmony_ci return -EINVAL; 141762306a36Sopenharmony_ci 141862306a36Sopenharmony_ci if (!PAGE_ALIGNED(start)) 141962306a36Sopenharmony_ci return -EINVAL; 142062306a36Sopenharmony_ci len = PAGE_ALIGN(len_in); 142162306a36Sopenharmony_ci 142262306a36Sopenharmony_ci /* Check to see whether len was rounded up from small -ve to zero */ 142362306a36Sopenharmony_ci if (len_in && !len) 142462306a36Sopenharmony_ci return -EINVAL; 142562306a36Sopenharmony_ci 142662306a36Sopenharmony_ci end = start + len; 142762306a36Sopenharmony_ci if (end < start) 142862306a36Sopenharmony_ci return -EINVAL; 142962306a36Sopenharmony_ci 143062306a36Sopenharmony_ci if (end == start) 143162306a36Sopenharmony_ci return 0; 143262306a36Sopenharmony_ci 143362306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE 143462306a36Sopenharmony_ci if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 143562306a36Sopenharmony_ci return madvise_inject_error(behavior, start, start + len_in); 143662306a36Sopenharmony_ci#endif 143762306a36Sopenharmony_ci 143862306a36Sopenharmony_ci write = madvise_need_mmap_write(behavior); 143962306a36Sopenharmony_ci if (write) { 144062306a36Sopenharmony_ci if (mmap_write_lock_killable(mm)) 144162306a36Sopenharmony_ci return -EINTR; 144262306a36Sopenharmony_ci } else { 144362306a36Sopenharmony_ci mmap_read_lock(mm); 144462306a36Sopenharmony_ci } 144562306a36Sopenharmony_ci 144662306a36Sopenharmony_ci start = untagged_addr_remote(mm, start); 144762306a36Sopenharmony_ci end = start + len; 144862306a36Sopenharmony_ci 144962306a36Sopenharmony_ci blk_start_plug(&plug); 145062306a36Sopenharmony_ci error = madvise_walk_vmas(mm, start, end, behavior, 145162306a36Sopenharmony_ci madvise_vma_behavior); 145262306a36Sopenharmony_ci blk_finish_plug(&plug); 145362306a36Sopenharmony_ci if (write) 145462306a36Sopenharmony_ci mmap_write_unlock(mm); 145562306a36Sopenharmony_ci else 145662306a36Sopenharmony_ci mmap_read_unlock(mm); 145762306a36Sopenharmony_ci 145862306a36Sopenharmony_ci return error; 145962306a36Sopenharmony_ci} 146062306a36Sopenharmony_ci 146162306a36Sopenharmony_ciSYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 146262306a36Sopenharmony_ci{ 146362306a36Sopenharmony_ci return do_madvise(current->mm, start, len_in, behavior); 146462306a36Sopenharmony_ci} 146562306a36Sopenharmony_ci 146662306a36Sopenharmony_ciSYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, 146762306a36Sopenharmony_ci size_t, vlen, int, behavior, unsigned int, flags) 146862306a36Sopenharmony_ci{ 146962306a36Sopenharmony_ci ssize_t ret; 147062306a36Sopenharmony_ci struct iovec iovstack[UIO_FASTIOV]; 147162306a36Sopenharmony_ci struct iovec *iov = iovstack; 147262306a36Sopenharmony_ci struct iov_iter iter; 147362306a36Sopenharmony_ci struct task_struct *task; 147462306a36Sopenharmony_ci struct mm_struct *mm; 147562306a36Sopenharmony_ci size_t total_len; 147662306a36Sopenharmony_ci unsigned int f_flags; 147762306a36Sopenharmony_ci 147862306a36Sopenharmony_ci if (flags != 0) { 147962306a36Sopenharmony_ci ret = -EINVAL; 148062306a36Sopenharmony_ci goto out; 148162306a36Sopenharmony_ci } 148262306a36Sopenharmony_ci 148362306a36Sopenharmony_ci ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 148462306a36Sopenharmony_ci if (ret < 0) 148562306a36Sopenharmony_ci goto out; 148662306a36Sopenharmony_ci 148762306a36Sopenharmony_ci task = pidfd_get_task(pidfd, &f_flags); 148862306a36Sopenharmony_ci if (IS_ERR(task)) { 148962306a36Sopenharmony_ci ret = PTR_ERR(task); 149062306a36Sopenharmony_ci goto free_iov; 149162306a36Sopenharmony_ci } 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci if (!process_madvise_behavior_valid(behavior)) { 149462306a36Sopenharmony_ci ret = -EINVAL; 149562306a36Sopenharmony_ci goto release_task; 149662306a36Sopenharmony_ci } 149762306a36Sopenharmony_ci 149862306a36Sopenharmony_ci /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ 149962306a36Sopenharmony_ci mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); 150062306a36Sopenharmony_ci if (IS_ERR_OR_NULL(mm)) { 150162306a36Sopenharmony_ci ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; 150262306a36Sopenharmony_ci goto release_task; 150362306a36Sopenharmony_ci } 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_ci /* 150662306a36Sopenharmony_ci * Require CAP_SYS_NICE for influencing process performance. Note that 150762306a36Sopenharmony_ci * only non-destructive hints are currently supported. 150862306a36Sopenharmony_ci */ 150962306a36Sopenharmony_ci if (!capable(CAP_SYS_NICE)) { 151062306a36Sopenharmony_ci ret = -EPERM; 151162306a36Sopenharmony_ci goto release_mm; 151262306a36Sopenharmony_ci } 151362306a36Sopenharmony_ci 151462306a36Sopenharmony_ci total_len = iov_iter_count(&iter); 151562306a36Sopenharmony_ci 151662306a36Sopenharmony_ci while (iov_iter_count(&iter)) { 151762306a36Sopenharmony_ci ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), 151862306a36Sopenharmony_ci iter_iov_len(&iter), behavior); 151962306a36Sopenharmony_ci if (ret < 0) 152062306a36Sopenharmony_ci break; 152162306a36Sopenharmony_ci iov_iter_advance(&iter, iter_iov_len(&iter)); 152262306a36Sopenharmony_ci } 152362306a36Sopenharmony_ci 152462306a36Sopenharmony_ci ret = (total_len - iov_iter_count(&iter)) ? : ret; 152562306a36Sopenharmony_ci 152662306a36Sopenharmony_cirelease_mm: 152762306a36Sopenharmony_ci mmput(mm); 152862306a36Sopenharmony_cirelease_task: 152962306a36Sopenharmony_ci put_task_struct(task); 153062306a36Sopenharmony_cifree_iov: 153162306a36Sopenharmony_ci kfree(iov); 153262306a36Sopenharmony_ciout: 153362306a36Sopenharmony_ci return ret; 153462306a36Sopenharmony_ci} 1535