// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */
862306a36Sopenharmony_ci
962306a36Sopenharmony_ci#include <linux/mman.h>
1062306a36Sopenharmony_ci#include <linux/pagemap.h>
1162306a36Sopenharmony_ci#include <linux/syscalls.h>
1262306a36Sopenharmony_ci#include <linux/mempolicy.h>
1362306a36Sopenharmony_ci#include <linux/page-isolation.h>
1462306a36Sopenharmony_ci#include <linux/page_idle.h>
1562306a36Sopenharmony_ci#include <linux/userfaultfd_k.h>
1662306a36Sopenharmony_ci#include <linux/hugetlb.h>
1762306a36Sopenharmony_ci#include <linux/falloc.h>
1862306a36Sopenharmony_ci#include <linux/fadvise.h>
1962306a36Sopenharmony_ci#include <linux/sched.h>
2062306a36Sopenharmony_ci#include <linux/sched/mm.h>
2162306a36Sopenharmony_ci#include <linux/mm_inline.h>
2262306a36Sopenharmony_ci#include <linux/string.h>
2362306a36Sopenharmony_ci#include <linux/uio.h>
2462306a36Sopenharmony_ci#include <linux/ksm.h>
2562306a36Sopenharmony_ci#include <linux/fs.h>
2662306a36Sopenharmony_ci#include <linux/file.h>
2762306a36Sopenharmony_ci#include <linux/blkdev.h>
2862306a36Sopenharmony_ci#include <linux/backing-dev.h>
2962306a36Sopenharmony_ci#include <linux/pagewalk.h>
3062306a36Sopenharmony_ci#include <linux/swap.h>
3162306a36Sopenharmony_ci#include <linux/swapops.h>
3262306a36Sopenharmony_ci#include <linux/shmem_fs.h>
3362306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
3462306a36Sopenharmony_ci
3562306a36Sopenharmony_ci#include <asm/tlb.h>
3662306a36Sopenharmony_ci
3762306a36Sopenharmony_ci#include "internal.h"
3862306a36Sopenharmony_ci#include "swap.h"
3962306a36Sopenharmony_ci
/*
 * Per-walk state handed to the page-table walker through mm_walk->private
 * by the cold and pageout range helpers.
 */
struct madvise_walk_private {
	struct mmu_gather *tlb;	/* TLB gather batch used during the walk */
	bool pageout;		/* true: reclaim the folios; false: only deactivate them */
};
4462306a36Sopenharmony_ci
/*
 * Tell the caller which mode of mmap_lock a madvise behaviour requires.
 *
 * Any behaviour that ends up changing vma->vm_flags has to hold mmap_lock
 * for writing; behaviours that only traverse the vmas can make do with the
 * read lock.
 *
 * Returns 0 when the read lock is sufficient and 1 when the write lock is
 * needed.
 */
static int madvise_need_mmap_write(int behavior)
{
	/* Be safe: assume the write lock, and list the exceptions explicitly. */
	int need_write = 1;

	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
	case MADV_COLLAPSE:
		need_write = 0;
		break;
	default:
		break;
	}

	return need_write;
}
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci#ifdef CONFIG_ANON_VMA_NAME
7162306a36Sopenharmony_cistruct anon_vma_name *anon_vma_name_alloc(const char *name)
7262306a36Sopenharmony_ci{
7362306a36Sopenharmony_ci	struct anon_vma_name *anon_name;
7462306a36Sopenharmony_ci	size_t count;
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci	/* Add 1 for NUL terminator at the end of the anon_name->name */
7762306a36Sopenharmony_ci	count = strlen(name) + 1;
7862306a36Sopenharmony_ci	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
7962306a36Sopenharmony_ci	if (anon_name) {
8062306a36Sopenharmony_ci		kref_init(&anon_name->kref);
8162306a36Sopenharmony_ci		memcpy(anon_name->name, name, count);
8262306a36Sopenharmony_ci	}
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci	return anon_name;
8562306a36Sopenharmony_ci}
8662306a36Sopenharmony_ci
8762306a36Sopenharmony_civoid anon_vma_name_free(struct kref *kref)
8862306a36Sopenharmony_ci{
8962306a36Sopenharmony_ci	struct anon_vma_name *anon_name =
9062306a36Sopenharmony_ci			container_of(kref, struct anon_vma_name, kref);
9162306a36Sopenharmony_ci	kfree(anon_name);
9262306a36Sopenharmony_ci}
9362306a36Sopenharmony_ci
9462306a36Sopenharmony_cistruct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
9562306a36Sopenharmony_ci{
9662306a36Sopenharmony_ci	mmap_assert_locked(vma->vm_mm);
9762306a36Sopenharmony_ci
9862306a36Sopenharmony_ci	return vma->anon_name;
9962306a36Sopenharmony_ci}
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci/* mmap_lock should be write-locked */
10262306a36Sopenharmony_cistatic int replace_anon_vma_name(struct vm_area_struct *vma,
10362306a36Sopenharmony_ci				 struct anon_vma_name *anon_name)
10462306a36Sopenharmony_ci{
10562306a36Sopenharmony_ci	struct anon_vma_name *orig_name = anon_vma_name(vma);
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	if (!anon_name) {
10862306a36Sopenharmony_ci		vma->anon_name = NULL;
10962306a36Sopenharmony_ci		anon_vma_name_put(orig_name);
11062306a36Sopenharmony_ci		return 0;
11162306a36Sopenharmony_ci	}
11262306a36Sopenharmony_ci
11362306a36Sopenharmony_ci	if (anon_vma_name_eq(orig_name, anon_name))
11462306a36Sopenharmony_ci		return 0;
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci	vma->anon_name = anon_vma_name_reuse(anon_name);
11762306a36Sopenharmony_ci	anon_vma_name_put(orig_name);
11862306a36Sopenharmony_ci
11962306a36Sopenharmony_ci	return 0;
12062306a36Sopenharmony_ci}
12162306a36Sopenharmony_ci#else /* CONFIG_ANON_VMA_NAME */
/*
 * Variant built when CONFIG_ANON_VMA_NAME is not set: a non-NULL @anon_name
 * is rejected with -EINVAL, a NULL one is accepted as a no-op.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	return anon_name ? -EINVAL : 0;
}
13062306a36Sopenharmony_ci#endif /* CONFIG_ANON_VMA_NAME */
13162306a36Sopenharmony_ci/*
13262306a36Sopenharmony_ci * Update the vm_flags on region of a vma, splitting it or merging it as
13362306a36Sopenharmony_ci * necessary.  Must be called with mmap_lock held for writing;
13462306a36Sopenharmony_ci * Caller should ensure anon_name stability by raising its refcount even when
13562306a36Sopenharmony_ci * anon_name belongs to a valid vma because this function might free that vma.
13662306a36Sopenharmony_ci */
13762306a36Sopenharmony_cistatic int madvise_update_vma(struct vm_area_struct *vma,
13862306a36Sopenharmony_ci			      struct vm_area_struct **prev, unsigned long start,
13962306a36Sopenharmony_ci			      unsigned long end, unsigned long new_flags,
14062306a36Sopenharmony_ci			      struct anon_vma_name *anon_name)
14162306a36Sopenharmony_ci{
14262306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
14362306a36Sopenharmony_ci	int error;
14462306a36Sopenharmony_ci	pgoff_t pgoff;
14562306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, start);
14662306a36Sopenharmony_ci
14762306a36Sopenharmony_ci	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
14862306a36Sopenharmony_ci		*prev = vma;
14962306a36Sopenharmony_ci		return 0;
15062306a36Sopenharmony_ci	}
15162306a36Sopenharmony_ci
15262306a36Sopenharmony_ci	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
15362306a36Sopenharmony_ci	*prev = vma_merge(&vmi, mm, *prev, start, end, new_flags,
15462306a36Sopenharmony_ci			  vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
15562306a36Sopenharmony_ci			  vma->vm_userfaultfd_ctx, anon_name);
15662306a36Sopenharmony_ci	if (*prev) {
15762306a36Sopenharmony_ci		vma = *prev;
15862306a36Sopenharmony_ci		goto success;
15962306a36Sopenharmony_ci	}
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	*prev = vma;
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_ci	if (start != vma->vm_start) {
16462306a36Sopenharmony_ci		error = split_vma(&vmi, vma, start, 1);
16562306a36Sopenharmony_ci		if (error)
16662306a36Sopenharmony_ci			return error;
16762306a36Sopenharmony_ci	}
16862306a36Sopenharmony_ci
16962306a36Sopenharmony_ci	if (end != vma->vm_end) {
17062306a36Sopenharmony_ci		error = split_vma(&vmi, vma, end, 0);
17162306a36Sopenharmony_ci		if (error)
17262306a36Sopenharmony_ci			return error;
17362306a36Sopenharmony_ci	}
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_cisuccess:
17662306a36Sopenharmony_ci	/* vm_flags is protected by the mmap_lock held in write mode. */
17762306a36Sopenharmony_ci	vma_start_write(vma);
17862306a36Sopenharmony_ci	vm_flags_reset(vma, new_flags);
17962306a36Sopenharmony_ci	if (!vma->vm_file || vma_is_anon_shmem(vma)) {
18062306a36Sopenharmony_ci		error = replace_anon_vma_name(vma, anon_name);
18162306a36Sopenharmony_ci		if (error)
18262306a36Sopenharmony_ci			return error;
18362306a36Sopenharmony_ci	}
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_ci	return 0;
18662306a36Sopenharmony_ci}
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci#ifdef CONFIG_SWAP
/*
 * pmd_entry callback of swapin_walk_ops.
 *
 * Scans the PTEs covering [start, end) and, for every PTE that holds a real
 * swap entry, starts an asynchronous read of the page into the swap cache
 * via read_swap_cache_async().  walk->private carries the vma.
 *
 * Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
		unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct swap_iocb *splug = NULL;
	pte_t *ptep = NULL;
	spinlock_t *ptl;
	unsigned long addr;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;

		/*
		 * ptep is NULL on the first pass and after every unlock
		 * below: in that case (re)map and lock the PTE for addr.
		 * Otherwise the post-increment simply steps to the next PTE
		 * of the table that is still mapped and locked.
		 */
		if (!ptep++) {
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
			/* No page table to map here: give up on this range. */
			if (!ptep)
				break;
		}

		pte = ptep_get(ptep);
		if (!is_swap_pte(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		/* Skip entries that are not real swap entries. */
		if (unlikely(non_swap_entry(entry)))
			continue;

		/*
		 * Drop the PTE lock before starting the read; ptep is reset
		 * so that the next iteration maps and locks again.
		 */
		pte_unmap_unlock(ptep, ptl);
		ptep = NULL;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, addr, &splug);
		/* Only the read is wanted; release the returned reference. */
		if (page)
			put_page(page);
	}

	/* Still holding the lock if the last PTE examined was skipped. */
	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	swap_read_unplug(splug);
	cond_resched();

	return 0;
}
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_cistatic const struct mm_walk_ops swapin_walk_ops = {
23462306a36Sopenharmony_ci	.pmd_entry		= swapin_walk_pmd_entry,
23562306a36Sopenharmony_ci	.walk_lock		= PGWALK_RDLOCK,
23662306a36Sopenharmony_ci};
23762306a36Sopenharmony_ci
/*
 * Start asynchronous swap-in for the swapped-out pages of a shmem mapping.
 *
 * Iterates over the entries of @mapping->i_pages whose indices correspond to
 * [start, end) of @vma and calls read_swap_cache_async() for every entry
 * that decodes to a real swap entry.
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	/* Last index to visit (inclusive): @end itself is excluded. */
	pgoff_t end_index = linear_page_index(vma, end) - 1;
	struct page *page;
	struct swap_iocb *splug = NULL;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		unsigned long addr;
		swp_entry_t entry;

		/* Only value entries are decoded as swap entries below. */
		if (!xa_is_value(page))
			continue;
		entry = radix_to_swp_entry(page);
		/* There might be swapin error entries in shmem mapping. */
		if (non_swap_entry(entry))
			continue;

		/* Virtual address in @vma that maps this page-cache index. */
		addr = vma->vm_start +
			((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
		/*
		 * Pause the iteration and leave the RCU read-side section
		 * around the call that starts the read.
		 */
		xas_pause(&xas);
		rcu_read_unlock();

		page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					     vma, addr, &splug);
		/* Only the read is wanted; release the returned reference. */
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();
	swap_read_unplug(splug);
}
27462306a36Sopenharmony_ci#endif		/* CONFIG_SWAP */
27562306a36Sopenharmony_ci
/*
 * Schedule all required I/O operations.  Do not wait for completion.
 *
 * What is done for [start, end) of @vma depends on its backing:
 *  - no file: with CONFIG_SWAP, walk the page tables with swapin_walk_ops;
 *    without CONFIG_SWAP, fail with -EBADF;
 *  - shmem mapping (CONFIG_SWAP only): shmem_swapin_range();
 *  - DAX inode: the advice is ignored;
 *  - any other file: vfs_fadvise(POSIX_FADV_WILLNEED) on the matching file
 *    range, with mmap_lock dropped around the call.
 *
 * *@prev is set to @vma, except on the vfs_fadvise() path where it is set to
 * NULL to signal that mmap_lock was dropped and retaken.
 *
 * Returns 0, or -EBADF as described above.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	/* File offset of @start; computed while the vma is still stable. */
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}
32662306a36Sopenharmony_ci
32762306a36Sopenharmony_cistatic inline bool can_do_file_pageout(struct vm_area_struct *vma)
32862306a36Sopenharmony_ci{
32962306a36Sopenharmony_ci	if (!vma->vm_file)
33062306a36Sopenharmony_ci		return false;
33162306a36Sopenharmony_ci	/*
33262306a36Sopenharmony_ci	 * paging out pagecache only for non-anonymous mappings that correspond
33362306a36Sopenharmony_ci	 * to the files the calling process could (if tried) open for writing;
33462306a36Sopenharmony_ci	 * otherwise we'd be including shared non-exclusive mappings, which
33562306a36Sopenharmony_ci	 * opens a side channel.
33662306a36Sopenharmony_ci	 */
33762306a36Sopenharmony_ci	return inode_owner_or_capable(&nop_mnt_idmap,
33862306a36Sopenharmony_ci				      file_inode(vma->vm_file)) ||
33962306a36Sopenharmony_ci	       file_permission(vma->vm_file, MAY_WRITE) == 0;
34062306a36Sopenharmony_ci}
34162306a36Sopenharmony_ci
/*
 * pmd_entry callback of cold_walk_ops, shared by the cold and the pageout
 * walks (selected by madvise_walk_private.pageout in walk->private).
 *
 * For each eligible folio mapped in [addr, end) the young/referenced state
 * is cleared; the folio is then either deactivated (cold) or isolated from
 * the LRU and handed to reclaim_pages() (pageout).  Folios that appear to be
 * mapped elsewhere are left alone.
 *
 * Returns -EINTR if a fatal signal is pending for the current task, and 0
 * otherwise.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_pte, *pte, ptent;
	spinlock_t *ptl;
	struct folio *folio = NULL;
	/* Folios isolated for pageout; handed to reclaim_pages() at the end. */
	LIST_HEAD(folio_list);
	bool pageout_anon_only_filter;

	if (fatal_signal_pending(current))
		return -EINTR;

	/*
	 * When paging out a non-anonymous vma whose file the caller may not
	 * page out, restrict the operation to anonymous folios.
	 */
	pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
					!can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		folio = pfn_folio(pmd_pfn(orig_pmd));

		/* Do not interfere with other mappings of this folio */
		if (folio_estimated_sharers(folio) != 1)
			goto huge_unlock;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			goto huge_unlock;

		/*
		 * The range covers only part of the huge pmd: try to split
		 * the folio and, if that works, process it PTE by PTE below.
		 */
		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			/* Hold a reference across the unlocked split. */
			folio_get(folio);
			spin_unlock(ptl);
			folio_lock(folio);
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (!err)
				goto regular_folio;
			return 0;
		}

		/* Clear the accessed bit of the huge pmd. */
		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				/* Unevictable folios go straight back. */
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
		return 0;
	}

regular_folio:
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		/*
		 * Creating a THP page is expensive so split it only if we
		 * are sure it's worth. Split it if we are only owner.
		 *
		 * Every "break" below abandons the rest of this pmd range.
		 */
		if (folio_test_large(folio)) {
			int err;

			if (folio_estimated_sharers(folio) != 1)
				break;
			if (pageout_anon_only_filter && !folio_test_anon(folio))
				break;
			if (!folio_trylock(folio))
				break;
			folio_get(folio);
			/* The split runs without the PTE lock held. */
			arch_leave_lazy_mmu_mode();
			pte_unmap_unlock(start_pte, ptl);
			/* NULL tells the exit path there is nothing to unlock. */
			start_pte = NULL;
			err = split_folio(folio);
			folio_unlock(folio);
			folio_put(folio);
			if (err)
				break;
			start_pte = pte =
				pte_offset_map_lock(mm, pmd, addr, &ptl);
			if (!start_pte)
				break;
			arch_enter_lazy_mmu_mode();
			/*
			 * Step back one entry so that the loop increment
			 * revisits this same address after the split.
			 */
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this folio and
		 * non-LRU folio.
		 */
		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
			continue;

		if (pageout_anon_only_filter && !folio_test_anon(folio))
			continue;

		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);

		/* Clear the accessed bit of the PTE. */
		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a folio to speed up its reclaim.
		 * The VM cannot reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		folio_clear_referenced(folio);
		folio_test_clear_young(folio);
		if (folio_test_active(folio))
			folio_set_workingset(folio);
		if (pageout) {
			if (folio_isolate_lru(folio)) {
				/* Unevictable folios go straight back. */
				if (folio_test_unevictable(folio))
					folio_putback_lru(folio);
				else
					list_add(&folio->lru, &folio_list);
			}
		} else
			folio_deactivate(folio);
	}

	/* start_pte is NULL if the loop ended with the PTE lock dropped. */
	if (start_pte) {
		arch_leave_lazy_mmu_mode();
		pte_unmap_unlock(start_pte, ptl);
	}
	if (pageout)
		reclaim_pages(&folio_list);
	cond_resched();

	return 0;
}
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_cistatic const struct mm_walk_ops cold_walk_ops = {
54062306a36Sopenharmony_ci	.pmd_entry = madvise_cold_or_pageout_pte_range,
54162306a36Sopenharmony_ci	.walk_lock = PGWALK_RDLOCK,
54262306a36Sopenharmony_ci};
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_cistatic void madvise_cold_page_range(struct mmu_gather *tlb,
54562306a36Sopenharmony_ci			     struct vm_area_struct *vma,
54662306a36Sopenharmony_ci			     unsigned long addr, unsigned long end)
54762306a36Sopenharmony_ci{
54862306a36Sopenharmony_ci	struct madvise_walk_private walk_private = {
54962306a36Sopenharmony_ci		.pageout = false,
55062306a36Sopenharmony_ci		.tlb = tlb,
55162306a36Sopenharmony_ci	};
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci	tlb_start_vma(tlb, vma);
55462306a36Sopenharmony_ci	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
55562306a36Sopenharmony_ci	tlb_end_vma(tlb, vma);
55662306a36Sopenharmony_ci}
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_cistatic inline bool can_madv_lru_vma(struct vm_area_struct *vma)
55962306a36Sopenharmony_ci{
56062306a36Sopenharmony_ci	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
56162306a36Sopenharmony_ci}
56262306a36Sopenharmony_ci
56362306a36Sopenharmony_cistatic long madvise_cold(struct vm_area_struct *vma,
56462306a36Sopenharmony_ci			struct vm_area_struct **prev,
56562306a36Sopenharmony_ci			unsigned long start_addr, unsigned long end_addr)
56662306a36Sopenharmony_ci{
56762306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
56862306a36Sopenharmony_ci	struct mmu_gather tlb;
56962306a36Sopenharmony_ci
57062306a36Sopenharmony_ci	*prev = vma;
57162306a36Sopenharmony_ci	if (!can_madv_lru_vma(vma))
57262306a36Sopenharmony_ci		return -EINVAL;
57362306a36Sopenharmony_ci
57462306a36Sopenharmony_ci	lru_add_drain();
57562306a36Sopenharmony_ci	tlb_gather_mmu(&tlb, mm);
57662306a36Sopenharmony_ci	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
57762306a36Sopenharmony_ci	tlb_finish_mmu(&tlb);
57862306a36Sopenharmony_ci
57962306a36Sopenharmony_ci	return 0;
58062306a36Sopenharmony_ci}
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_cistatic void madvise_pageout_page_range(struct mmu_gather *tlb,
58362306a36Sopenharmony_ci			     struct vm_area_struct *vma,
58462306a36Sopenharmony_ci			     unsigned long addr, unsigned long end)
58562306a36Sopenharmony_ci{
58662306a36Sopenharmony_ci	struct madvise_walk_private walk_private = {
58762306a36Sopenharmony_ci		.pageout = true,
58862306a36Sopenharmony_ci		.tlb = tlb,
58962306a36Sopenharmony_ci	};
59062306a36Sopenharmony_ci
59162306a36Sopenharmony_ci	tlb_start_vma(tlb, vma);
59262306a36Sopenharmony_ci	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
59362306a36Sopenharmony_ci	tlb_end_vma(tlb, vma);
59462306a36Sopenharmony_ci}
59562306a36Sopenharmony_ci
59662306a36Sopenharmony_cistatic long madvise_pageout(struct vm_area_struct *vma,
59762306a36Sopenharmony_ci			struct vm_area_struct **prev,
59862306a36Sopenharmony_ci			unsigned long start_addr, unsigned long end_addr)
59962306a36Sopenharmony_ci{
60062306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
60162306a36Sopenharmony_ci	struct mmu_gather tlb;
60262306a36Sopenharmony_ci
60362306a36Sopenharmony_ci	*prev = vma;
60462306a36Sopenharmony_ci	if (!can_madv_lru_vma(vma))
60562306a36Sopenharmony_ci		return -EINVAL;
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci	/*
60862306a36Sopenharmony_ci	 * If the VMA belongs to a private file mapping, there can be private
60962306a36Sopenharmony_ci	 * dirty pages which can be paged out if even this process is neither
61062306a36Sopenharmony_ci	 * owner nor write capable of the file. We allow private file mappings
61162306a36Sopenharmony_ci	 * further to pageout dirty anon pages.
61262306a36Sopenharmony_ci	 */
61362306a36Sopenharmony_ci	if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
61462306a36Sopenharmony_ci				(vma->vm_flags & VM_MAYSHARE)))
61562306a36Sopenharmony_ci		return 0;
61662306a36Sopenharmony_ci
61762306a36Sopenharmony_ci	lru_add_drain();
61862306a36Sopenharmony_ci	tlb_gather_mmu(&tlb, mm);
61962306a36Sopenharmony_ci	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
62062306a36Sopenharmony_ci	tlb_finish_mmu(&tlb);
62162306a36Sopenharmony_ci
62262306a36Sopenharmony_ci	return 0;
62362306a36Sopenharmony_ci}
62462306a36Sopenharmony_ci
62562306a36Sopenharmony_cistatic int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
62662306a36Sopenharmony_ci				unsigned long end, struct mm_walk *walk)
62762306a36Sopenharmony_ci
62862306a36Sopenharmony_ci{
62962306a36Sopenharmony_ci	struct mmu_gather *tlb = walk->private;
63062306a36Sopenharmony_ci	struct mm_struct *mm = tlb->mm;
63162306a36Sopenharmony_ci	struct vm_area_struct *vma = walk->vma;
63262306a36Sopenharmony_ci	spinlock_t *ptl;
63362306a36Sopenharmony_ci	pte_t *start_pte, *pte, ptent;
63462306a36Sopenharmony_ci	struct folio *folio;
63562306a36Sopenharmony_ci	int nr_swap = 0;
63662306a36Sopenharmony_ci	unsigned long next;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	next = pmd_addr_end(addr, end);
63962306a36Sopenharmony_ci	if (pmd_trans_huge(*pmd))
64062306a36Sopenharmony_ci		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
64162306a36Sopenharmony_ci			return 0;
64262306a36Sopenharmony_ci
64362306a36Sopenharmony_ci	tlb_change_page_size(tlb, PAGE_SIZE);
64462306a36Sopenharmony_ci	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
64562306a36Sopenharmony_ci	if (!start_pte)
64662306a36Sopenharmony_ci		return 0;
64762306a36Sopenharmony_ci	flush_tlb_batched_pending(mm);
64862306a36Sopenharmony_ci	arch_enter_lazy_mmu_mode();
64962306a36Sopenharmony_ci	for (; addr != end; pte++, addr += PAGE_SIZE) {
65062306a36Sopenharmony_ci		ptent = ptep_get(pte);
65162306a36Sopenharmony_ci
65262306a36Sopenharmony_ci		if (pte_none(ptent))
65362306a36Sopenharmony_ci			continue;
65462306a36Sopenharmony_ci		/*
65562306a36Sopenharmony_ci		 * If the pte has swp_entry, just clear page table to
65662306a36Sopenharmony_ci		 * prevent swap-in which is more expensive rather than
65762306a36Sopenharmony_ci		 * (page allocation + zeroing).
65862306a36Sopenharmony_ci		 */
65962306a36Sopenharmony_ci		if (!pte_present(ptent)) {
66062306a36Sopenharmony_ci			swp_entry_t entry;
66162306a36Sopenharmony_ci
66262306a36Sopenharmony_ci			entry = pte_to_swp_entry(ptent);
66362306a36Sopenharmony_ci			if (!non_swap_entry(entry)) {
66462306a36Sopenharmony_ci				nr_swap--;
66562306a36Sopenharmony_ci				free_swap_and_cache(entry);
66662306a36Sopenharmony_ci				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
66762306a36Sopenharmony_ci			} else if (is_hwpoison_entry(entry) ||
66862306a36Sopenharmony_ci				   is_poisoned_swp_entry(entry)) {
66962306a36Sopenharmony_ci				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
67062306a36Sopenharmony_ci			}
67162306a36Sopenharmony_ci			continue;
67262306a36Sopenharmony_ci		}
67362306a36Sopenharmony_ci
67462306a36Sopenharmony_ci		folio = vm_normal_folio(vma, addr, ptent);
67562306a36Sopenharmony_ci		if (!folio || folio_is_zone_device(folio))
67662306a36Sopenharmony_ci			continue;
67762306a36Sopenharmony_ci
67862306a36Sopenharmony_ci		/*
67962306a36Sopenharmony_ci		 * If pmd isn't transhuge but the folio is large and
68062306a36Sopenharmony_ci		 * is owned by only this process, split it and
68162306a36Sopenharmony_ci		 * deactivate all pages.
68262306a36Sopenharmony_ci		 */
68362306a36Sopenharmony_ci		if (folio_test_large(folio)) {
68462306a36Sopenharmony_ci			int err;
68562306a36Sopenharmony_ci
68662306a36Sopenharmony_ci			if (folio_estimated_sharers(folio) != 1)
68762306a36Sopenharmony_ci				break;
68862306a36Sopenharmony_ci			if (!folio_trylock(folio))
68962306a36Sopenharmony_ci				break;
69062306a36Sopenharmony_ci			folio_get(folio);
69162306a36Sopenharmony_ci			arch_leave_lazy_mmu_mode();
69262306a36Sopenharmony_ci			pte_unmap_unlock(start_pte, ptl);
69362306a36Sopenharmony_ci			start_pte = NULL;
69462306a36Sopenharmony_ci			err = split_folio(folio);
69562306a36Sopenharmony_ci			folio_unlock(folio);
69662306a36Sopenharmony_ci			folio_put(folio);
69762306a36Sopenharmony_ci			if (err)
69862306a36Sopenharmony_ci				break;
69962306a36Sopenharmony_ci			start_pte = pte =
70062306a36Sopenharmony_ci				pte_offset_map_lock(mm, pmd, addr, &ptl);
70162306a36Sopenharmony_ci			if (!start_pte)
70262306a36Sopenharmony_ci				break;
70362306a36Sopenharmony_ci			arch_enter_lazy_mmu_mode();
70462306a36Sopenharmony_ci			pte--;
70562306a36Sopenharmony_ci			addr -= PAGE_SIZE;
70662306a36Sopenharmony_ci			continue;
70762306a36Sopenharmony_ci		}
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci		if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
71062306a36Sopenharmony_ci			if (!folio_trylock(folio))
71162306a36Sopenharmony_ci				continue;
71262306a36Sopenharmony_ci			/*
71362306a36Sopenharmony_ci			 * If folio is shared with others, we mustn't clear
71462306a36Sopenharmony_ci			 * the folio's dirty flag.
71562306a36Sopenharmony_ci			 */
71662306a36Sopenharmony_ci			if (folio_mapcount(folio) != 1) {
71762306a36Sopenharmony_ci				folio_unlock(folio);
71862306a36Sopenharmony_ci				continue;
71962306a36Sopenharmony_ci			}
72062306a36Sopenharmony_ci
72162306a36Sopenharmony_ci			if (folio_test_swapcache(folio) &&
72262306a36Sopenharmony_ci			    !folio_free_swap(folio)) {
72362306a36Sopenharmony_ci				folio_unlock(folio);
72462306a36Sopenharmony_ci				continue;
72562306a36Sopenharmony_ci			}
72662306a36Sopenharmony_ci
72762306a36Sopenharmony_ci			folio_clear_dirty(folio);
72862306a36Sopenharmony_ci			folio_unlock(folio);
72962306a36Sopenharmony_ci		}
73062306a36Sopenharmony_ci
73162306a36Sopenharmony_ci		if (pte_young(ptent) || pte_dirty(ptent)) {
73262306a36Sopenharmony_ci			/*
73362306a36Sopenharmony_ci			 * Some of architecture(ex, PPC) don't update TLB
73462306a36Sopenharmony_ci			 * with set_pte_at and tlb_remove_tlb_entry so for
73562306a36Sopenharmony_ci			 * the portability, remap the pte with old|clean
73662306a36Sopenharmony_ci			 * after pte clearing.
73762306a36Sopenharmony_ci			 */
73862306a36Sopenharmony_ci			ptent = ptep_get_and_clear_full(mm, addr, pte,
73962306a36Sopenharmony_ci							tlb->fullmm);
74062306a36Sopenharmony_ci
74162306a36Sopenharmony_ci			ptent = pte_mkold(ptent);
74262306a36Sopenharmony_ci			ptent = pte_mkclean(ptent);
74362306a36Sopenharmony_ci			set_pte_at(mm, addr, pte, ptent);
74462306a36Sopenharmony_ci			tlb_remove_tlb_entry(tlb, pte, addr);
74562306a36Sopenharmony_ci		}
74662306a36Sopenharmony_ci		folio_mark_lazyfree(folio);
74762306a36Sopenharmony_ci	}
74862306a36Sopenharmony_ci
74962306a36Sopenharmony_ci	if (nr_swap) {
75062306a36Sopenharmony_ci		if (current->mm == mm)
75162306a36Sopenharmony_ci			sync_mm_rss(mm);
75262306a36Sopenharmony_ci		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
75362306a36Sopenharmony_ci	}
75462306a36Sopenharmony_ci	if (start_pte) {
75562306a36Sopenharmony_ci		arch_leave_lazy_mmu_mode();
75662306a36Sopenharmony_ci		pte_unmap_unlock(start_pte, ptl);
75762306a36Sopenharmony_ci	}
75862306a36Sopenharmony_ci	cond_resched();
75962306a36Sopenharmony_ci
76062306a36Sopenharmony_ci	return 0;
76162306a36Sopenharmony_ci}
76262306a36Sopenharmony_ci
76362306a36Sopenharmony_cistatic const struct mm_walk_ops madvise_free_walk_ops = {
76462306a36Sopenharmony_ci	.pmd_entry		= madvise_free_pte_range,
76562306a36Sopenharmony_ci	.walk_lock		= PGWALK_RDLOCK,
76662306a36Sopenharmony_ci};
76762306a36Sopenharmony_ci
76862306a36Sopenharmony_cistatic int madvise_free_single_vma(struct vm_area_struct *vma,
76962306a36Sopenharmony_ci			unsigned long start_addr, unsigned long end_addr)
77062306a36Sopenharmony_ci{
77162306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
77262306a36Sopenharmony_ci	struct mmu_notifier_range range;
77362306a36Sopenharmony_ci	struct mmu_gather tlb;
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci	/* MADV_FREE works for only anon vma at the moment */
77662306a36Sopenharmony_ci	if (!vma_is_anonymous(vma))
77762306a36Sopenharmony_ci		return -EINVAL;
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci	range.start = max(vma->vm_start, start_addr);
78062306a36Sopenharmony_ci	if (range.start >= vma->vm_end)
78162306a36Sopenharmony_ci		return -EINVAL;
78262306a36Sopenharmony_ci	range.end = min(vma->vm_end, end_addr);
78362306a36Sopenharmony_ci	if (range.end <= vma->vm_start)
78462306a36Sopenharmony_ci		return -EINVAL;
78562306a36Sopenharmony_ci	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
78662306a36Sopenharmony_ci				range.start, range.end);
78762306a36Sopenharmony_ci
78862306a36Sopenharmony_ci	lru_add_drain();
78962306a36Sopenharmony_ci	tlb_gather_mmu(&tlb, mm);
79062306a36Sopenharmony_ci	update_hiwater_rss(mm);
79162306a36Sopenharmony_ci
79262306a36Sopenharmony_ci	mmu_notifier_invalidate_range_start(&range);
79362306a36Sopenharmony_ci	tlb_start_vma(&tlb, vma);
79462306a36Sopenharmony_ci	walk_page_range(vma->vm_mm, range.start, range.end,
79562306a36Sopenharmony_ci			&madvise_free_walk_ops, &tlb);
79662306a36Sopenharmony_ci	tlb_end_vma(&tlb, vma);
79762306a36Sopenharmony_ci	mmu_notifier_invalidate_range_end(&range);
79862306a36Sopenharmony_ci	tlb_finish_mmu(&tlb);
79962306a36Sopenharmony_ci
80062306a36Sopenharmony_ci	return 0;
80162306a36Sopenharmony_ci}
80262306a36Sopenharmony_ci
80362306a36Sopenharmony_ci/*
80462306a36Sopenharmony_ci * Application no longer needs these pages.  If the pages are dirty,
80562306a36Sopenharmony_ci * it's OK to just throw them away.  The app will be more careful about
80662306a36Sopenharmony_ci * data it wants to keep.  Be sure to free swap resources too.  The
80762306a36Sopenharmony_ci * zap_page_range_single call sets things up for shrink_active_list to actually
80862306a36Sopenharmony_ci * free these pages later if no one else has touched them in the meantime,
80962306a36Sopenharmony_ci * although we could add these pages to a global reuse list for
81062306a36Sopenharmony_ci * shrink_active_list to pick up before reclaiming other pages.
81162306a36Sopenharmony_ci *
81262306a36Sopenharmony_ci * NB: This interface discards data rather than pushes it out to swap,
81362306a36Sopenharmony_ci * as some implementations do.  This has performance implications for
81462306a36Sopenharmony_ci * applications like large transactional databases which want to discard
81562306a36Sopenharmony_ci * pages in anonymous maps after committing to backing store the data
81662306a36Sopenharmony_ci * that was kept in them.  There is no reason to write this data out to
81762306a36Sopenharmony_ci * the swap area if the application is discarding it.
81862306a36Sopenharmony_ci *
81962306a36Sopenharmony_ci * An interface that causes the system to free clean pages and flush
82062306a36Sopenharmony_ci * dirty pages is already available as msync(MS_INVALIDATE).
82162306a36Sopenharmony_ci */
82262306a36Sopenharmony_cistatic long madvise_dontneed_single_vma(struct vm_area_struct *vma,
82362306a36Sopenharmony_ci					unsigned long start, unsigned long end)
82462306a36Sopenharmony_ci{
82562306a36Sopenharmony_ci	zap_page_range_single(vma, start, end - start, NULL);
82662306a36Sopenharmony_ci	return 0;
82762306a36Sopenharmony_ci}
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_cistatic bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
83062306a36Sopenharmony_ci					    unsigned long start,
83162306a36Sopenharmony_ci					    unsigned long *end,
83262306a36Sopenharmony_ci					    int behavior)
83362306a36Sopenharmony_ci{
83462306a36Sopenharmony_ci	if (!is_vm_hugetlb_page(vma)) {
83562306a36Sopenharmony_ci		unsigned int forbidden = VM_PFNMAP;
83662306a36Sopenharmony_ci
83762306a36Sopenharmony_ci		if (behavior != MADV_DONTNEED_LOCKED)
83862306a36Sopenharmony_ci			forbidden |= VM_LOCKED;
83962306a36Sopenharmony_ci
84062306a36Sopenharmony_ci		return !(vma->vm_flags & forbidden);
84162306a36Sopenharmony_ci	}
84262306a36Sopenharmony_ci
84362306a36Sopenharmony_ci	if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
84462306a36Sopenharmony_ci		return false;
84562306a36Sopenharmony_ci	if (start & ~huge_page_mask(hstate_vma(vma)))
84662306a36Sopenharmony_ci		return false;
84762306a36Sopenharmony_ci
84862306a36Sopenharmony_ci	/*
84962306a36Sopenharmony_ci	 * Madvise callers expect the length to be rounded up to PAGE_SIZE
85062306a36Sopenharmony_ci	 * boundaries, and may be unaware that this VMA uses huge pages.
85162306a36Sopenharmony_ci	 * Avoid unexpected data loss by rounding down the number of
85262306a36Sopenharmony_ci	 * huge pages freed.
85362306a36Sopenharmony_ci	 */
85462306a36Sopenharmony_ci	*end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
85562306a36Sopenharmony_ci
85662306a36Sopenharmony_ci	return true;
85762306a36Sopenharmony_ci}
85862306a36Sopenharmony_ci
85962306a36Sopenharmony_cistatic long madvise_dontneed_free(struct vm_area_struct *vma,
86062306a36Sopenharmony_ci				  struct vm_area_struct **prev,
86162306a36Sopenharmony_ci				  unsigned long start, unsigned long end,
86262306a36Sopenharmony_ci				  int behavior)
86362306a36Sopenharmony_ci{
86462306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
86562306a36Sopenharmony_ci
86662306a36Sopenharmony_ci	*prev = vma;
86762306a36Sopenharmony_ci	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
86862306a36Sopenharmony_ci		return -EINVAL;
86962306a36Sopenharmony_ci
87062306a36Sopenharmony_ci	if (start == end)
87162306a36Sopenharmony_ci		return 0;
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci	if (!userfaultfd_remove(vma, start, end)) {
87462306a36Sopenharmony_ci		*prev = NULL; /* mmap_lock has been dropped, prev is stale */
87562306a36Sopenharmony_ci
87662306a36Sopenharmony_ci		mmap_read_lock(mm);
87762306a36Sopenharmony_ci		vma = vma_lookup(mm, start);
87862306a36Sopenharmony_ci		if (!vma)
87962306a36Sopenharmony_ci			return -ENOMEM;
88062306a36Sopenharmony_ci		/*
88162306a36Sopenharmony_ci		 * Potential end adjustment for hugetlb vma is OK as
88262306a36Sopenharmony_ci		 * the check below keeps end within vma.
88362306a36Sopenharmony_ci		 */
88462306a36Sopenharmony_ci		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
88562306a36Sopenharmony_ci						     behavior))
88662306a36Sopenharmony_ci			return -EINVAL;
88762306a36Sopenharmony_ci		if (end > vma->vm_end) {
88862306a36Sopenharmony_ci			/*
88962306a36Sopenharmony_ci			 * Don't fail if end > vma->vm_end. If the old
89062306a36Sopenharmony_ci			 * vma was split while the mmap_lock was
89162306a36Sopenharmony_ci			 * released the effect of the concurrent
89262306a36Sopenharmony_ci			 * operation may not cause madvise() to
89362306a36Sopenharmony_ci			 * have an undefined result. There may be an
89462306a36Sopenharmony_ci			 * adjacent next vma that we'll walk
89562306a36Sopenharmony_ci			 * next. userfaultfd_remove() will generate an
89662306a36Sopenharmony_ci			 * UFFD_EVENT_REMOVE repetition on the
89762306a36Sopenharmony_ci			 * end-vma->vm_end range, but the manager can
89862306a36Sopenharmony_ci			 * handle a repetition fine.
89962306a36Sopenharmony_ci			 */
90062306a36Sopenharmony_ci			end = vma->vm_end;
90162306a36Sopenharmony_ci		}
90262306a36Sopenharmony_ci		VM_WARN_ON(start >= end);
90362306a36Sopenharmony_ci	}
90462306a36Sopenharmony_ci
90562306a36Sopenharmony_ci	if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
90662306a36Sopenharmony_ci		return madvise_dontneed_single_vma(vma, start, end);
90762306a36Sopenharmony_ci	else if (behavior == MADV_FREE)
90862306a36Sopenharmony_ci		return madvise_free_single_vma(vma, start, end);
90962306a36Sopenharmony_ci	else
91062306a36Sopenharmony_ci		return -EINVAL;
91162306a36Sopenharmony_ci}
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_cistatic long madvise_populate(struct vm_area_struct *vma,
91462306a36Sopenharmony_ci			     struct vm_area_struct **prev,
91562306a36Sopenharmony_ci			     unsigned long start, unsigned long end,
91662306a36Sopenharmony_ci			     int behavior)
91762306a36Sopenharmony_ci{
91862306a36Sopenharmony_ci	const bool write = behavior == MADV_POPULATE_WRITE;
91962306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
92062306a36Sopenharmony_ci	unsigned long tmp_end;
92162306a36Sopenharmony_ci	int locked = 1;
92262306a36Sopenharmony_ci	long pages;
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_ci	*prev = vma;
92562306a36Sopenharmony_ci
92662306a36Sopenharmony_ci	while (start < end) {
92762306a36Sopenharmony_ci		/*
92862306a36Sopenharmony_ci		 * We might have temporarily dropped the lock. For example,
92962306a36Sopenharmony_ci		 * our VMA might have been split.
93062306a36Sopenharmony_ci		 */
93162306a36Sopenharmony_ci		if (!vma || start >= vma->vm_end) {
93262306a36Sopenharmony_ci			vma = vma_lookup(mm, start);
93362306a36Sopenharmony_ci			if (!vma)
93462306a36Sopenharmony_ci				return -ENOMEM;
93562306a36Sopenharmony_ci		}
93662306a36Sopenharmony_ci
93762306a36Sopenharmony_ci		tmp_end = min_t(unsigned long, end, vma->vm_end);
93862306a36Sopenharmony_ci		/* Populate (prefault) page tables readable/writable. */
93962306a36Sopenharmony_ci		pages = faultin_vma_page_range(vma, start, tmp_end, write,
94062306a36Sopenharmony_ci					       &locked);
94162306a36Sopenharmony_ci		if (!locked) {
94262306a36Sopenharmony_ci			mmap_read_lock(mm);
94362306a36Sopenharmony_ci			locked = 1;
94462306a36Sopenharmony_ci			*prev = NULL;
94562306a36Sopenharmony_ci			vma = NULL;
94662306a36Sopenharmony_ci		}
94762306a36Sopenharmony_ci		if (pages < 0) {
94862306a36Sopenharmony_ci			switch (pages) {
94962306a36Sopenharmony_ci			case -EINTR:
95062306a36Sopenharmony_ci				return -EINTR;
95162306a36Sopenharmony_ci			case -EINVAL: /* Incompatible mappings / permissions. */
95262306a36Sopenharmony_ci				return -EINVAL;
95362306a36Sopenharmony_ci			case -EHWPOISON:
95462306a36Sopenharmony_ci				return -EHWPOISON;
95562306a36Sopenharmony_ci			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
95662306a36Sopenharmony_ci				return -EFAULT;
95762306a36Sopenharmony_ci			default:
95862306a36Sopenharmony_ci				pr_warn_once("%s: unhandled return value: %ld\n",
95962306a36Sopenharmony_ci					     __func__, pages);
96062306a36Sopenharmony_ci				fallthrough;
96162306a36Sopenharmony_ci			case -ENOMEM:
96262306a36Sopenharmony_ci				return -ENOMEM;
96362306a36Sopenharmony_ci			}
96462306a36Sopenharmony_ci		}
96562306a36Sopenharmony_ci		start += pages * PAGE_SIZE;
96662306a36Sopenharmony_ci	}
96762306a36Sopenharmony_ci	return 0;
96862306a36Sopenharmony_ci}
96962306a36Sopenharmony_ci
97062306a36Sopenharmony_ci/*
97162306a36Sopenharmony_ci * Application wants to free up the pages and associated backing store.
97262306a36Sopenharmony_ci * This is effectively punching a hole into the middle of a file.
97362306a36Sopenharmony_ci */
97462306a36Sopenharmony_cistatic long madvise_remove(struct vm_area_struct *vma,
97562306a36Sopenharmony_ci				struct vm_area_struct **prev,
97662306a36Sopenharmony_ci				unsigned long start, unsigned long end)
97762306a36Sopenharmony_ci{
97862306a36Sopenharmony_ci	loff_t offset;
97962306a36Sopenharmony_ci	int error;
98062306a36Sopenharmony_ci	struct file *f;
98162306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
98262306a36Sopenharmony_ci
98362306a36Sopenharmony_ci	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
98462306a36Sopenharmony_ci
98562306a36Sopenharmony_ci	if (vma->vm_flags & VM_LOCKED)
98662306a36Sopenharmony_ci		return -EINVAL;
98762306a36Sopenharmony_ci
98862306a36Sopenharmony_ci	f = vma->vm_file;
98962306a36Sopenharmony_ci
99062306a36Sopenharmony_ci	if (!f || !f->f_mapping || !f->f_mapping->host) {
99162306a36Sopenharmony_ci			return -EINVAL;
99262306a36Sopenharmony_ci	}
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
99562306a36Sopenharmony_ci		return -EACCES;
99662306a36Sopenharmony_ci
99762306a36Sopenharmony_ci	offset = (loff_t)(start - vma->vm_start)
99862306a36Sopenharmony_ci			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
99962306a36Sopenharmony_ci
100062306a36Sopenharmony_ci	/*
100162306a36Sopenharmony_ci	 * Filesystem's fallocate may need to take i_rwsem.  We need to
100262306a36Sopenharmony_ci	 * explicitly grab a reference because the vma (and hence the
100362306a36Sopenharmony_ci	 * vma's reference to the file) can go away as soon as we drop
100462306a36Sopenharmony_ci	 * mmap_lock.
100562306a36Sopenharmony_ci	 */
100662306a36Sopenharmony_ci	get_file(f);
100762306a36Sopenharmony_ci	if (userfaultfd_remove(vma, start, end)) {
100862306a36Sopenharmony_ci		/* mmap_lock was not released by userfaultfd_remove() */
100962306a36Sopenharmony_ci		mmap_read_unlock(mm);
101062306a36Sopenharmony_ci	}
101162306a36Sopenharmony_ci	error = vfs_fallocate(f,
101262306a36Sopenharmony_ci				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
101362306a36Sopenharmony_ci				offset, end - start);
101462306a36Sopenharmony_ci	fput(f);
101562306a36Sopenharmony_ci	mmap_read_lock(mm);
101662306a36Sopenharmony_ci	return error;
101762306a36Sopenharmony_ci}
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci/*
102062306a36Sopenharmony_ci * Apply an madvise behavior to a region of a vma.  madvise_update_vma
102162306a36Sopenharmony_ci * will handle splitting a vm area into separate areas, each area with its own
102262306a36Sopenharmony_ci * behavior.
102362306a36Sopenharmony_ci */
102462306a36Sopenharmony_cistatic int madvise_vma_behavior(struct vm_area_struct *vma,
102562306a36Sopenharmony_ci				struct vm_area_struct **prev,
102662306a36Sopenharmony_ci				unsigned long start, unsigned long end,
102762306a36Sopenharmony_ci				unsigned long behavior)
102862306a36Sopenharmony_ci{
102962306a36Sopenharmony_ci	int error;
103062306a36Sopenharmony_ci	struct anon_vma_name *anon_name;
103162306a36Sopenharmony_ci	unsigned long new_flags = vma->vm_flags;
103262306a36Sopenharmony_ci
103362306a36Sopenharmony_ci	switch (behavior) {
103462306a36Sopenharmony_ci	case MADV_REMOVE:
103562306a36Sopenharmony_ci		return madvise_remove(vma, prev, start, end);
103662306a36Sopenharmony_ci	case MADV_WILLNEED:
103762306a36Sopenharmony_ci		return madvise_willneed(vma, prev, start, end);
103862306a36Sopenharmony_ci	case MADV_COLD:
103962306a36Sopenharmony_ci		return madvise_cold(vma, prev, start, end);
104062306a36Sopenharmony_ci	case MADV_PAGEOUT:
104162306a36Sopenharmony_ci		return madvise_pageout(vma, prev, start, end);
104262306a36Sopenharmony_ci	case MADV_FREE:
104362306a36Sopenharmony_ci	case MADV_DONTNEED:
104462306a36Sopenharmony_ci	case MADV_DONTNEED_LOCKED:
104562306a36Sopenharmony_ci		return madvise_dontneed_free(vma, prev, start, end, behavior);
104662306a36Sopenharmony_ci	case MADV_POPULATE_READ:
104762306a36Sopenharmony_ci	case MADV_POPULATE_WRITE:
104862306a36Sopenharmony_ci		return madvise_populate(vma, prev, start, end, behavior);
104962306a36Sopenharmony_ci	case MADV_NORMAL:
105062306a36Sopenharmony_ci		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
105162306a36Sopenharmony_ci		break;
105262306a36Sopenharmony_ci	case MADV_SEQUENTIAL:
105362306a36Sopenharmony_ci		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
105462306a36Sopenharmony_ci		break;
105562306a36Sopenharmony_ci	case MADV_RANDOM:
105662306a36Sopenharmony_ci		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
105762306a36Sopenharmony_ci		break;
105862306a36Sopenharmony_ci	case MADV_DONTFORK:
105962306a36Sopenharmony_ci		new_flags |= VM_DONTCOPY;
106062306a36Sopenharmony_ci		break;
106162306a36Sopenharmony_ci	case MADV_DOFORK:
106262306a36Sopenharmony_ci		if (vma->vm_flags & VM_IO)
106362306a36Sopenharmony_ci			return -EINVAL;
106462306a36Sopenharmony_ci		new_flags &= ~VM_DONTCOPY;
106562306a36Sopenharmony_ci		break;
106662306a36Sopenharmony_ci	case MADV_WIPEONFORK:
106762306a36Sopenharmony_ci		/* MADV_WIPEONFORK is only supported on anonymous memory. */
106862306a36Sopenharmony_ci		if (vma->vm_file || vma->vm_flags & VM_SHARED)
106962306a36Sopenharmony_ci			return -EINVAL;
107062306a36Sopenharmony_ci		new_flags |= VM_WIPEONFORK;
107162306a36Sopenharmony_ci		break;
107262306a36Sopenharmony_ci	case MADV_KEEPONFORK:
107362306a36Sopenharmony_ci		new_flags &= ~VM_WIPEONFORK;
107462306a36Sopenharmony_ci		break;
107562306a36Sopenharmony_ci	case MADV_DONTDUMP:
107662306a36Sopenharmony_ci		new_flags |= VM_DONTDUMP;
107762306a36Sopenharmony_ci		break;
107862306a36Sopenharmony_ci	case MADV_DODUMP:
107962306a36Sopenharmony_ci		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
108062306a36Sopenharmony_ci			return -EINVAL;
108162306a36Sopenharmony_ci		new_flags &= ~VM_DONTDUMP;
108262306a36Sopenharmony_ci		break;
108362306a36Sopenharmony_ci	case MADV_MERGEABLE:
108462306a36Sopenharmony_ci	case MADV_UNMERGEABLE:
108562306a36Sopenharmony_ci		error = ksm_madvise(vma, start, end, behavior, &new_flags);
108662306a36Sopenharmony_ci		if (error)
108762306a36Sopenharmony_ci			goto out;
108862306a36Sopenharmony_ci		break;
108962306a36Sopenharmony_ci	case MADV_HUGEPAGE:
109062306a36Sopenharmony_ci	case MADV_NOHUGEPAGE:
109162306a36Sopenharmony_ci		error = hugepage_madvise(vma, &new_flags, behavior);
109262306a36Sopenharmony_ci		if (error)
109362306a36Sopenharmony_ci			goto out;
109462306a36Sopenharmony_ci		break;
109562306a36Sopenharmony_ci	case MADV_COLLAPSE:
109662306a36Sopenharmony_ci		return madvise_collapse(vma, prev, start, end);
109762306a36Sopenharmony_ci	}
109862306a36Sopenharmony_ci
109962306a36Sopenharmony_ci	anon_name = anon_vma_name(vma);
110062306a36Sopenharmony_ci	anon_vma_name_get(anon_name);
110162306a36Sopenharmony_ci	error = madvise_update_vma(vma, prev, start, end, new_flags,
110262306a36Sopenharmony_ci				   anon_name);
110362306a36Sopenharmony_ci	anon_vma_name_put(anon_name);
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ciout:
110662306a36Sopenharmony_ci	/*
110762306a36Sopenharmony_ci	 * madvise() returns EAGAIN if kernel resources, such as
110862306a36Sopenharmony_ci	 * slab, are temporarily unavailable.
110962306a36Sopenharmony_ci	 */
111062306a36Sopenharmony_ci	if (error == -ENOMEM)
111162306a36Sopenharmony_ci		error = -EAGAIN;
111262306a36Sopenharmony_ci	return error;
111362306a36Sopenharmony_ci}
111462306a36Sopenharmony_ci
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling
 * (MADV_SOFT_OFFLINE / MADV_HWPOISON).
 *
 * Requires CAP_SYS_ADMIN (-EPERM otherwise).  Walks [start, end) one page,
 * or one compound page, at a time; each page is pinned with
 * get_user_pages_fast() and handed to soft_offline_page() or
 * memory_failure().  Stops at the first failure and returns its value;
 * -EOPNOTSUPP from memory_failure() is treated as success.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	while (start < end) {
		struct page *page;
		unsigned long pfn;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, the page is dissolved
		 * after it has been migrated, so on the next iteration
		 * "page" will no longer be a compound page.  Take the step
		 * size before the page is handed over.
		 */
		size = page_size(compound_head(page));

		if (behavior != MADV_SOFT_OFFLINE) {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		} else {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		}

		if (ret)
			return ret;

		start += size;
	}

	return 0;
}
#endif
116462306a36Sopenharmony_ci
116562306a36Sopenharmony_cistatic bool
116662306a36Sopenharmony_cimadvise_behavior_valid(int behavior)
116762306a36Sopenharmony_ci{
116862306a36Sopenharmony_ci	switch (behavior) {
116962306a36Sopenharmony_ci	case MADV_DOFORK:
117062306a36Sopenharmony_ci	case MADV_DONTFORK:
117162306a36Sopenharmony_ci	case MADV_NORMAL:
117262306a36Sopenharmony_ci	case MADV_SEQUENTIAL:
117362306a36Sopenharmony_ci	case MADV_RANDOM:
117462306a36Sopenharmony_ci	case MADV_REMOVE:
117562306a36Sopenharmony_ci	case MADV_WILLNEED:
117662306a36Sopenharmony_ci	case MADV_DONTNEED:
117762306a36Sopenharmony_ci	case MADV_DONTNEED_LOCKED:
117862306a36Sopenharmony_ci	case MADV_FREE:
117962306a36Sopenharmony_ci	case MADV_COLD:
118062306a36Sopenharmony_ci	case MADV_PAGEOUT:
118162306a36Sopenharmony_ci	case MADV_POPULATE_READ:
118262306a36Sopenharmony_ci	case MADV_POPULATE_WRITE:
118362306a36Sopenharmony_ci#ifdef CONFIG_KSM
118462306a36Sopenharmony_ci	case MADV_MERGEABLE:
118562306a36Sopenharmony_ci	case MADV_UNMERGEABLE:
118662306a36Sopenharmony_ci#endif
118762306a36Sopenharmony_ci#ifdef CONFIG_TRANSPARENT_HUGEPAGE
118862306a36Sopenharmony_ci	case MADV_HUGEPAGE:
118962306a36Sopenharmony_ci	case MADV_NOHUGEPAGE:
119062306a36Sopenharmony_ci	case MADV_COLLAPSE:
119162306a36Sopenharmony_ci#endif
119262306a36Sopenharmony_ci	case MADV_DONTDUMP:
119362306a36Sopenharmony_ci	case MADV_DODUMP:
119462306a36Sopenharmony_ci	case MADV_WIPEONFORK:
119562306a36Sopenharmony_ci	case MADV_KEEPONFORK:
119662306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE
119762306a36Sopenharmony_ci	case MADV_SOFT_OFFLINE:
119862306a36Sopenharmony_ci	case MADV_HWPOISON:
119962306a36Sopenharmony_ci#endif
120062306a36Sopenharmony_ci		return true;
120162306a36Sopenharmony_ci
120262306a36Sopenharmony_ci	default:
120362306a36Sopenharmony_ci		return false;
120462306a36Sopenharmony_ci	}
120562306a36Sopenharmony_ci}
120662306a36Sopenharmony_ci
120762306a36Sopenharmony_cistatic bool process_madvise_behavior_valid(int behavior)
120862306a36Sopenharmony_ci{
120962306a36Sopenharmony_ci	switch (behavior) {
121062306a36Sopenharmony_ci	case MADV_COLD:
121162306a36Sopenharmony_ci	case MADV_PAGEOUT:
121262306a36Sopenharmony_ci	case MADV_WILLNEED:
121362306a36Sopenharmony_ci	case MADV_COLLAPSE:
121462306a36Sopenharmony_ci		return true;
121562306a36Sopenharmony_ci	default:
121662306a36Sopenharmony_ci		return false;
121762306a36Sopenharmony_ci	}
121862306a36Sopenharmony_ci}
121962306a36Sopenharmony_ci
122062306a36Sopenharmony_ci/*
122162306a36Sopenharmony_ci * Walk the vmas in range [start,end), and call the visit function on each one.
122262306a36Sopenharmony_ci * The visit function will get start and end parameters that cover the overlap
122362306a36Sopenharmony_ci * between the current vma and the original range.  Any unmapped regions in the
122462306a36Sopenharmony_ci * original range will result in this function returning -ENOMEM while still
122562306a36Sopenharmony_ci * calling the visit function on all of the existing vmas in the range.
122662306a36Sopenharmony_ci * Must be called with the mmap_lock held for reading or writing.
122762306a36Sopenharmony_ci */
122862306a36Sopenharmony_cistatic
122962306a36Sopenharmony_ciint madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
123062306a36Sopenharmony_ci		      unsigned long end, unsigned long arg,
123162306a36Sopenharmony_ci		      int (*visit)(struct vm_area_struct *vma,
123262306a36Sopenharmony_ci				   struct vm_area_struct **prev, unsigned long start,
123362306a36Sopenharmony_ci				   unsigned long end, unsigned long arg))
123462306a36Sopenharmony_ci{
123562306a36Sopenharmony_ci	struct vm_area_struct *vma;
123662306a36Sopenharmony_ci	struct vm_area_struct *prev;
123762306a36Sopenharmony_ci	unsigned long tmp;
123862306a36Sopenharmony_ci	int unmapped_error = 0;
123962306a36Sopenharmony_ci
124062306a36Sopenharmony_ci	/*
124162306a36Sopenharmony_ci	 * If the interval [start,end) covers some unmapped address
124262306a36Sopenharmony_ci	 * ranges, just ignore them, but return -ENOMEM at the end.
124362306a36Sopenharmony_ci	 * - different from the way of handling in mlock etc.
124462306a36Sopenharmony_ci	 */
124562306a36Sopenharmony_ci	vma = find_vma_prev(mm, start, &prev);
124662306a36Sopenharmony_ci	if (vma && start > vma->vm_start)
124762306a36Sopenharmony_ci		prev = vma;
124862306a36Sopenharmony_ci
124962306a36Sopenharmony_ci	for (;;) {
125062306a36Sopenharmony_ci		int error;
125162306a36Sopenharmony_ci
125262306a36Sopenharmony_ci		/* Still start < end. */
125362306a36Sopenharmony_ci		if (!vma)
125462306a36Sopenharmony_ci			return -ENOMEM;
125562306a36Sopenharmony_ci
125662306a36Sopenharmony_ci		/* Here start < (end|vma->vm_end). */
125762306a36Sopenharmony_ci		if (start < vma->vm_start) {
125862306a36Sopenharmony_ci			unmapped_error = -ENOMEM;
125962306a36Sopenharmony_ci			start = vma->vm_start;
126062306a36Sopenharmony_ci			if (start >= end)
126162306a36Sopenharmony_ci				break;
126262306a36Sopenharmony_ci		}
126362306a36Sopenharmony_ci
126462306a36Sopenharmony_ci		/* Here vma->vm_start <= start < (end|vma->vm_end) */
126562306a36Sopenharmony_ci		tmp = vma->vm_end;
126662306a36Sopenharmony_ci		if (end < tmp)
126762306a36Sopenharmony_ci			tmp = end;
126862306a36Sopenharmony_ci
126962306a36Sopenharmony_ci		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
127062306a36Sopenharmony_ci		error = visit(vma, &prev, start, tmp, arg);
127162306a36Sopenharmony_ci		if (error)
127262306a36Sopenharmony_ci			return error;
127362306a36Sopenharmony_ci		start = tmp;
127462306a36Sopenharmony_ci		if (prev && start < prev->vm_end)
127562306a36Sopenharmony_ci			start = prev->vm_end;
127662306a36Sopenharmony_ci		if (start >= end)
127762306a36Sopenharmony_ci			break;
127862306a36Sopenharmony_ci		if (prev)
127962306a36Sopenharmony_ci			vma = find_vma(mm, prev->vm_end);
128062306a36Sopenharmony_ci		else	/* madvise_remove dropped mmap_lock */
128162306a36Sopenharmony_ci			vma = find_vma(mm, start);
128262306a36Sopenharmony_ci	}
128362306a36Sopenharmony_ci
128462306a36Sopenharmony_ci	return unmapped_error;
128562306a36Sopenharmony_ci}
128662306a36Sopenharmony_ci
128762306a36Sopenharmony_ci#ifdef CONFIG_ANON_VMA_NAME
128862306a36Sopenharmony_cistatic int madvise_vma_anon_name(struct vm_area_struct *vma,
128962306a36Sopenharmony_ci				 struct vm_area_struct **prev,
129062306a36Sopenharmony_ci				 unsigned long start, unsigned long end,
129162306a36Sopenharmony_ci				 unsigned long anon_name)
129262306a36Sopenharmony_ci{
129362306a36Sopenharmony_ci	int error;
129462306a36Sopenharmony_ci
129562306a36Sopenharmony_ci	/* Only anonymous mappings can be named */
129662306a36Sopenharmony_ci	if (vma->vm_file && !vma_is_anon_shmem(vma))
129762306a36Sopenharmony_ci		return -EBADF;
129862306a36Sopenharmony_ci
129962306a36Sopenharmony_ci	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
130062306a36Sopenharmony_ci				   (struct anon_vma_name *)anon_name);
130162306a36Sopenharmony_ci
130262306a36Sopenharmony_ci	/*
130362306a36Sopenharmony_ci	 * madvise() returns EAGAIN if kernel resources, such as
130462306a36Sopenharmony_ci	 * slab, are temporarily unavailable.
130562306a36Sopenharmony_ci	 */
130662306a36Sopenharmony_ci	if (error == -ENOMEM)
130762306a36Sopenharmony_ci		error = -EAGAIN;
130862306a36Sopenharmony_ci	return error;
130962306a36Sopenharmony_ci}
131062306a36Sopenharmony_ci
131162306a36Sopenharmony_ciint madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
131262306a36Sopenharmony_ci			  unsigned long len_in, struct anon_vma_name *anon_name)
131362306a36Sopenharmony_ci{
131462306a36Sopenharmony_ci	unsigned long end;
131562306a36Sopenharmony_ci	unsigned long len;
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci	if (start & ~PAGE_MASK)
131862306a36Sopenharmony_ci		return -EINVAL;
131962306a36Sopenharmony_ci	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
132062306a36Sopenharmony_ci
132162306a36Sopenharmony_ci	/* Check to see whether len was rounded up from small -ve to zero */
132262306a36Sopenharmony_ci	if (len_in && !len)
132362306a36Sopenharmony_ci		return -EINVAL;
132462306a36Sopenharmony_ci
132562306a36Sopenharmony_ci	end = start + len;
132662306a36Sopenharmony_ci	if (end < start)
132762306a36Sopenharmony_ci		return -EINVAL;
132862306a36Sopenharmony_ci
132962306a36Sopenharmony_ci	if (end == start)
133062306a36Sopenharmony_ci		return 0;
133162306a36Sopenharmony_ci
133262306a36Sopenharmony_ci	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
133362306a36Sopenharmony_ci				 madvise_vma_anon_name);
133462306a36Sopenharmony_ci}
133562306a36Sopenharmony_ci#endif /* CONFIG_ANON_VMA_NAME */
133662306a36Sopenharmony_ci/*
133762306a36Sopenharmony_ci * The madvise(2) system call.
133862306a36Sopenharmony_ci *
133962306a36Sopenharmony_ci * Applications can use madvise() to advise the kernel how it should
134062306a36Sopenharmony_ci * handle paging I/O in this VM area.  The idea is to help the kernel
134162306a36Sopenharmony_ci * use appropriate read-ahead and caching techniques.  The information
134262306a36Sopenharmony_ci * provided is advisory only, and can be safely disregarded by the
134362306a36Sopenharmony_ci * kernel without affecting the correct operation of the application.
134462306a36Sopenharmony_ci *
134562306a36Sopenharmony_ci * behavior values:
134662306a36Sopenharmony_ci *  MADV_NORMAL - the default behavior is to read clusters.  This
134762306a36Sopenharmony_ci *		results in some read-ahead and read-behind.
134862306a36Sopenharmony_ci *  MADV_RANDOM - the system should read the minimum amount of data
134962306a36Sopenharmony_ci *		on any access, since it is unlikely that the appli-
135062306a36Sopenharmony_ci *		cation will need more than what it asks for.
135162306a36Sopenharmony_ci *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
135262306a36Sopenharmony_ci *		once, so they can be aggressively read ahead, and
135362306a36Sopenharmony_ci *		can be freed soon after they are accessed.
135462306a36Sopenharmony_ci *  MADV_WILLNEED - the application is notifying the system to read
135562306a36Sopenharmony_ci *		some pages ahead.
135662306a36Sopenharmony_ci *  MADV_DONTNEED - the application is finished with the given range,
135762306a36Sopenharmony_ci *		so the kernel can free resources associated with it.
135862306a36Sopenharmony_ci *  MADV_FREE - the application marks pages in the given range as lazy free,
135962306a36Sopenharmony_ci *		where actual purges are postponed until memory pressure happens.
136062306a36Sopenharmony_ci *  MADV_REMOVE - the application wants to free up the given range of
136162306a36Sopenharmony_ci *		pages and associated backing store.
136262306a36Sopenharmony_ci *  MADV_DONTFORK - omit this area from child's address space when forking:
136362306a36Sopenharmony_ci *		typically, to avoid COWing pages pinned by get_user_pages().
136462306a36Sopenharmony_ci *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
136562306a36Sopenharmony_ci *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
136662306a36Sopenharmony_ci *              range after a fork.
136762306a36Sopenharmony_ci *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
136862306a36Sopenharmony_ci *  MADV_HWPOISON - trigger memory error handler as if the given memory range
136962306a36Sopenharmony_ci *		were corrupted by unrecoverable hardware memory failure.
137062306a36Sopenharmony_ci *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
137162306a36Sopenharmony_ci *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
137262306a36Sopenharmony_ci *		this area with pages of identical content from other such areas.
137362306a36Sopenharmony_ci *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
137462306a36Sopenharmony_ci *  MADV_HUGEPAGE - the application wants to back the given range by transparent
137562306a36Sopenharmony_ci *		huge pages in the future. Existing pages might be coalesced and
137662306a36Sopenharmony_ci *		new pages might be allocated as THP.
137762306a36Sopenharmony_ci *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
137862306a36Sopenharmony_ci *		transparent huge pages so the existing pages will not be
137962306a36Sopenharmony_ci *		coalesced into THP and new pages will not be allocated as THP.
138062306a36Sopenharmony_ci *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
138162306a36Sopenharmony_ci *  MADV_DONTDUMP - the application wants to prevent pages in the given range
138262306a36Sopenharmony_ci *		from being included in its core dump.
138362306a36Sopenharmony_ci *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
138462306a36Sopenharmony_ci *  MADV_COLD - the application is not expected to use this memory soon,
138562306a36Sopenharmony_ci *		deactivate pages in this range so that they can be reclaimed
138662306a36Sopenharmony_ci *		easily if memory pressure happens.
138762306a36Sopenharmony_ci *  MADV_PAGEOUT - the application is not expected to use this memory soon,
138862306a36Sopenharmony_ci *		page out the pages in this range immediately.
138962306a36Sopenharmony_ci *  MADV_POPULATE_READ - populate (prefault) page tables readable by
139062306a36Sopenharmony_ci *		triggering read faults if required
139162306a36Sopenharmony_ci *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
139262306a36Sopenharmony_ci *		triggering write faults if required
139362306a36Sopenharmony_ci *
139462306a36Sopenharmony_ci * return values:
139562306a36Sopenharmony_ci *  zero    - success
139662306a36Sopenharmony_ci *  -EINVAL - start + len < 0, start is not page-aligned,
139762306a36Sopenharmony_ci *		"behavior" is not a valid value, or application
139862306a36Sopenharmony_ci *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VM_PFNMAP range.
140162306a36Sopenharmony_ci *  -ENOMEM - addresses in the specified range are not currently
140262306a36Sopenharmony_ci *		mapped, or are outside the AS of the process.
140362306a36Sopenharmony_ci *  -EIO    - an I/O error occurred while paging in data.
140462306a36Sopenharmony_ci *  -EBADF  - map exists, but area maps something that isn't a file.
140562306a36Sopenharmony_ci *  -EAGAIN - a kernel resource was temporarily unavailable.
140662306a36Sopenharmony_ci */
140762306a36Sopenharmony_ciint do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
140862306a36Sopenharmony_ci{
140962306a36Sopenharmony_ci	unsigned long end;
141062306a36Sopenharmony_ci	int error;
141162306a36Sopenharmony_ci	int write;
141262306a36Sopenharmony_ci	size_t len;
141362306a36Sopenharmony_ci	struct blk_plug plug;
141462306a36Sopenharmony_ci
141562306a36Sopenharmony_ci	if (!madvise_behavior_valid(behavior))
141662306a36Sopenharmony_ci		return -EINVAL;
141762306a36Sopenharmony_ci
141862306a36Sopenharmony_ci	if (!PAGE_ALIGNED(start))
141962306a36Sopenharmony_ci		return -EINVAL;
142062306a36Sopenharmony_ci	len = PAGE_ALIGN(len_in);
142162306a36Sopenharmony_ci
142262306a36Sopenharmony_ci	/* Check to see whether len was rounded up from small -ve to zero */
142362306a36Sopenharmony_ci	if (len_in && !len)
142462306a36Sopenharmony_ci		return -EINVAL;
142562306a36Sopenharmony_ci
142662306a36Sopenharmony_ci	end = start + len;
142762306a36Sopenharmony_ci	if (end < start)
142862306a36Sopenharmony_ci		return -EINVAL;
142962306a36Sopenharmony_ci
143062306a36Sopenharmony_ci	if (end == start)
143162306a36Sopenharmony_ci		return 0;
143262306a36Sopenharmony_ci
143362306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_FAILURE
143462306a36Sopenharmony_ci	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
143562306a36Sopenharmony_ci		return madvise_inject_error(behavior, start, start + len_in);
143662306a36Sopenharmony_ci#endif
143762306a36Sopenharmony_ci
143862306a36Sopenharmony_ci	write = madvise_need_mmap_write(behavior);
143962306a36Sopenharmony_ci	if (write) {
144062306a36Sopenharmony_ci		if (mmap_write_lock_killable(mm))
144162306a36Sopenharmony_ci			return -EINTR;
144262306a36Sopenharmony_ci	} else {
144362306a36Sopenharmony_ci		mmap_read_lock(mm);
144462306a36Sopenharmony_ci	}
144562306a36Sopenharmony_ci
144662306a36Sopenharmony_ci	start = untagged_addr_remote(mm, start);
144762306a36Sopenharmony_ci	end = start + len;
144862306a36Sopenharmony_ci
144962306a36Sopenharmony_ci	blk_start_plug(&plug);
145062306a36Sopenharmony_ci	error = madvise_walk_vmas(mm, start, end, behavior,
145162306a36Sopenharmony_ci			madvise_vma_behavior);
145262306a36Sopenharmony_ci	blk_finish_plug(&plug);
145362306a36Sopenharmony_ci	if (write)
145462306a36Sopenharmony_ci		mmap_write_unlock(mm);
145562306a36Sopenharmony_ci	else
145662306a36Sopenharmony_ci		mmap_read_unlock(mm);
145762306a36Sopenharmony_ci
145862306a36Sopenharmony_ci	return error;
145962306a36Sopenharmony_ci}
146062306a36Sopenharmony_ci
146162306a36Sopenharmony_ciSYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
146262306a36Sopenharmony_ci{
146362306a36Sopenharmony_ci	return do_madvise(current->mm, start, len_in, behavior);
146462306a36Sopenharmony_ci}
146562306a36Sopenharmony_ci
146662306a36Sopenharmony_ciSYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
146762306a36Sopenharmony_ci		size_t, vlen, int, behavior, unsigned int, flags)
146862306a36Sopenharmony_ci{
146962306a36Sopenharmony_ci	ssize_t ret;
147062306a36Sopenharmony_ci	struct iovec iovstack[UIO_FASTIOV];
147162306a36Sopenharmony_ci	struct iovec *iov = iovstack;
147262306a36Sopenharmony_ci	struct iov_iter iter;
147362306a36Sopenharmony_ci	struct task_struct *task;
147462306a36Sopenharmony_ci	struct mm_struct *mm;
147562306a36Sopenharmony_ci	size_t total_len;
147662306a36Sopenharmony_ci	unsigned int f_flags;
147762306a36Sopenharmony_ci
147862306a36Sopenharmony_ci	if (flags != 0) {
147962306a36Sopenharmony_ci		ret = -EINVAL;
148062306a36Sopenharmony_ci		goto out;
148162306a36Sopenharmony_ci	}
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
148462306a36Sopenharmony_ci	if (ret < 0)
148562306a36Sopenharmony_ci		goto out;
148662306a36Sopenharmony_ci
148762306a36Sopenharmony_ci	task = pidfd_get_task(pidfd, &f_flags);
148862306a36Sopenharmony_ci	if (IS_ERR(task)) {
148962306a36Sopenharmony_ci		ret = PTR_ERR(task);
149062306a36Sopenharmony_ci		goto free_iov;
149162306a36Sopenharmony_ci	}
149262306a36Sopenharmony_ci
149362306a36Sopenharmony_ci	if (!process_madvise_behavior_valid(behavior)) {
149462306a36Sopenharmony_ci		ret = -EINVAL;
149562306a36Sopenharmony_ci		goto release_task;
149662306a36Sopenharmony_ci	}
149762306a36Sopenharmony_ci
149862306a36Sopenharmony_ci	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
149962306a36Sopenharmony_ci	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
150062306a36Sopenharmony_ci	if (IS_ERR_OR_NULL(mm)) {
150162306a36Sopenharmony_ci		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
150262306a36Sopenharmony_ci		goto release_task;
150362306a36Sopenharmony_ci	}
150462306a36Sopenharmony_ci
150562306a36Sopenharmony_ci	/*
150662306a36Sopenharmony_ci	 * Require CAP_SYS_NICE for influencing process performance. Note that
150762306a36Sopenharmony_ci	 * only non-destructive hints are currently supported.
150862306a36Sopenharmony_ci	 */
150962306a36Sopenharmony_ci	if (!capable(CAP_SYS_NICE)) {
151062306a36Sopenharmony_ci		ret = -EPERM;
151162306a36Sopenharmony_ci		goto release_mm;
151262306a36Sopenharmony_ci	}
151362306a36Sopenharmony_ci
151462306a36Sopenharmony_ci	total_len = iov_iter_count(&iter);
151562306a36Sopenharmony_ci
151662306a36Sopenharmony_ci	while (iov_iter_count(&iter)) {
151762306a36Sopenharmony_ci		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
151862306a36Sopenharmony_ci					iter_iov_len(&iter), behavior);
151962306a36Sopenharmony_ci		if (ret < 0)
152062306a36Sopenharmony_ci			break;
152162306a36Sopenharmony_ci		iov_iter_advance(&iter, iter_iov_len(&iter));
152262306a36Sopenharmony_ci	}
152362306a36Sopenharmony_ci
152462306a36Sopenharmony_ci	ret = (total_len - iov_iter_count(&iter)) ? : ret;
152562306a36Sopenharmony_ci
152662306a36Sopenharmony_cirelease_mm:
152762306a36Sopenharmony_ci	mmput(mm);
152862306a36Sopenharmony_cirelease_task:
152962306a36Sopenharmony_ci	put_task_struct(task);
153062306a36Sopenharmony_cifree_iov:
153162306a36Sopenharmony_ci	kfree(iov);
153262306a36Sopenharmony_ciout:
153362306a36Sopenharmony_ci	return ret;
153462306a36Sopenharmony_ci}
1535