// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio = page_folio(page);

		if (is_zero_page(page) ||
		    !folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
				       !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		if (!put_devmap_managed_page_refs(&folio->page, refs))
			folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

/**
 * try_grab_folio() - Attempt to get or pin a folio.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount".
 *
 * Either FOLL_PIN or FOLL_GET must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
	struct folio *folio;

	if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
		return NULL;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return NULL;

	if (flags & FOLL_GET)
		return try_get_folio(page, refs);

	/* FOLL_PIN is set */

	/*
	 * Don't take a pin on the zero page - it's not going anywhere
	 * and it is used in a *lot* of places.
	 */
	if (is_zero_page(page))
		return page_folio(page);

	folio = try_get_folio(page, refs);
	if (!folio)
		return NULL;

	/*
	 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
	 * right zone, so fail and let the caller fall back to the slow
	 * path.
	 */
	if (unlikely((flags & FOLL_LONGTERM) &&
		     !folio_is_longterm_pinnable(folio))) {
		if (!put_devmap_managed_page_refs(&folio->page, refs))
			folio_put_refs(folio, refs);
		return NULL;
	}

	/*
	 * When pinning a large folio, use an exact count to track it.
	 *
	 * However, be sure to *also* increment the normal folio
	 * refcount field at least once, so that the folio really
	 * is pinned.  That's why the refcount from the earlier
	 * try_get_folio() is left intact.
	 */
	if (folio_test_large(folio))
		atomic_add(refs, &folio->_pincount);
	else
		folio_ref_add(folio,
				refs * (GUP_PIN_COUNTING_BIAS - 1));
	/*
	 * Adjust the pincount before re-checking the PTE for changes.
	 * This is essentially a smp_mb() and is paired with a memory
	 * barrier in page_try_share_anon_rmap().
	 */
	smp_mb__after_atomic();

	node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

	return folio;
}
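
/*
 * Illustrative sketch (not called anywhere in this file): how the two
 * grab modes show up in the refcount.  The numbers assume the usual
 * GUP_PIN_COUNTING_BIAS of 1024 and a single-page folio whose refcount
 * starts at 1:
 *
 *	folio = try_grab_folio(page, 2, FOLL_GET);
 *	(refcount: 1 + 2 = 3; no pin is recorded)
 *
 *	folio = try_grab_folio(page, 2, FOLL_PIN);
 *	(single-page folio: refcount: 1 + 2 * 1024 = 2049;
 *	 large folio: refcount + 2 and folio->_pincount + 2 instead)
 *
 * Either way, folio_maybe_dma_pinned() subsequently reports true.
 */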

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		if (is_zero_folio(folio))
			return;
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_test_large(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	if (!put_devmap_managed_page_refs(&folio->page, refs))
		folio_put_refs(folio, refs);
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 * @page:    pointer to page to be grabbed
 * @flags:   gup flags: these are the FOLL_* flag values.
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases: please see the try_grab_folio() documentation, with
 * "refs=1".
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM		FOLL_GET or FOLL_PIN was set, but the page could not
 *			be grabbed.
 */
int __must_check try_grab_page(struct page *page, unsigned int flags)
{
	struct folio *folio = page_folio(page);

	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_inc(folio);
	else if (flags & FOLL_PIN) {
		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_page(page))
			return 0;

		/*
		 * Similar to try_grab_folio(): be sure to *also*
		 * increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_test_large(folio)) {
			folio_ref_add(folio, 1);
			atomic_add(1, &folio->_pincount);
		} else {
			folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
	}

	return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
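
/*
 * Illustrative sketch of the pin/unpin pairing as a driver might use it
 * (hypothetical names, error handling trimmed); see also
 * Documentation/core-api/pin_user_pages.rst:
 *
 *	pages = kmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
 *	pinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);
 *	if (pinned > 0) {
 *		... hardware DMAs into the pinned pages ...
 *		while (pinned--)
 *			unpin_user_page(pages[pinned]);
 *	}
 *	kfree(pages);
 *
 * Releasing with put_page() instead would corrupt the pin accounting.
 */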

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on.  Makes no change
 * if the folio is a zero_page.
 */
void folio_add_pin(struct folio *folio)
{
	if (is_zero_folio(folio))
		return;

	/*
	 * Similar to try_grab_folio(): be sure to *also* increment the normal
	 * page refcount field at least once, so that the page really is
	 * pinned.
	 */
	if (folio_test_large(folio)) {
		WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
		folio_ref_inc(folio);
		atomic_inc(&folio->_pincount);
	} else {
		WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
		folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
	}
}
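
/*
 * Illustrative sketch (hypothetical caller): duplicating a pin when one
 * pinned folio ends up referenced from two places, e.g. a split I/O.
 * Each folio_add_pin() must be balanced by one extra unpin:
 *
 *	folio_add_pin(folio);
 *	...
 *	unpin_user_page(&folio->page);	(drops the extra pin)
 *	unpin_user_page(&folio->page);	(drops the original pin)
 */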

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = nth_page(start, i);
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the pin_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
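
/*
 * Illustrative sketch (hypothetical receive-completion path): when the
 * device has written into the pinned buffer, dirty and release it in one
 * call rather than open-coding set_page_dirty_lock() per page:
 *
 *	unpin_user_pages_dirty_lock(pages, pinned, true);
 *
 * Passing make_dirty == false makes this equivalent to
 * unpin_user_pages(pages, pinned).
 */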

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * For the page ranges defined by [page .. page+npages-1], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages);
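
/*
 * Illustrative sketch of the error handling the WARN_ON() above guards
 * against (hypothetical caller): only a non-negative pin count may ever
 * be handed back to unpin_user_pages(), never a -ERRNO return value:
 *
 *	pinned = pin_user_pages_fast(uaddr, nr, FOLL_WRITE, pages);
 *	if (pinned < 0)
 *		return pinned;
 *	...
 *	unpin_user_pages(pages, pinned);
 */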

/*
 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 * lifecycle.  Avoid setting the bit unless necessary, or it might cause write
 * cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
	if (!test_bit(MMF_HAS_PINNED, mm_flags))
		set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
			(vma_is_anonymous(vma) || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	if (flags & FOLL_TOUCH) {
		pte_t orig_entry = ptep_get(pte);
		pte_t entry = orig_entry;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(orig_entry, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pte is writable, we can write to the page. */
	if (pte_write(pte))
		return true;

	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
		return false;
	return !userfaultfd_pte_wp(vma, pte);
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	if (!ptep)
		return no_page_table(vma, flags);
	pte = ptep_get(ptep);
	if (!pte_present(pte))
		goto no_page;
	if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte() and don't
	 * have to worry about pte_devmap() because they are never anon.
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
		       !PageAnonExclusive(page), page);

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	ret = try_grab_page(page, flags);
	if (unlikely(ret)) {
		page = ERR_PTR(ret);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case).  Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_page_accessible(page);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	pmdval = pmdp_get_lockless(pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (!pmd_present(pmdval))
		return no_page_table(vma, flags);
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
		return no_page_table(vma, flags);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & FOLL_SPLIT_PMD) {
		spin_unlock(ptl);
		split_huge_pmd(vma, pmd, address);
		/* If pmd was left empty, stuff a page table in there quickly */
		return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/*
	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
	 * special hugetlb page table walking code.  This eliminates the
	 * need to check for hugetlb entries in the general walking code.
	 */
	if (is_vm_hugetlb_page(vma))
		return hugetlb_follow_page_mask(vma, address, flags,
						&ctx->page_mask);

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	if (vma_is_secretmem(vma))
		return NULL;

	if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
		return NULL;

	/*
	 * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect
	 * to fail on PROT_NONE-mapped pages.
	 */
	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	pte_t entry;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	pte = pte_offset_map(pmd, address);
	if (!pte)
		return -EFAULT;
	entry = ptep_get(pte);
	if (pte_none(entry))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, entry);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(entry)))
			goto unmap;
		*page = pte_page(entry);
	}
	ret = try_grab_page(*page, gup_flags);
	if (unlikely(ret))
		goto unmap;
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry.  If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released.  If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	if (*flags & FOLL_NOFAULT)
		return -EFAULT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (*flags & FOLL_UNLOCKABLE) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		/*
		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
		 * That's because some callers may not be prepared to
		 * handle early exits caused by non-fatal signals.
		 */
		if (*flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		*locked = 0;

		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released.  Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

/*
 * Writing to file-backed mappings which require folio dirty tracking using GUP
 * is a fundamentally broken operation, as kernel write access to GUP mappings
 * does not adhere to the semantics expected by a file system.
 *
 * Consider the following scenario:
99962306a36Sopenharmony_ci *
100062306a36Sopenharmony_ci * 1. A folio is written to via GUP which write-faults the memory, notifying
100162306a36Sopenharmony_ci *    the file system and dirtying the folio.
100262306a36Sopenharmony_ci * 2. Later, writeback is triggered, resulting in the folio being cleaned and
100362306a36Sopenharmony_ci *    the PTE being marked read-only.
100462306a36Sopenharmony_ci * 3. The GUP caller writes to the folio, as it is mapped read/write via the
100562306a36Sopenharmony_ci *    direct mapping.
100662306a36Sopenharmony_ci * 4. The GUP caller, now done with the page, unpins it and sets it dirty
100762306a36Sopenharmony_ci *    (though it does not have to).
100862306a36Sopenharmony_ci *
100962306a36Sopenharmony_ci * This results in both data being written to a folio without writenotify, and
101062306a36Sopenharmony_ci * the folio being dirtied unexpectedly (if the caller decides to do so).
101162306a36Sopenharmony_ci */
101262306a36Sopenharmony_cistatic bool writable_file_mapping_allowed(struct vm_area_struct *vma,
101362306a36Sopenharmony_ci					  unsigned long gup_flags)
101462306a36Sopenharmony_ci{
101562306a36Sopenharmony_ci	/*
101662306a36Sopenharmony_ci	 * If we aren't pinning then no problematic write can occur. A long term
101762306a36Sopenharmony_ci	 * pin is the most egregious case so this is the case we disallow.
101862306a36Sopenharmony_ci	 */
101962306a36Sopenharmony_ci	if ((gup_flags & (FOLL_PIN | FOLL_LONGTERM)) !=
102062306a36Sopenharmony_ci	    (FOLL_PIN | FOLL_LONGTERM))
102162306a36Sopenharmony_ci		return true;
102262306a36Sopenharmony_ci
102362306a36Sopenharmony_ci	/*
102462306a36Sopenharmony_ci	 * If the VMA does not require dirty tracking then no problematic write
102562306a36Sopenharmony_ci	 * can occur either.
102662306a36Sopenharmony_ci	 */
102762306a36Sopenharmony_ci	return !vma_needs_dirty_tracking(vma);
102862306a36Sopenharmony_ci}
102962306a36Sopenharmony_ci
103062306a36Sopenharmony_cistatic int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
103162306a36Sopenharmony_ci{
103262306a36Sopenharmony_ci	vm_flags_t vm_flags = vma->vm_flags;
103362306a36Sopenharmony_ci	int write = (gup_flags & FOLL_WRITE);
103462306a36Sopenharmony_ci	int foreign = (gup_flags & FOLL_REMOTE);
103562306a36Sopenharmony_ci	bool vma_anon = vma_is_anonymous(vma);
103662306a36Sopenharmony_ci
103762306a36Sopenharmony_ci	if (vm_flags & (VM_IO | VM_PFNMAP))
103862306a36Sopenharmony_ci		return -EFAULT;
103962306a36Sopenharmony_ci
104062306a36Sopenharmony_ci	if ((gup_flags & FOLL_ANON) && !vma_anon)
104162306a36Sopenharmony_ci		return -EFAULT;
104262306a36Sopenharmony_ci
104362306a36Sopenharmony_ci	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
104462306a36Sopenharmony_ci		return -EOPNOTSUPP;
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci	if (vma_is_secretmem(vma))
104762306a36Sopenharmony_ci		return -EFAULT;
104862306a36Sopenharmony_ci
104962306a36Sopenharmony_ci	if (write) {
105062306a36Sopenharmony_ci		if (!vma_anon &&
105162306a36Sopenharmony_ci		    !writable_file_mapping_allowed(vma, gup_flags))
105262306a36Sopenharmony_ci			return -EFAULT;
105362306a36Sopenharmony_ci
105462306a36Sopenharmony_ci		if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) {
105562306a36Sopenharmony_ci			if (!(gup_flags & FOLL_FORCE))
105662306a36Sopenharmony_ci				return -EFAULT;
105762306a36Sopenharmony_ci			/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
105862306a36Sopenharmony_ci			if (is_vm_hugetlb_page(vma))
105962306a36Sopenharmony_ci				return -EFAULT;
106062306a36Sopenharmony_ci			/*
106162306a36Sopenharmony_ci			 * We used to let the write,force case do COW in a
106262306a36Sopenharmony_ci			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
106362306a36Sopenharmony_ci			 * set a breakpoint in a read-only mapping of an
106462306a36Sopenharmony_ci			 * executable, without corrupting the file (yet only
106562306a36Sopenharmony_ci			 * when that file had been opened for writing!).
106662306a36Sopenharmony_ci			 * Anon pages in shared mappings are surprising: now
106762306a36Sopenharmony_ci			 * just reject them.
106862306a36Sopenharmony_ci			 */
106962306a36Sopenharmony_ci			if (!is_cow_mapping(vm_flags))
107062306a36Sopenharmony_ci				return -EFAULT;
107162306a36Sopenharmony_ci		}
107262306a36Sopenharmony_ci	} else if (!(vm_flags & VM_READ)) {
107362306a36Sopenharmony_ci		if (!(gup_flags & FOLL_FORCE))
107462306a36Sopenharmony_ci			return -EFAULT;
107562306a36Sopenharmony_ci		/*
107662306a36Sopenharmony_ci		 * Is there actually any vma we can reach here which does not
107762306a36Sopenharmony_ci		 * have VM_MAYREAD set?
107862306a36Sopenharmony_ci		 */
107962306a36Sopenharmony_ci		if (!(vm_flags & VM_MAYREAD))
108062306a36Sopenharmony_ci			return -EFAULT;
108162306a36Sopenharmony_ci	}
108262306a36Sopenharmony_ci	/*
108362306a36Sopenharmony_ci	 * gups are always data accesses, not instruction
108462306a36Sopenharmony_ci	 * fetches, so execute=false here
108562306a36Sopenharmony_ci	 */
108662306a36Sopenharmony_ci	if (!arch_vma_access_permitted(vma, write, false, foreign))
108762306a36Sopenharmony_ci		return -EFAULT;
108862306a36Sopenharmony_ci	return 0;
108962306a36Sopenharmony_ci}
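
/*
 * Illustrative sketch (hypothetical helper): the ptrace-style "forced" access
 * that the FOLL_FORCE rules above permit -- a debugger reading, or
 * COW-writing, through a mapping that lacks PROT_READ/PROT_WRITE.
 * access_remote_vm() is the real interface that ptrace peek/poke uses.
 */
static int __maybe_unused gup_example_forced_access(struct mm_struct *mm,
						    unsigned long addr,
						    void *buf, int len)
{
	return access_remote_vm(mm, addr, buf, len, FOLL_FORCE);
}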
109062306a36Sopenharmony_ci
109162306a36Sopenharmony_ci/*
109262306a36Sopenharmony_ci * This is "vma_lookup()", but with a warning if we would have
109362306a36Sopenharmony_ci * historically expanded the stack in the GUP code.
109462306a36Sopenharmony_ci */
109562306a36Sopenharmony_cistatic struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
109662306a36Sopenharmony_ci	 unsigned long addr)
109762306a36Sopenharmony_ci{
109862306a36Sopenharmony_ci#ifdef CONFIG_STACK_GROWSUP
109962306a36Sopenharmony_ci	return vma_lookup(mm, addr);
110062306a36Sopenharmony_ci#else
110162306a36Sopenharmony_ci	static volatile unsigned long next_warn;
110262306a36Sopenharmony_ci	struct vm_area_struct *vma;
110362306a36Sopenharmony_ci	unsigned long now, next;
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ci	vma = find_vma(mm, addr);
110662306a36Sopenharmony_ci	if (!vma || (addr >= vma->vm_start))
110762306a36Sopenharmony_ci		return vma;
110862306a36Sopenharmony_ci
110962306a36Sopenharmony_ci	/* Only warn for halfway-relevant accesses */
111062306a36Sopenharmony_ci	if (!(vma->vm_flags & VM_GROWSDOWN))
111162306a36Sopenharmony_ci		return NULL;
111262306a36Sopenharmony_ci	if (vma->vm_start - addr > 65536)
111362306a36Sopenharmony_ci		return NULL;
111462306a36Sopenharmony_ci
111562306a36Sopenharmony_ci	/* Let's not warn more than once an hour.. */
111662306a36Sopenharmony_ci	now = jiffies; next = next_warn;
111762306a36Sopenharmony_ci	if (next && time_before(now, next))
111862306a36Sopenharmony_ci		return NULL;
111962306a36Sopenharmony_ci	next_warn = now + 60*60*HZ;
112062306a36Sopenharmony_ci
112162306a36Sopenharmony_ci	/* Let people know things may have changed. */
112262306a36Sopenharmony_ci	pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
112362306a36Sopenharmony_ci		current->comm, task_pid_nr(current),
112462306a36Sopenharmony_ci		vma->vm_start, vma->vm_end, addr);
112562306a36Sopenharmony_ci	dump_stack();
112662306a36Sopenharmony_ci	return NULL;
112762306a36Sopenharmony_ci#endif
112862306a36Sopenharmony_ci}
112962306a36Sopenharmony_ci
113062306a36Sopenharmony_ci/**
113162306a36Sopenharmony_ci * __get_user_pages() - pin user pages in memory
113262306a36Sopenharmony_ci * @mm:		mm_struct of target mm
113362306a36Sopenharmony_ci * @start:	starting user address
113462306a36Sopenharmony_ci * @nr_pages:	number of pages from start to pin
113562306a36Sopenharmony_ci * @gup_flags:	flags modifying pin behaviour
113662306a36Sopenharmony_ci * @pages:	array that receives pointers to the pages pinned.
113762306a36Sopenharmony_ci *		Should be at least nr_pages long. Or NULL, if caller
113862306a36Sopenharmony_ci *		only intends to ensure the pages are faulted in.
113962306a36Sopenharmony_ci * @locked:     whether we are still holding the mmap_lock; see below
114062306a36Sopenharmony_ci *
114162306a36Sopenharmony_ci * Returns either number of pages pinned (which may be less than the
114262306a36Sopenharmony_ci * number requested), or an error. Details about the return value:
114362306a36Sopenharmony_ci *
114462306a36Sopenharmony_ci * -- If nr_pages is 0, returns 0.
114562306a36Sopenharmony_ci * -- If nr_pages is >0, but no pages were pinned, returns -errno.
114662306a36Sopenharmony_ci * -- If nr_pages is >0, and some pages were pinned, returns the number of
114762306a36Sopenharmony_ci *    pages pinned. Again, this may be less than nr_pages.
114862306a36Sopenharmony_ci * -- A return value of 0 is possible when the fault would need to be retried.
114962306a36Sopenharmony_ci *
115062306a36Sopenharmony_ci * The caller is responsible for releasing returned @pages, via put_page().
115162306a36Sopenharmony_ci *
115262306a36Sopenharmony_ci * Must be called with mmap_lock held.  It may be released.  See below.
115362306a36Sopenharmony_ci *
115462306a36Sopenharmony_ci * __get_user_pages walks a process's page tables and takes a reference to
115562306a36Sopenharmony_ci * each struct page that each user address corresponds to at a given
115662306a36Sopenharmony_ci * instant. That is, it takes the page that would be accessed if a user
115762306a36Sopenharmony_ci * thread accesses the given user virtual address at that instant.
115862306a36Sopenharmony_ci *
115962306a36Sopenharmony_ci * This does not guarantee that the page exists in the user mappings when
116062306a36Sopenharmony_ci * __get_user_pages returns, and there may even be a completely different
116162306a36Sopenharmony_ci * page there in some cases (e.g. if mmapped pagecache has been invalidated
116262306a36Sopenharmony_ci * and subsequently re-faulted). However it does guarantee that the page
116362306a36Sopenharmony_ci * won't be freed completely. And mostly callers simply care that the page
116462306a36Sopenharmony_ci * contains data that was valid *at some point in time*. Typically, an IO
116562306a36Sopenharmony_ci * or similar operation cannot guarantee anything stronger anyway because
116662306a36Sopenharmony_ci * locks can't be held over the syscall boundary.
116762306a36Sopenharmony_ci *
116862306a36Sopenharmony_ci * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
116962306a36Sopenharmony_ci * the page is written to, set_page_dirty (or set_page_dirty_lock, as
117062306a36Sopenharmony_ci * appropriate) must be called after the page is finished with, and
117162306a36Sopenharmony_ci * before put_page is called.
117262306a36Sopenharmony_ci *
117362306a36Sopenharmony_ci * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
117462306a36Sopenharmony_ci * be released. If this happens *@locked will be set to 0 on return.
117562306a36Sopenharmony_ci *
117662306a36Sopenharmony_ci * A caller using such a combination of @gup_flags must therefore hold the
117762306a36Sopenharmony_ci * mmap_lock for reading only, and recognize when it's been released. Otherwise,
117862306a36Sopenharmony_ci * it must be held for either reading or writing and will not be released.
117962306a36Sopenharmony_ci *
118062306a36Sopenharmony_ci * In most cases, get_user_pages or get_user_pages_fast should be used
118162306a36Sopenharmony_ci * instead of __get_user_pages. __get_user_pages should be used only if
118262306a36Sopenharmony_ci * you need some special @gup_flags.
118362306a36Sopenharmony_ci */
118462306a36Sopenharmony_cistatic long __get_user_pages(struct mm_struct *mm,
118562306a36Sopenharmony_ci		unsigned long start, unsigned long nr_pages,
118662306a36Sopenharmony_ci		unsigned int gup_flags, struct page **pages,
118762306a36Sopenharmony_ci		int *locked)
118862306a36Sopenharmony_ci{
118962306a36Sopenharmony_ci	long ret = 0, i = 0;
119062306a36Sopenharmony_ci	struct vm_area_struct *vma = NULL;
119162306a36Sopenharmony_ci	struct follow_page_context ctx = { NULL };
119262306a36Sopenharmony_ci
119362306a36Sopenharmony_ci	if (!nr_pages)
119462306a36Sopenharmony_ci		return 0;
119562306a36Sopenharmony_ci
119662306a36Sopenharmony_ci	start = untagged_addr_remote(mm, start);
119762306a36Sopenharmony_ci
119862306a36Sopenharmony_ci	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci	do {
120162306a36Sopenharmony_ci		struct page *page;
120262306a36Sopenharmony_ci		unsigned int foll_flags = gup_flags;
120362306a36Sopenharmony_ci		unsigned int page_increm;
120462306a36Sopenharmony_ci
120562306a36Sopenharmony_ci		/* first iteration or crossing a vma boundary */
120662306a36Sopenharmony_ci		if (!vma || start >= vma->vm_end) {
120762306a36Sopenharmony_ci			vma = gup_vma_lookup(mm, start);
120862306a36Sopenharmony_ci			if (!vma && in_gate_area(mm, start)) {
120962306a36Sopenharmony_ci				ret = get_gate_page(mm, start & PAGE_MASK,
121062306a36Sopenharmony_ci						gup_flags, &vma,
121162306a36Sopenharmony_ci						pages ? &page : NULL);
121262306a36Sopenharmony_ci				if (ret)
121362306a36Sopenharmony_ci					goto out;
121462306a36Sopenharmony_ci				ctx.page_mask = 0;
121562306a36Sopenharmony_ci				goto next_page;
121662306a36Sopenharmony_ci			}
121762306a36Sopenharmony_ci
121862306a36Sopenharmony_ci			if (!vma) {
121962306a36Sopenharmony_ci				ret = -EFAULT;
122062306a36Sopenharmony_ci				goto out;
122162306a36Sopenharmony_ci			}
122262306a36Sopenharmony_ci			ret = check_vma_flags(vma, gup_flags);
122362306a36Sopenharmony_ci			if (ret)
122462306a36Sopenharmony_ci				goto out;
122562306a36Sopenharmony_ci		}
122662306a36Sopenharmony_ciretry:
122762306a36Sopenharmony_ci		/*
122862306a36Sopenharmony_ci		 * If we have a pending SIGKILL, don't keep faulting pages and
122962306a36Sopenharmony_ci		 * potentially allocating memory.
123062306a36Sopenharmony_ci		 */
123162306a36Sopenharmony_ci		if (fatal_signal_pending(current)) {
123262306a36Sopenharmony_ci			ret = -EINTR;
123362306a36Sopenharmony_ci			goto out;
123462306a36Sopenharmony_ci		}
123562306a36Sopenharmony_ci		cond_resched();
123662306a36Sopenharmony_ci
123762306a36Sopenharmony_ci		page = follow_page_mask(vma, start, foll_flags, &ctx);
123862306a36Sopenharmony_ci		if (!page || PTR_ERR(page) == -EMLINK) {
123962306a36Sopenharmony_ci			ret = faultin_page(vma, start, &foll_flags,
124062306a36Sopenharmony_ci					   PTR_ERR(page) == -EMLINK, locked);
124162306a36Sopenharmony_ci			switch (ret) {
124262306a36Sopenharmony_ci			case 0:
124362306a36Sopenharmony_ci				goto retry;
124462306a36Sopenharmony_ci			case -EBUSY:
124562306a36Sopenharmony_ci			case -EAGAIN:
124662306a36Sopenharmony_ci				ret = 0;
124762306a36Sopenharmony_ci				fallthrough;
124862306a36Sopenharmony_ci			case -EFAULT:
124962306a36Sopenharmony_ci			case -ENOMEM:
125062306a36Sopenharmony_ci			case -EHWPOISON:
125162306a36Sopenharmony_ci				goto out;
125262306a36Sopenharmony_ci			}
125362306a36Sopenharmony_ci			BUG();
125462306a36Sopenharmony_ci		} else if (PTR_ERR(page) == -EEXIST) {
125562306a36Sopenharmony_ci			/*
125662306a36Sopenharmony_ci			 * Proper page table entry exists, but no corresponding
125762306a36Sopenharmony_ci			 * struct page. If the caller expects **pages to be
125862306a36Sopenharmony_ci			 * filled in, bail out now, because that can't be done
125962306a36Sopenharmony_ci			 * for this page.
126062306a36Sopenharmony_ci			 */
126162306a36Sopenharmony_ci			if (pages) {
126262306a36Sopenharmony_ci				ret = PTR_ERR(page);
126362306a36Sopenharmony_ci				goto out;
126462306a36Sopenharmony_ci			}
126562306a36Sopenharmony_ci		} else if (IS_ERR(page)) {
126662306a36Sopenharmony_ci			ret = PTR_ERR(page);
126762306a36Sopenharmony_ci			goto out;
126862306a36Sopenharmony_ci		}
126962306a36Sopenharmony_cinext_page:
127062306a36Sopenharmony_ci		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
127162306a36Sopenharmony_ci		if (page_increm > nr_pages)
127262306a36Sopenharmony_ci			page_increm = nr_pages;
127362306a36Sopenharmony_ci
127462306a36Sopenharmony_ci		if (pages) {
127562306a36Sopenharmony_ci			struct page *subpage;
127662306a36Sopenharmony_ci			unsigned int j;
127762306a36Sopenharmony_ci
127862306a36Sopenharmony_ci			/*
127962306a36Sopenharmony_ci			 * This must be a large folio (and doesn't need to
128062306a36Sopenharmony_ci			 * be the whole folio; it can be part of it); do
128162306a36Sopenharmony_ci			 * the refcount work for all the subpages too.
128262306a36Sopenharmony_ci			 *
128362306a36Sopenharmony_ci			 * NOTE: here the page may not be the head page
128462306a36Sopenharmony_ci			 * e.g. when start addr is not thp-size aligned.
128562306a36Sopenharmony_ci			 * try_grab_folio() should have taken care of tail
128662306a36Sopenharmony_ci			 * pages.
128762306a36Sopenharmony_ci			 */
128862306a36Sopenharmony_ci			if (page_increm > 1) {
128962306a36Sopenharmony_ci				struct folio *folio;
129062306a36Sopenharmony_ci
129162306a36Sopenharmony_ci				/*
129262306a36Sopenharmony_ci				 * Since we already hold refcount on the
129362306a36Sopenharmony_ci				 * large folio, this should never fail.
129462306a36Sopenharmony_ci				 */
129562306a36Sopenharmony_ci				folio = try_grab_folio(page, page_increm - 1,
129662306a36Sopenharmony_ci						       foll_flags);
129762306a36Sopenharmony_ci				if (WARN_ON_ONCE(!folio)) {
129862306a36Sopenharmony_ci					/*
129962306a36Sopenharmony_ci					 * Release the 1st page ref if the
130062306a36Sopenharmony_ci					 * folio is problematic, fail hard.
130162306a36Sopenharmony_ci					 */
130262306a36Sopenharmony_ci					gup_put_folio(page_folio(page), 1,
130362306a36Sopenharmony_ci						      foll_flags);
130462306a36Sopenharmony_ci					ret = -EFAULT;
130562306a36Sopenharmony_ci					goto out;
130662306a36Sopenharmony_ci				}
130762306a36Sopenharmony_ci			}
130862306a36Sopenharmony_ci
130962306a36Sopenharmony_ci			for (j = 0; j < page_increm; j++) {
131062306a36Sopenharmony_ci				subpage = nth_page(page, j);
131162306a36Sopenharmony_ci				pages[i + j] = subpage;
131262306a36Sopenharmony_ci				flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
131362306a36Sopenharmony_ci				flush_dcache_page(subpage);
131462306a36Sopenharmony_ci			}
131562306a36Sopenharmony_ci		}
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci		i += page_increm;
131862306a36Sopenharmony_ci		start += page_increm * PAGE_SIZE;
131962306a36Sopenharmony_ci		nr_pages -= page_increm;
132062306a36Sopenharmony_ci	} while (nr_pages);
132162306a36Sopenharmony_ciout:
132262306a36Sopenharmony_ci	if (ctx.pgmap)
132362306a36Sopenharmony_ci		put_dev_pagemap(ctx.pgmap);
132462306a36Sopenharmony_ci	return i ? i : ret;
132562306a36Sopenharmony_ci}
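
/*
 * Illustrative sketch (hypothetical helper): the caller-side rules from the
 * kernel-doc above. If pages are written to after being pinned, they must be
 * dirtied before being released; get_user_pages_fast(), set_page_dirty_lock()
 * and put_page() are the real interfaces.
 */
static int __maybe_unused gup_example_write_and_release(unsigned long start,
							int nr,
							struct page **pages)
{
	int i, got;

	got = get_user_pages_fast(start, nr, FOLL_WRITE, pages);
	if (got <= 0)
		return got;

	for (i = 0; i < got; i++) {
		/* ... write to the page, e.g. via kmap_local_page() ... */
		set_page_dirty_lock(pages[i]);
		put_page(pages[i]);
	}
	return got;
}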
132662306a36Sopenharmony_ci
132762306a36Sopenharmony_cistatic bool vma_permits_fault(struct vm_area_struct *vma,
132862306a36Sopenharmony_ci			      unsigned int fault_flags)
132962306a36Sopenharmony_ci{
133062306a36Sopenharmony_ci	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
133162306a36Sopenharmony_ci	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
133262306a36Sopenharmony_ci	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
133362306a36Sopenharmony_ci
133462306a36Sopenharmony_ci	if (!(vm_flags & vma->vm_flags))
133562306a36Sopenharmony_ci		return false;
133662306a36Sopenharmony_ci
133762306a36Sopenharmony_ci	/*
133862306a36Sopenharmony_ci	 * The architecture might have a hardware protection
133962306a36Sopenharmony_ci	 * mechanism other than read/write that can deny access.
134062306a36Sopenharmony_ci	 *
134162306a36Sopenharmony_ci	 * gup always represents data access, not instruction
134262306a36Sopenharmony_ci	 * fetches, so execute=false here:
134362306a36Sopenharmony_ci	 */
134462306a36Sopenharmony_ci	if (!arch_vma_access_permitted(vma, write, false, foreign))
134562306a36Sopenharmony_ci		return false;
134662306a36Sopenharmony_ci
134762306a36Sopenharmony_ci	return true;
134862306a36Sopenharmony_ci}
134962306a36Sopenharmony_ci
135062306a36Sopenharmony_ci/**
135162306a36Sopenharmony_ci * fixup_user_fault() - manually resolve a user page fault
135262306a36Sopenharmony_ci * @mm:		mm_struct of target mm
135362306a36Sopenharmony_ci * @address:	user address
135462306a36Sopenharmony_ci * @fault_flags: flags to pass down to handle_mm_fault()
135562306a36Sopenharmony_ci * @unlocked:	set to true if the mmap_lock was unlocked while retrying; may be
135662306a36Sopenharmony_ci *		NULL if the caller does not allow retry. If NULL, the caller must
135762306a36Sopenharmony_ci *		guarantee that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
135862306a36Sopenharmony_ci *
135962306a36Sopenharmony_ci * This is meant to be called in the specific scenario where for locking reasons
136062306a36Sopenharmony_ci * we try to access user memory in atomic context (within a pagefault_disable()
136162306a36Sopenharmony_ci * section), the access fails with -EFAULT, and we want to resolve the user
136262306a36Sopenharmony_ci * fault before trying again.
136362306a36Sopenharmony_ci *
136462306a36Sopenharmony_ci * Typically this is meant to be used by the futex code.
136562306a36Sopenharmony_ci *
136662306a36Sopenharmony_ci * The main difference with get_user_pages() is that this function will
136762306a36Sopenharmony_ci * unconditionally call handle_mm_fault() which will in turn perform all the
136862306a36Sopenharmony_ci * necessary SW fixup of the dirty and young bits in the PTE, while
136962306a36Sopenharmony_ci * get_user_pages() only guarantees to update these in the struct page.
137062306a36Sopenharmony_ci *
137162306a36Sopenharmony_ci * This is important for some architectures where those bits also gate the
137262306a36Sopenharmony_ci * access permission to the page because they are maintained in software.  On
137362306a36Sopenharmony_ci * such architectures, gup() will not be enough to make a subsequent access
137462306a36Sopenharmony_ci * succeed.
137562306a36Sopenharmony_ci *
137662306a36Sopenharmony_ci * This function will not return with an unlocked mmap_lock, so it does not
137762306a36Sopenharmony_ci * have the same semantics w.r.t. the @mm->mmap_lock as filemap_fault() does.
137862306a36Sopenharmony_ci */
137962306a36Sopenharmony_ciint fixup_user_fault(struct mm_struct *mm,
138062306a36Sopenharmony_ci		     unsigned long address, unsigned int fault_flags,
138162306a36Sopenharmony_ci		     bool *unlocked)
138262306a36Sopenharmony_ci{
138362306a36Sopenharmony_ci	struct vm_area_struct *vma;
138462306a36Sopenharmony_ci	vm_fault_t ret;
138562306a36Sopenharmony_ci
138662306a36Sopenharmony_ci	address = untagged_addr_remote(mm, address);
138762306a36Sopenharmony_ci
138862306a36Sopenharmony_ci	if (unlocked)
138962306a36Sopenharmony_ci		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
139062306a36Sopenharmony_ci
139162306a36Sopenharmony_ciretry:
139262306a36Sopenharmony_ci	vma = gup_vma_lookup(mm, address);
139362306a36Sopenharmony_ci	if (!vma)
139462306a36Sopenharmony_ci		return -EFAULT;
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci	if (!vma_permits_fault(vma, fault_flags))
139762306a36Sopenharmony_ci		return -EFAULT;
139862306a36Sopenharmony_ci
139962306a36Sopenharmony_ci	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
140062306a36Sopenharmony_ci	    fatal_signal_pending(current))
140162306a36Sopenharmony_ci		return -EINTR;
140262306a36Sopenharmony_ci
140362306a36Sopenharmony_ci	ret = handle_mm_fault(vma, address, fault_flags, NULL);
140462306a36Sopenharmony_ci
140562306a36Sopenharmony_ci	if (ret & VM_FAULT_COMPLETED) {
140662306a36Sopenharmony_ci		/*
140762306a36Sopenharmony_ci		 * NOTE: it's a pity that we need to retake the lock here
140862306a36Sopenharmony_ci		 * to pair with the unlock() in the callers. Ideally we
140962306a36Sopenharmony_ci		 * could tell the callers so they do not need to unlock.
141062306a36Sopenharmony_ci		 */
141162306a36Sopenharmony_ci		mmap_read_lock(mm);
141262306a36Sopenharmony_ci		*unlocked = true;
141362306a36Sopenharmony_ci		return 0;
141462306a36Sopenharmony_ci	}
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci	if (ret & VM_FAULT_ERROR) {
141762306a36Sopenharmony_ci		int err = vm_fault_to_errno(ret, 0);
141862306a36Sopenharmony_ci
141962306a36Sopenharmony_ci		if (err)
142062306a36Sopenharmony_ci			return err;
142162306a36Sopenharmony_ci		BUG();
142262306a36Sopenharmony_ci	}
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci	if (ret & VM_FAULT_RETRY) {
142562306a36Sopenharmony_ci		mmap_read_lock(mm);
142662306a36Sopenharmony_ci		*unlocked = true;
142762306a36Sopenharmony_ci		fault_flags |= FAULT_FLAG_TRIED;
142862306a36Sopenharmony_ci		goto retry;
142962306a36Sopenharmony_ci	}
143062306a36Sopenharmony_ci
143162306a36Sopenharmony_ci	return 0;
143262306a36Sopenharmony_ci}
143362306a36Sopenharmony_ciEXPORT_SYMBOL_GPL(fixup_user_fault);
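
/*
 * Illustrative sketch (hypothetical helper): the futex-style pattern that
 * fixup_user_fault() exists for. An access with page faults disabled failed
 * with -EFAULT, so the fault is resolved here and the caller retries the
 * atomic access afterwards.
 */
static int __maybe_unused gup_example_resolve_fault(unsigned long uaddr)
{
	struct mm_struct *mm = current->mm;
	bool unlocked = false;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, uaddr, FAULT_FLAG_WRITE, &unlocked);
	/*
	 * The lock is held again on return even if it was temporarily
	 * dropped; @unlocked tells us whether that happened.
	 */
	mmap_read_unlock(mm);
	return ret;
}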
143462306a36Sopenharmony_ci
143562306a36Sopenharmony_ci/*
143662306a36Sopenharmony_ci * GUP always responds to fatal signals.  When FOLL_INTERRUPTIBLE is
143762306a36Sopenharmony_ci * specified, it'll also respond to generic signals.  The caller of GUP
143862306a36Sopenharmony_ci * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
143962306a36Sopenharmony_ci */
144062306a36Sopenharmony_cistatic bool gup_signal_pending(unsigned int flags)
144162306a36Sopenharmony_ci{
144262306a36Sopenharmony_ci	if (fatal_signal_pending(current))
144362306a36Sopenharmony_ci		return true;
144462306a36Sopenharmony_ci
144562306a36Sopenharmony_ci	if (!(flags & FOLL_INTERRUPTIBLE))
144662306a36Sopenharmony_ci		return false;
144762306a36Sopenharmony_ci
144862306a36Sopenharmony_ci	return signal_pending(current);
144962306a36Sopenharmony_ci}
145062306a36Sopenharmony_ci
145162306a36Sopenharmony_ci/*
145262306a36Sopenharmony_ci * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
145362306a36Sopenharmony_ci * the caller. This function may drop the mmap_lock. If it does so, then it will
145462306a36Sopenharmony_ci * set (*locked = 0).
145562306a36Sopenharmony_ci *
145662306a36Sopenharmony_ci * (*locked == 0) means that the caller expects this function to acquire and
145762306a36Sopenharmony_ci * drop the mmap_lock. Therefore, the value of *locked will still be zero when
145862306a36Sopenharmony_ci * the function returns, even though it may have changed temporarily during
145962306a36Sopenharmony_ci * function execution.
146062306a36Sopenharmony_ci *
146162306a36Sopenharmony_ci * Please note that this function, unlike __get_user_pages(), will not return 0
146262306a36Sopenharmony_ci * for nr_pages > 0, unless FOLL_NOWAIT is used.
146362306a36Sopenharmony_ci */
146462306a36Sopenharmony_cistatic __always_inline long __get_user_pages_locked(struct mm_struct *mm,
146562306a36Sopenharmony_ci						unsigned long start,
146662306a36Sopenharmony_ci						unsigned long nr_pages,
146762306a36Sopenharmony_ci						struct page **pages,
146862306a36Sopenharmony_ci						int *locked,
146962306a36Sopenharmony_ci						unsigned int flags)
147062306a36Sopenharmony_ci{
147162306a36Sopenharmony_ci	long ret, pages_done;
147262306a36Sopenharmony_ci	bool must_unlock = false;
147362306a36Sopenharmony_ci
147462306a36Sopenharmony_ci	/*
147562306a36Sopenharmony_ci	 * The internal caller expects GUP to manage the lock internally and the
147662306a36Sopenharmony_ci	 * lock must be released when this returns.
147762306a36Sopenharmony_ci	 */
147862306a36Sopenharmony_ci	if (!*locked) {
147962306a36Sopenharmony_ci		if (mmap_read_lock_killable(mm))
148062306a36Sopenharmony_ci			return -EAGAIN;
148162306a36Sopenharmony_ci		must_unlock = true;
148262306a36Sopenharmony_ci		*locked = 1;
148362306a36Sopenharmony_ci	} else
148562306a36Sopenharmony_ci		mmap_assert_locked(mm);
148662306a36Sopenharmony_ci
148762306a36Sopenharmony_ci	if (flags & FOLL_PIN)
148862306a36Sopenharmony_ci		mm_set_has_pinned_flag(&mm->flags);
148962306a36Sopenharmony_ci
149062306a36Sopenharmony_ci	/*
149162306a36Sopenharmony_ci	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
149262306a36Sopenharmony_ci	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
149362306a36Sopenharmony_ci	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
149462306a36Sopenharmony_ci	 * for FOLL_GET, not for the newer FOLL_PIN.
149562306a36Sopenharmony_ci	 *
149662306a36Sopenharmony_ci	 * FOLL_PIN always expects pages to be non-null, but no need to assert
149762306a36Sopenharmony_ci	 * that here, as any failures will be obvious enough.
149862306a36Sopenharmony_ci	 */
149962306a36Sopenharmony_ci	if (pages && !(flags & FOLL_PIN))
150062306a36Sopenharmony_ci		flags |= FOLL_GET;
150162306a36Sopenharmony_ci
150262306a36Sopenharmony_ci	pages_done = 0;
150362306a36Sopenharmony_ci	for (;;) {
150462306a36Sopenharmony_ci		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
150562306a36Sopenharmony_ci				       locked);
150662306a36Sopenharmony_ci		if (!(flags & FOLL_UNLOCKABLE)) {
150762306a36Sopenharmony_ci			/* VM_FAULT_RETRY couldn't trigger, bypass */
150862306a36Sopenharmony_ci			pages_done = ret;
150962306a36Sopenharmony_ci			break;
151062306a36Sopenharmony_ci		}
151162306a36Sopenharmony_ci
151262306a36Sopenharmony_ci		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
151362306a36Sopenharmony_ci		if (!*locked) {
151462306a36Sopenharmony_ci			BUG_ON(ret < 0);
151562306a36Sopenharmony_ci			BUG_ON(ret >= nr_pages);
151662306a36Sopenharmony_ci		}
151762306a36Sopenharmony_ci
151862306a36Sopenharmony_ci		if (ret > 0) {
151962306a36Sopenharmony_ci			nr_pages -= ret;
152062306a36Sopenharmony_ci			pages_done += ret;
152162306a36Sopenharmony_ci			if (!nr_pages)
152262306a36Sopenharmony_ci				break;
152362306a36Sopenharmony_ci		}
152462306a36Sopenharmony_ci		if (*locked) {
152562306a36Sopenharmony_ci			/*
152662306a36Sopenharmony_ci			 * VM_FAULT_RETRY didn't trigger or it was a
152762306a36Sopenharmony_ci			 * FOLL_NOWAIT.
152862306a36Sopenharmony_ci			 */
152962306a36Sopenharmony_ci			if (!pages_done)
153062306a36Sopenharmony_ci				pages_done = ret;
153162306a36Sopenharmony_ci			break;
153262306a36Sopenharmony_ci		}
153362306a36Sopenharmony_ci		/*
153462306a36Sopenharmony_ci		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
153562306a36Sopenharmony_ci		 * For the prefault case (!pages) we only update counts.
153662306a36Sopenharmony_ci		 */
153762306a36Sopenharmony_ci		if (likely(pages))
153862306a36Sopenharmony_ci			pages += ret;
153962306a36Sopenharmony_ci		start += ret << PAGE_SHIFT;
154062306a36Sopenharmony_ci
154162306a36Sopenharmony_ci		/* The lock was temporarily dropped, so we must unlock later */
154262306a36Sopenharmony_ci		must_unlock = true;
154362306a36Sopenharmony_ci
154462306a36Sopenharmony_ciretry:
154562306a36Sopenharmony_ci		/*
154662306a36Sopenharmony_ci		 * Repeat on the address that fired VM_FAULT_RETRY
154762306a36Sopenharmony_ci		 * with both FAULT_FLAG_ALLOW_RETRY and
154862306a36Sopenharmony_ci		 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
154962306a36Sopenharmony_ci		 * by fatal signals or even common signals, depending on
155062306a36Sopenharmony_ci		 * the caller's request. So we need to check it before we
155162306a36Sopenharmony_ci		 * start trying again otherwise it can loop forever.
155262306a36Sopenharmony_ci		 */
155362306a36Sopenharmony_ci		if (gup_signal_pending(flags)) {
155462306a36Sopenharmony_ci			if (!pages_done)
155562306a36Sopenharmony_ci				pages_done = -EINTR;
155662306a36Sopenharmony_ci			break;
155762306a36Sopenharmony_ci		}
155862306a36Sopenharmony_ci
155962306a36Sopenharmony_ci		ret = mmap_read_lock_killable(mm);
156062306a36Sopenharmony_ci		if (ret) {
156162306a36Sopenharmony_ci			BUG_ON(ret > 0);
156262306a36Sopenharmony_ci			if (!pages_done)
156362306a36Sopenharmony_ci				pages_done = ret;
156462306a36Sopenharmony_ci			break;
156562306a36Sopenharmony_ci		}
156662306a36Sopenharmony_ci
156762306a36Sopenharmony_ci		*locked = 1;
156862306a36Sopenharmony_ci		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
156962306a36Sopenharmony_ci				       pages, locked);
157062306a36Sopenharmony_ci		if (!*locked) {
157162306a36Sopenharmony_ci			/* Continue to retry until we succeed */
157262306a36Sopenharmony_ci			BUG_ON(ret != 0);
157362306a36Sopenharmony_ci			goto retry;
157462306a36Sopenharmony_ci		}
157562306a36Sopenharmony_ci		if (ret != 1) {
157662306a36Sopenharmony_ci			BUG_ON(ret > 1);
157762306a36Sopenharmony_ci			if (!pages_done)
157862306a36Sopenharmony_ci				pages_done = ret;
157962306a36Sopenharmony_ci			break;
158062306a36Sopenharmony_ci		}
158162306a36Sopenharmony_ci		nr_pages--;
158262306a36Sopenharmony_ci		pages_done++;
158362306a36Sopenharmony_ci		if (!nr_pages)
158462306a36Sopenharmony_ci			break;
158562306a36Sopenharmony_ci		if (likely(pages))
158662306a36Sopenharmony_ci			pages++;
158762306a36Sopenharmony_ci		start += PAGE_SIZE;
158862306a36Sopenharmony_ci	}
158962306a36Sopenharmony_ci	if (must_unlock && *locked) {
159062306a36Sopenharmony_ci		/*
159162306a36Sopenharmony_ci		 * We either temporarily dropped the lock, or the caller
159262306a36Sopenharmony_ci		 * requested that we both acquire and drop the lock. Either way,
159362306a36Sopenharmony_ci		 * we must now unlock, and notify the caller of that state.
159462306a36Sopenharmony_ci		 */
159562306a36Sopenharmony_ci		mmap_read_unlock(mm);
159662306a36Sopenharmony_ci		*locked = 0;
159762306a36Sopenharmony_ci	}
159862306a36Sopenharmony_ci	return pages_done;
159962306a36Sopenharmony_ci}
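
/*
 * Illustrative sketch (hypothetical helper): the "caller holds the lock, GUP
 * may drop it" convention described above, as seen by an external caller of
 * get_user_pages_remote().
 */
static long __maybe_unused gup_example_remote_pin(struct mm_struct *mm,
						  unsigned long start,
						  struct page **pages)
{
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	ret = get_user_pages_remote(mm, start, 1, FOLL_WRITE, pages, &locked);
	if (locked)
		mmap_read_unlock(mm);	/* GUP did not drop the lock. */
	return ret;
}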
160062306a36Sopenharmony_ci
160162306a36Sopenharmony_ci/**
160262306a36Sopenharmony_ci * populate_vma_page_range() -  populate a range of pages in the vma.
160362306a36Sopenharmony_ci * @vma:   target vma
160462306a36Sopenharmony_ci * @start: start address
160562306a36Sopenharmony_ci * @end:   end address
160662306a36Sopenharmony_ci * @locked: whether the mmap_lock is still held
160762306a36Sopenharmony_ci *
160862306a36Sopenharmony_ci * This takes care of mlocking the pages too if VM_LOCKED is set.
160962306a36Sopenharmony_ci *
161062306a36Sopenharmony_ci * Returns either the number of pages pinned in the vma, or a negative error
161162306a36Sopenharmony_ci * code on error.
161262306a36Sopenharmony_ci *
161362306a36Sopenharmony_ci * vma->vm_mm->mmap_lock must be held.
161462306a36Sopenharmony_ci *
161562306a36Sopenharmony_ci * If @locked is NULL, it may be held for read or write and will
161662306a36Sopenharmony_ci * be unperturbed.
161762306a36Sopenharmony_ci *
161862306a36Sopenharmony_ci * If @locked is non-NULL, it must be held for read only and may be
161962306a36Sopenharmony_ci * released.  If it's released, *@locked will be set to 0.
162062306a36Sopenharmony_ci */
162162306a36Sopenharmony_cilong populate_vma_page_range(struct vm_area_struct *vma,
162262306a36Sopenharmony_ci		unsigned long start, unsigned long end, int *locked)
162362306a36Sopenharmony_ci{
162462306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
162562306a36Sopenharmony_ci	unsigned long nr_pages = (end - start) / PAGE_SIZE;
162662306a36Sopenharmony_ci	int local_locked = 1;
162762306a36Sopenharmony_ci	int gup_flags;
162862306a36Sopenharmony_ci	long ret;
162962306a36Sopenharmony_ci
163062306a36Sopenharmony_ci	VM_BUG_ON(!PAGE_ALIGNED(start));
163162306a36Sopenharmony_ci	VM_BUG_ON(!PAGE_ALIGNED(end));
163262306a36Sopenharmony_ci	VM_BUG_ON_VMA(start < vma->vm_start, vma);
163362306a36Sopenharmony_ci	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
163462306a36Sopenharmony_ci	mmap_assert_locked(mm);
163562306a36Sopenharmony_ci
163662306a36Sopenharmony_ci	/*
163762306a36Sopenharmony_ci	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
163862306a36Sopenharmony_ci	 * faultin_page() to break COW, so it has no work to do here.
163962306a36Sopenharmony_ci	 */
164062306a36Sopenharmony_ci	if (vma->vm_flags & VM_LOCKONFAULT)
164162306a36Sopenharmony_ci		return nr_pages;
164262306a36Sopenharmony_ci
164362306a36Sopenharmony_ci	gup_flags = FOLL_TOUCH;
164462306a36Sopenharmony_ci	/*
164562306a36Sopenharmony_ci	 * We want to touch writable mappings with a write fault in order
164662306a36Sopenharmony_ci	 * to break COW, except for shared mappings because these don't COW
164762306a36Sopenharmony_ci	 * and we would not want to dirty them for nothing.
164862306a36Sopenharmony_ci	 */
164962306a36Sopenharmony_ci	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
165062306a36Sopenharmony_ci		gup_flags |= FOLL_WRITE;
165162306a36Sopenharmony_ci
165262306a36Sopenharmony_ci	/*
165362306a36Sopenharmony_ci	 * We want mlock to succeed for regions that have any permissions
165462306a36Sopenharmony_ci	 * other than PROT_NONE.
165562306a36Sopenharmony_ci	 */
165662306a36Sopenharmony_ci	if (vma_is_accessible(vma))
165762306a36Sopenharmony_ci		gup_flags |= FOLL_FORCE;
165862306a36Sopenharmony_ci
165962306a36Sopenharmony_ci	if (locked)
166062306a36Sopenharmony_ci		gup_flags |= FOLL_UNLOCKABLE;
166162306a36Sopenharmony_ci
166262306a36Sopenharmony_ci	/*
166362306a36Sopenharmony_ci	 * We made sure addr is within a VMA, so the following will
166462306a36Sopenharmony_ci	 * not result in a stack expansion that recurses back here.
166562306a36Sopenharmony_ci	 */
166662306a36Sopenharmony_ci	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
166762306a36Sopenharmony_ci			       NULL, locked ? locked : &local_locked);
166862306a36Sopenharmony_ci	lru_add_drain();
166962306a36Sopenharmony_ci	return ret;
167062306a36Sopenharmony_ci}
167162306a36Sopenharmony_ci
167262306a36Sopenharmony_ci/*
167362306a36Sopenharmony_ci * faultin_vma_page_range() - populate (prefault) page tables inside the
167462306a36Sopenharmony_ci *			      given VMA range readable/writable
167562306a36Sopenharmony_ci *
167662306a36Sopenharmony_ci * This takes care of mlocking the pages, too, if VM_LOCKED is set.
167762306a36Sopenharmony_ci *
167862306a36Sopenharmony_ci * @vma: target vma
167962306a36Sopenharmony_ci * @start: start address
168062306a36Sopenharmony_ci * @end: end address
168162306a36Sopenharmony_ci * @write: whether to prefault readable or writable
168262306a36Sopenharmony_ci * @locked: whether the mmap_lock is still held
168362306a36Sopenharmony_ci *
168462306a36Sopenharmony_ci * Returns either the number of processed pages in the vma, or a negative error
168562306a36Sopenharmony_ci * code on error (see __get_user_pages()).
168662306a36Sopenharmony_ci *
168762306a36Sopenharmony_ci * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
168862306a36Sopenharmony_ci * covered by the VMA. If the mmap_lock is released, *@locked will be set to 0.
168962306a36Sopenharmony_ci */
169062306a36Sopenharmony_cilong faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
169162306a36Sopenharmony_ci			    unsigned long end, bool write, int *locked)
169262306a36Sopenharmony_ci{
169362306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
169462306a36Sopenharmony_ci	unsigned long nr_pages = (end - start) / PAGE_SIZE;
169562306a36Sopenharmony_ci	int gup_flags;
169662306a36Sopenharmony_ci	long ret;
169762306a36Sopenharmony_ci
169862306a36Sopenharmony_ci	VM_BUG_ON(!PAGE_ALIGNED(start));
169962306a36Sopenharmony_ci	VM_BUG_ON(!PAGE_ALIGNED(end));
170062306a36Sopenharmony_ci	VM_BUG_ON_VMA(start < vma->vm_start, vma);
170162306a36Sopenharmony_ci	VM_BUG_ON_VMA(end > vma->vm_end, vma);
170262306a36Sopenharmony_ci	mmap_assert_locked(mm);
170362306a36Sopenharmony_ci
170462306a36Sopenharmony_ci	/*
170562306a36Sopenharmony_ci	 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
170662306a36Sopenharmony_ci	 *	       the page dirty with FOLL_WRITE -- which doesn't make a
170762306a36Sopenharmony_ci	 *	       difference with !FOLL_FORCE, because the page is writable
170862306a36Sopenharmony_ci	 *	       in the page table.
170962306a36Sopenharmony_ci	 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
171062306a36Sopenharmony_ci	 *		  a poisoned page.
171162306a36Sopenharmony_ci	 * !FOLL_FORCE: Require proper access permissions.
171262306a36Sopenharmony_ci	 */
171362306a36Sopenharmony_ci	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
171462306a36Sopenharmony_ci	if (write)
171562306a36Sopenharmony_ci		gup_flags |= FOLL_WRITE;
171662306a36Sopenharmony_ci
171762306a36Sopenharmony_ci	/*
171862306a36Sopenharmony_ci	 * We want to report -EINVAL instead of -EFAULT for any permission
171962306a36Sopenharmony_ci	 * problems or incompatible mappings.
172062306a36Sopenharmony_ci	 */
172162306a36Sopenharmony_ci	if (check_vma_flags(vma, gup_flags))
172262306a36Sopenharmony_ci		return -EINVAL;
172362306a36Sopenharmony_ci
172462306a36Sopenharmony_ci	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
172562306a36Sopenharmony_ci			       NULL, locked);
172662306a36Sopenharmony_ci	lru_add_drain();
172762306a36Sopenharmony_ci	return ret;
172862306a36Sopenharmony_ci}
172962306a36Sopenharmony_ci
173062306a36Sopenharmony_ci/*
173162306a36Sopenharmony_ci * __mm_populate - populate and/or mlock pages within a range of address space.
173262306a36Sopenharmony_ci *
173362306a36Sopenharmony_ci * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
173462306a36Sopenharmony_ci * flags. VMAs must already be marked with the desired vm_flags, and
173562306a36Sopenharmony_ci * mmap_lock must not be held.
173662306a36Sopenharmony_ci */
173762306a36Sopenharmony_ciint __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
173862306a36Sopenharmony_ci{
173962306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
174062306a36Sopenharmony_ci	unsigned long end, nstart, nend;
174162306a36Sopenharmony_ci	struct vm_area_struct *vma = NULL;
174262306a36Sopenharmony_ci	int locked = 0;
174362306a36Sopenharmony_ci	long ret = 0;
174462306a36Sopenharmony_ci
174562306a36Sopenharmony_ci	end = start + len;
174662306a36Sopenharmony_ci
174762306a36Sopenharmony_ci	for (nstart = start; nstart < end; nstart = nend) {
174862306a36Sopenharmony_ci		/*
174962306a36Sopenharmony_ci		 * We want to fault in pages for [nstart; end) address range.
175062306a36Sopenharmony_ci		 * Find first corresponding VMA.
175162306a36Sopenharmony_ci		 */
175262306a36Sopenharmony_ci		if (!locked) {
175362306a36Sopenharmony_ci			locked = 1;
175462306a36Sopenharmony_ci			mmap_read_lock(mm);
175562306a36Sopenharmony_ci			vma = find_vma_intersection(mm, nstart, end);
175662306a36Sopenharmony_ci		} else if (nstart >= vma->vm_end)
175762306a36Sopenharmony_ci			vma = find_vma_intersection(mm, vma->vm_end, end);
175862306a36Sopenharmony_ci
175962306a36Sopenharmony_ci		if (!vma)
176062306a36Sopenharmony_ci			break;
176162306a36Sopenharmony_ci		/*
176262306a36Sopenharmony_ci		 * Set [nstart; nend) to intersection of desired address
176362306a36Sopenharmony_ci		 * range with the first VMA. Also, skip undesirable VMA types.
176462306a36Sopenharmony_ci		 */
176562306a36Sopenharmony_ci		nend = min(end, vma->vm_end);
176662306a36Sopenharmony_ci		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
176762306a36Sopenharmony_ci			continue;
176862306a36Sopenharmony_ci		if (nstart < vma->vm_start)
176962306a36Sopenharmony_ci			nstart = vma->vm_start;
177062306a36Sopenharmony_ci		/*
177162306a36Sopenharmony_ci		 * Now fault in a range of pages. populate_vma_page_range()
177262306a36Sopenharmony_ci		 * double checks the vma flags, so that it won't mlock pages
177362306a36Sopenharmony_ci		 * if the vma was already munlocked.
177462306a36Sopenharmony_ci		 */
177562306a36Sopenharmony_ci		ret = populate_vma_page_range(vma, nstart, nend, &locked);
177662306a36Sopenharmony_ci		if (ret < 0) {
177762306a36Sopenharmony_ci			if (ignore_errors) {
177862306a36Sopenharmony_ci				ret = 0;
177962306a36Sopenharmony_ci				continue;	/* continue at next VMA */
178062306a36Sopenharmony_ci			}
178162306a36Sopenharmony_ci			break;
178262306a36Sopenharmony_ci		}
178362306a36Sopenharmony_ci		nend = nstart + ret * PAGE_SIZE;
178462306a36Sopenharmony_ci		ret = 0;
178562306a36Sopenharmony_ci	}
178662306a36Sopenharmony_ci	if (locked)
178762306a36Sopenharmony_ci		mmap_read_unlock(mm);
178862306a36Sopenharmony_ci	return ret;	/* 0 or negative error code */
178962306a36Sopenharmony_ci}
179062306a36Sopenharmony_ci#else /* CONFIG_MMU */
179162306a36Sopenharmony_cistatic long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
179262306a36Sopenharmony_ci		unsigned long nr_pages, struct page **pages,
179362306a36Sopenharmony_ci		int *locked, unsigned int foll_flags)
179462306a36Sopenharmony_ci{
179562306a36Sopenharmony_ci	struct vm_area_struct *vma;
179662306a36Sopenharmony_ci	bool must_unlock = false;
179762306a36Sopenharmony_ci	unsigned long vm_flags;
179862306a36Sopenharmony_ci	long i;
179962306a36Sopenharmony_ci
180062306a36Sopenharmony_ci	if (!nr_pages)
180162306a36Sopenharmony_ci		return 0;
180262306a36Sopenharmony_ci
180362306a36Sopenharmony_ci	/*
180462306a36Sopenharmony_ci	 * The internal caller expects GUP to manage the lock internally and the
180562306a36Sopenharmony_ci	 * lock must be released when this returns.
180662306a36Sopenharmony_ci	 */
180762306a36Sopenharmony_ci	if (!*locked) {
180862306a36Sopenharmony_ci		if (mmap_read_lock_killable(mm))
180962306a36Sopenharmony_ci			return -EAGAIN;
181062306a36Sopenharmony_ci		must_unlock = true;
181162306a36Sopenharmony_ci		*locked = 1;
181262306a36Sopenharmony_ci	}
181362306a36Sopenharmony_ci
181462306a36Sopenharmony_ci	/* Calculate required read or write permissions.
181562306a36Sopenharmony_ci	 * If FOLL_FORCE is set, we only require the "MAY" flags.
181662306a36Sopenharmony_ci	 */
181762306a36Sopenharmony_ci	vm_flags  = (foll_flags & FOLL_WRITE) ?
181862306a36Sopenharmony_ci			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
181962306a36Sopenharmony_ci	vm_flags &= (foll_flags & FOLL_FORCE) ?
182062306a36Sopenharmony_ci			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
182162306a36Sopenharmony_ci
182262306a36Sopenharmony_ci	for (i = 0; i < nr_pages; i++) {
182362306a36Sopenharmony_ci		vma = find_vma(mm, start);
182462306a36Sopenharmony_ci		if (!vma)
182562306a36Sopenharmony_ci			break;
182662306a36Sopenharmony_ci
182762306a36Sopenharmony_ci		/* protect what we can, including chardevs */
182862306a36Sopenharmony_ci		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
182962306a36Sopenharmony_ci		    !(vm_flags & vma->vm_flags))
183062306a36Sopenharmony_ci			break;
183162306a36Sopenharmony_ci
183262306a36Sopenharmony_ci		if (pages) {
183362306a36Sopenharmony_ci			pages[i] = virt_to_page((void *)start);
183462306a36Sopenharmony_ci			if (pages[i])
183562306a36Sopenharmony_ci				get_page(pages[i]);
183662306a36Sopenharmony_ci		}
183762306a36Sopenharmony_ci
183862306a36Sopenharmony_ci		start = (start + PAGE_SIZE) & PAGE_MASK;
183962306a36Sopenharmony_ci	}
184062306a36Sopenharmony_ci
184162306a36Sopenharmony_ci	if (must_unlock && *locked) {
184262306a36Sopenharmony_ci		mmap_read_unlock(mm);
184362306a36Sopenharmony_ci		*locked = 0;
184462306a36Sopenharmony_ci	}
184562306a36Sopenharmony_ci
184662306a36Sopenharmony_ci	return i ? : -EFAULT;
184762306a36Sopenharmony_ci}
184862306a36Sopenharmony_ci#endif /* !CONFIG_MMU */
184962306a36Sopenharmony_ci
185062306a36Sopenharmony_ci/**
185162306a36Sopenharmony_ci * fault_in_writeable - fault in userspace address range for writing
185262306a36Sopenharmony_ci * @uaddr: start of address range
185362306a36Sopenharmony_ci * @size: size of address range
185462306a36Sopenharmony_ci *
185562306a36Sopenharmony_ci * Returns the number of bytes not faulted in (like copy_to_user() and
185662306a36Sopenharmony_ci * copy_from_user()).
185762306a36Sopenharmony_ci */
185862306a36Sopenharmony_cisize_t fault_in_writeable(char __user *uaddr, size_t size)
185962306a36Sopenharmony_ci{
186062306a36Sopenharmony_ci	char __user *start = uaddr, *end;
186162306a36Sopenharmony_ci
186262306a36Sopenharmony_ci	if (unlikely(size == 0))
186362306a36Sopenharmony_ci		return 0;
186462306a36Sopenharmony_ci	if (!user_write_access_begin(uaddr, size))
186562306a36Sopenharmony_ci		return size;
186662306a36Sopenharmony_ci	if (!PAGE_ALIGNED(uaddr)) {
186762306a36Sopenharmony_ci		unsafe_put_user(0, uaddr, out);
186862306a36Sopenharmony_ci		uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
186962306a36Sopenharmony_ci	}
187062306a36Sopenharmony_ci	end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
187162306a36Sopenharmony_ci	if (unlikely(end < start))
187262306a36Sopenharmony_ci		end = NULL;
187362306a36Sopenharmony_ci	while (uaddr != end) {
187462306a36Sopenharmony_ci		unsafe_put_user(0, uaddr, out);
187562306a36Sopenharmony_ci		uaddr += PAGE_SIZE;
187662306a36Sopenharmony_ci	}
187762306a36Sopenharmony_ci
187862306a36Sopenharmony_ciout:
187962306a36Sopenharmony_ci	user_write_access_end();
188062306a36Sopenharmony_ci	if (size > uaddr - start)
188162306a36Sopenharmony_ci		return size - (uaddr - start);
188262306a36Sopenharmony_ci	return 0;
188362306a36Sopenharmony_ci}
188462306a36Sopenharmony_ciEXPORT_SYMBOL(fault_in_writeable);
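
/*
 * Illustrative sketch (hypothetical helper): the retry pattern that
 * fault_in_writeable() supports, e.g. around a copy performed with page
 * faults disabled. The loop gives up once faulting in makes no progress.
 */
static int __maybe_unused gup_example_copy_to_user(char __user *dst,
						   const char *src, size_t len)
{
	size_t left = len;

	do {
		pagefault_disable();
		left = copy_to_user(dst + len - left, src + len - left, left);
		pagefault_enable();
		if (!left)
			return 0;
	} while (fault_in_writeable(dst + len - left, left) != left);

	return -EFAULT;
}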
188562306a36Sopenharmony_ci
188662306a36Sopenharmony_ci/**
188762306a36Sopenharmony_ci * fault_in_subpage_writeable - fault in an address range for writing
188862306a36Sopenharmony_ci * @uaddr: start of address range
188962306a36Sopenharmony_ci * @size: size of address range
189062306a36Sopenharmony_ci *
189162306a36Sopenharmony_ci * Fault in a user address range for writing while checking for permissions at
189262306a36Sopenharmony_ci * sub-page granularity (e.g. arm64 MTE). This function should be used when
189362306a36Sopenharmony_ci * the caller cannot guarantee forward progress of a copy_to_user() loop.
189462306a36Sopenharmony_ci *
189562306a36Sopenharmony_ci * Returns the number of bytes not faulted in (like copy_to_user() and
189662306a36Sopenharmony_ci * copy_from_user()).
189762306a36Sopenharmony_ci */
189862306a36Sopenharmony_cisize_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
189962306a36Sopenharmony_ci{
190062306a36Sopenharmony_ci	size_t faulted_in;
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci	/*
190362306a36Sopenharmony_ci	 * Attempt faulting in at page granularity first for page table
190462306a36Sopenharmony_ci	 * permission checking. The arch-specific probe_subpage_writeable()
190562306a36Sopenharmony_ci	 * functions may not check for this.
190662306a36Sopenharmony_ci	 */
190762306a36Sopenharmony_ci	faulted_in = size - fault_in_writeable(uaddr, size);
190862306a36Sopenharmony_ci	if (faulted_in)
190962306a36Sopenharmony_ci		faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
191062306a36Sopenharmony_ci
191162306a36Sopenharmony_ci	return size - faulted_in;
191262306a36Sopenharmony_ci}
191362306a36Sopenharmony_ciEXPORT_SYMBOL(fault_in_subpage_writeable);
191462306a36Sopenharmony_ci
191562306a36Sopenharmony_ci/*
191662306a36Sopenharmony_ci * fault_in_safe_writeable - fault in an address range for writing
191762306a36Sopenharmony_ci * @uaddr: start of address range
191862306a36Sopenharmony_ci * @size: length of address range
191962306a36Sopenharmony_ci *
192062306a36Sopenharmony_ci * Faults in an address range for writing.  This is primarily useful when we
192162306a36Sopenharmony_ci * already know that some or all of the pages in the address range aren't in
192262306a36Sopenharmony_ci * memory.
192362306a36Sopenharmony_ci *
192462306a36Sopenharmony_ci * Unlike fault_in_writeable(), this function is non-destructive.
192562306a36Sopenharmony_ci *
192662306a36Sopenharmony_ci * Note that we don't pin or otherwise hold the pages that we fault in.
192762306a36Sopenharmony_ci * There's no guarantee that they'll stay in memory for any duration of
192862306a36Sopenharmony_ci * time.
192962306a36Sopenharmony_ci *
193062306a36Sopenharmony_ci * Returns the number of bytes not faulted in, like copy_to_user() and
193162306a36Sopenharmony_ci * copy_from_user().
193262306a36Sopenharmony_ci */
193362306a36Sopenharmony_cisize_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
193462306a36Sopenharmony_ci{
193562306a36Sopenharmony_ci	unsigned long start = (unsigned long)uaddr, end;
193662306a36Sopenharmony_ci	struct mm_struct *mm = current->mm;
193762306a36Sopenharmony_ci	bool unlocked = false;
193862306a36Sopenharmony_ci
193962306a36Sopenharmony_ci	if (unlikely(size == 0))
194062306a36Sopenharmony_ci		return 0;
194162306a36Sopenharmony_ci	end = PAGE_ALIGN(start + size);
194262306a36Sopenharmony_ci	if (end < start)
194362306a36Sopenharmony_ci		end = 0;
194462306a36Sopenharmony_ci
194562306a36Sopenharmony_ci	mmap_read_lock(mm);
194662306a36Sopenharmony_ci	do {
194762306a36Sopenharmony_ci		if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
194862306a36Sopenharmony_ci			break;
194962306a36Sopenharmony_ci		start = (start + PAGE_SIZE) & PAGE_MASK;
195062306a36Sopenharmony_ci	} while (start != end);
195162306a36Sopenharmony_ci	mmap_read_unlock(mm);
195262306a36Sopenharmony_ci
195362306a36Sopenharmony_ci	if (size > start - (unsigned long)uaddr)
195462306a36Sopenharmony_ci		return size - (start - (unsigned long)uaddr);
195562306a36Sopenharmony_ci	return 0;
195662306a36Sopenharmony_ci}
195762306a36Sopenharmony_ciEXPORT_SYMBOL(fault_in_safe_writeable);
195862306a36Sopenharmony_ci
195962306a36Sopenharmony_ci/**
196062306a36Sopenharmony_ci * fault_in_readable - fault in userspace address range for reading
196162306a36Sopenharmony_ci * @uaddr: start of user address range
196262306a36Sopenharmony_ci * @size: size of user address range
196362306a36Sopenharmony_ci *
196462306a36Sopenharmony_ci * Returns the number of bytes not faulted in (like copy_to_user() and
196562306a36Sopenharmony_ci * copy_from_user()).
196662306a36Sopenharmony_ci */
196762306a36Sopenharmony_cisize_t fault_in_readable(const char __user *uaddr, size_t size)
196862306a36Sopenharmony_ci{
196962306a36Sopenharmony_ci	const char __user *start = uaddr, *end;
197062306a36Sopenharmony_ci	volatile char c;
197162306a36Sopenharmony_ci
197262306a36Sopenharmony_ci	if (unlikely(size == 0))
197362306a36Sopenharmony_ci		return 0;
197462306a36Sopenharmony_ci	if (!user_read_access_begin(uaddr, size))
197562306a36Sopenharmony_ci		return size;
197662306a36Sopenharmony_ci	if (!PAGE_ALIGNED(uaddr)) {
197762306a36Sopenharmony_ci		unsafe_get_user(c, uaddr, out);
197862306a36Sopenharmony_ci		uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
197962306a36Sopenharmony_ci	}
198062306a36Sopenharmony_ci	end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
198162306a36Sopenharmony_ci	if (unlikely(end < start))
198262306a36Sopenharmony_ci		end = NULL;
198362306a36Sopenharmony_ci	while (uaddr != end) {
198462306a36Sopenharmony_ci		unsafe_get_user(c, uaddr, out);
198562306a36Sopenharmony_ci		uaddr += PAGE_SIZE;
198662306a36Sopenharmony_ci	}
198762306a36Sopenharmony_ci
198862306a36Sopenharmony_ciout:
198962306a36Sopenharmony_ci	user_read_access_end();
199062306a36Sopenharmony_ci	(void)c;
199162306a36Sopenharmony_ci	if (size > uaddr - start)
199262306a36Sopenharmony_ci		return size - (uaddr - start);
199362306a36Sopenharmony_ci	return 0;
199462306a36Sopenharmony_ci}
199562306a36Sopenharmony_ciEXPORT_SYMBOL(fault_in_readable);
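
/*
 * Illustrative sketch (hypothetical helper): the read-side counterpart, e.g.
 * for a write(2)-style path that copies user data with page faults disabled
 * and faults the source in on failure.
 */
static int __maybe_unused gup_example_copy_from_user(char *dst,
						     const char __user *src,
						     size_t len)
{
	size_t left = len;

	do {
		pagefault_disable();
		left = copy_from_user(dst + len - left, src + len - left, left);
		pagefault_enable();
		if (!left)
			return 0;
	} while (fault_in_readable(src + len - left, left) != left);

	return -EFAULT;
}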
199662306a36Sopenharmony_ci
199762306a36Sopenharmony_ci/**
199862306a36Sopenharmony_ci * get_dump_page() - pin user page in memory while writing it to core dump
199962306a36Sopenharmony_ci * @addr: user address
200062306a36Sopenharmony_ci *
200162306a36Sopenharmony_ci * Returns the struct page pointer of the user page pinned for dump,
200262306a36Sopenharmony_ci * to be freed afterwards by put_page().
200362306a36Sopenharmony_ci *
200462306a36Sopenharmony_ci * Returns NULL on any kind of failure - a hole must then be inserted into
200562306a36Sopenharmony_ci * the corefile, to preserve alignment with its headers; and also returns
200662306a36Sopenharmony_ci * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
200762306a36Sopenharmony_ci * allowing a hole to be left in the corefile to save disk space.
200862306a36Sopenharmony_ci *
200962306a36Sopenharmony_ci * Called without mmap_lock (takes and releases the mmap_lock by itself).
201062306a36Sopenharmony_ci */
201162306a36Sopenharmony_ci#ifdef CONFIG_ELF_CORE
201262306a36Sopenharmony_cistruct page *get_dump_page(unsigned long addr)
201362306a36Sopenharmony_ci{
201462306a36Sopenharmony_ci	struct page *page;
201562306a36Sopenharmony_ci	int locked = 0;
201662306a36Sopenharmony_ci	int ret;
201762306a36Sopenharmony_ci
201862306a36Sopenharmony_ci	ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
201962306a36Sopenharmony_ci				      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
202062306a36Sopenharmony_ci	return (ret == 1) ? page : NULL;
202162306a36Sopenharmony_ci}
202262306a36Sopenharmony_ci#endif /* CONFIG_ELF_CORE */
202362306a36Sopenharmony_ci
202462306a36Sopenharmony_ci#ifdef CONFIG_MIGRATION
202562306a36Sopenharmony_ci/*
202662306a36Sopenharmony_ci * Returns the number of collected pages. Return value is always >= 0.
202762306a36Sopenharmony_ci */
202862306a36Sopenharmony_cistatic unsigned long collect_longterm_unpinnable_pages(
202962306a36Sopenharmony_ci					struct list_head *movable_page_list,
203062306a36Sopenharmony_ci					unsigned long nr_pages,
203162306a36Sopenharmony_ci					struct page **pages)
203262306a36Sopenharmony_ci{
203362306a36Sopenharmony_ci	unsigned long i, collected = 0;
203462306a36Sopenharmony_ci	struct folio *prev_folio = NULL;
203562306a36Sopenharmony_ci	bool drain_allow = true;
203662306a36Sopenharmony_ci
203762306a36Sopenharmony_ci	for (i = 0; i < nr_pages; i++) {
203862306a36Sopenharmony_ci		struct folio *folio = page_folio(pages[i]);
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci		if (folio == prev_folio)
204162306a36Sopenharmony_ci			continue;
204262306a36Sopenharmony_ci		prev_folio = folio;
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci		if (folio_is_longterm_pinnable(folio))
204562306a36Sopenharmony_ci			continue;
204662306a36Sopenharmony_ci
204762306a36Sopenharmony_ci		collected++;
204862306a36Sopenharmony_ci
204962306a36Sopenharmony_ci		if (folio_is_device_coherent(folio))
205062306a36Sopenharmony_ci			continue;
205162306a36Sopenharmony_ci
205262306a36Sopenharmony_ci		if (folio_test_hugetlb(folio)) {
205362306a36Sopenharmony_ci			isolate_hugetlb(folio, movable_page_list);
205462306a36Sopenharmony_ci			continue;
205562306a36Sopenharmony_ci		}
205662306a36Sopenharmony_ci
205762306a36Sopenharmony_ci		if (!folio_test_lru(folio) && drain_allow) {
205862306a36Sopenharmony_ci			lru_add_drain_all();
205962306a36Sopenharmony_ci			drain_allow = false;
206062306a36Sopenharmony_ci		}
206162306a36Sopenharmony_ci
206262306a36Sopenharmony_ci		if (!folio_isolate_lru(folio))
206362306a36Sopenharmony_ci			continue;
206462306a36Sopenharmony_ci
206562306a36Sopenharmony_ci		list_add_tail(&folio->lru, movable_page_list);
206662306a36Sopenharmony_ci		node_stat_mod_folio(folio,
206762306a36Sopenharmony_ci				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
206862306a36Sopenharmony_ci				    folio_nr_pages(folio));
206962306a36Sopenharmony_ci	}
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_ci	return collected;
207262306a36Sopenharmony_ci}
207362306a36Sopenharmony_ci
/*
 * Unpins all pages and migrates device coherent pages and pages on
 * movable_page_list. Returns -EAGAIN if all pages were successfully
 * migrated, or -errno on failure (or partial migration).
 */
static int migrate_longterm_unpinnable_pages(
					struct list_head *movable_page_list,
					unsigned long nr_pages,
					struct page **pages)
{
	int ret;
	unsigned long i;

	for (i = 0; i < nr_pages; i++) {
		struct folio *folio = page_folio(pages[i]);

		if (folio_is_device_coherent(folio)) {
			/*
			 * Migration will fail if the page is pinned, so convert
			 * the pin on the source page to a normal reference.
			 */
			pages[i] = NULL;
			folio_get(folio);
			gup_put_folio(folio, 1, FOLL_PIN);

			if (migrate_device_coherent_page(&folio->page)) {
				ret = -EBUSY;
				goto err;
			}

			continue;
		}

		/*
		 * We can't migrate pages with unexpected references, so drop
		 * the reference obtained by __get_user_pages_locked().
		 * Migrating pages have been added to movable_page_list after
		 * calling folio_isolate_lru() which takes a reference so the
		 * page won't be freed if it's migrating.
		 */
		unpin_user_page(pages[i]);
		pages[i] = NULL;
	}

	if (!list_empty(movable_page_list)) {
		struct migration_target_control mtc = {
			.nid = NUMA_NO_NODE,
			.gfp_mask = GFP_USER | __GFP_NOWARN,
		};

		if (migrate_pages(movable_page_list, alloc_migration_target,
				  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
				  MR_LONGTERM_PIN, NULL)) {
			ret = -ENOMEM;
			goto err;
		}
	}

	putback_movable_pages(movable_page_list);

	return -EAGAIN;

err:
	for (i = 0; i < nr_pages; i++)
		if (pages[i])
			unpin_user_page(pages[i]);
	putback_movable_pages(movable_page_list);

	return ret;
}

/*
 * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
 * pages in the range are required to be pinned via FOLL_PIN, before calling
 * this routine.
 *
 * If any pages in the range are not allowed to be pinned, then this routine
 * will migrate those pages away, unpin all the pages in the range and return
 * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
 * call this routine again.
 *
 * If an error other than -EAGAIN occurs, this indicates a migration failure.
 * The caller should give up, and propagate the error back up the call stack.
 *
 * If everything is OK and all pages in the range are allowed to be pinned, then
 * this routine leaves all pages pinned and returns zero for success.
 */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
					    struct page **pages)
{
	unsigned long collected;
	LIST_HEAD(movable_page_list);

	collected = collect_longterm_unpinnable_pages(&movable_page_list,
						nr_pages, pages);
	if (!collected)
		return 0;

	return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
						pages);
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
					    struct page **pages)
{
	return 0;
}
#endif /* CONFIG_MIGRATION */

/*
 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
 * allows us to process the FOLL_LONGTERM flag.
 */
static long __gup_longterm_locked(struct mm_struct *mm,
				  unsigned long start,
				  unsigned long nr_pages,
				  struct page **pages,
				  int *locked,
				  unsigned int gup_flags)
{
	unsigned int flags;
	long rc, nr_pinned_pages;

	if (!(gup_flags & FOLL_LONGTERM))
		return __get_user_pages_locked(mm, start, nr_pages, pages,
					       locked, gup_flags);

	flags = memalloc_pin_save();
	do {
		nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
							  pages, locked,
							  gup_flags);
		if (nr_pinned_pages <= 0) {
			rc = nr_pinned_pages;
			break;
		}

		/* FOLL_LONGTERM implies FOLL_PIN */
		rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
	} while (rc == -EAGAIN);
	memalloc_pin_restore(flags);
	return rc ? rc : nr_pinned_pages;
}
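
/*
 * Example (editor's sketch): how FOLL_LONGTERM typically reaches this
 * path. A driver calls pin_user_pages(), which funnels into
 * __gup_longterm_locked() and transparently retries while
 * check_and_migrate_movable_pages() returns -EAGAIN. DMA setup is
 * elided and the helper name is hypothetical.
 *
 *	static long pin_dma_buffer_sketch(unsigned long uaddr,
 *					  unsigned long npages,
 *					  struct page **pages)
 *	{
 *		long pinned;
 *
 *		pinned = pin_user_pages(uaddr, npages,
 *					FOLL_WRITE | FOLL_LONGTERM, pages);
 *		if (pinned < 0)
 *			return pinned;
 *
 *		(... program the device against pages[0..pinned) ...)
 *
 *		unpin_user_pages(pages, pinned);
 *		return 0;
 *	}
 */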

/*
 * Check that the given flags are valid for the exported gup/pup interface, and
 * update them with the required flags that the caller must have set.
 */
static bool is_valid_gup_args(struct page **pages, int *locked,
			      unsigned int *gup_flags_p, unsigned int to_set)
{
	unsigned int gup_flags = *gup_flags_p;

	/*
	 * These flags are not allowed to be specified externally to the gup
	 * interfaces:
	 * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
	 * - FOLL_REMOTE is internal only and used on follow_page()
	 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
	 */
	if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE |
				      FOLL_REMOTE | FOLL_FAST_ONLY)))
		return false;

	gup_flags |= to_set;
	if (locked) {
		/* At the external interface locked must be set */
		if (WARN_ON_ONCE(*locked != 1))
			return false;

		gup_flags |= FOLL_UNLOCKABLE;
	}

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return false;

	/* LONGTERM can only be specified when pinning */
	if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM)))
		return false;

	/* Pages input must be given if using GET/PIN */
	if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages))
		return false;

	/* We want to allow the pgmap to be hot-unplugged at all times */
	if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) &&
			 (gup_flags & FOLL_PCI_P2PDMA)))
		return false;

	*gup_flags_p = gup_flags;
	return true;
}
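
/*
 * Worked example (editor's addition) of how the checks above play out
 * for a few flag combinations arriving at the exported interfaces:
 *
 *	FOLL_WRITE                      -> accepted; to_set is OR'd in
 *	FOLL_GET | FOLL_PIN             -> rejected (mutually exclusive)
 *	FOLL_LONGTERM without FOLL_PIN  -> rejected (long-term pins must
 *					   use the pin_user_pages* family)
 *	FOLL_LONGTERM | FOLL_PCI_P2PDMA -> rejected, so the pgmap can be
 *					   hot-unplugged at any time
 */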

#ifdef CONFIG_MMU
/**
 * get_user_pages_remote() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held for read or write.
 *
 * get_user_pages_remote walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages_remote returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages_remote is typically used for fewer-copy IO operations,
 * to get a handle on the memory by some means other than accesses
 * via the user virtual addresses. The pages may be submitted for
 * DMA to devices or accessed via their kernel linear mapping (via the
 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages_remote should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages_remote because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		int *locked)
{
	int local_locked = 1;

	if (!is_valid_gup_args(pages, locked, &gup_flags,
			       FOLL_TOUCH | FOLL_REMOTE))
		return -EINVAL;

	return __get_user_pages_locked(mm, start, nr_pages, pages,
				       locked ? locked : &local_locked,
				       gup_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);
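
/*
 * Example (editor's sketch): a typical remote-GUP sequence in the style
 * of access_remote_vm(), reading one page from another process. The
 * helper name is hypothetical; error handling is minimal.
 *
 *	static int read_remote_page_sketch(struct mm_struct *mm,
 *					   unsigned long addr, void *buf)
 *	{
 *		struct page *page;
 *		int locked = 1;
 *		long ret;
 *		void *kaddr;
 *
 *		if (mmap_read_lock_killable(mm))
 *			return -EINTR;
 *		ret = get_user_pages_remote(mm, addr, 1, FOLL_GET,
 *					    &page, &locked);
 *		if (locked)
 *			mmap_read_unlock(mm);
 *		if (ret != 1)
 *			return ret < 0 ? ret : -EFAULT;
 *
 *		kaddr = kmap_local_page(page);
 *		memcpy(buf, kaddr, PAGE_SIZE);
 *		kunmap_local(kaddr);
 *		put_page(page);
 *		return 0;
 *	}
 */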

#else /* CONFIG_MMU */
long get_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	return 0;
}
#endif /* !CONFIG_MMU */

/**
 * get_user_pages() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying lookup behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 *
 * This is the same as get_user_pages_remote(), just with a less-flexible
 * calling convention where we assume that the mm being operated on belongs to
 * the current task, and doesn't allow passing of a locked parameter.  We also
 * obviously don't pass FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages)
{
	int locked = 1;

	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
		return -EINVAL;

	return __get_user_pages_locked(current->mm, start, nr_pages, pages,
				       &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages);
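
/*
 * Example (editor's sketch): the pin/write/dirty/release sequence the
 * kerneldoc above prescribes, for a single writable page. The helper
 * name is hypothetical.
 *
 *	static long fill_user_page_sketch(unsigned long uaddr, const void *src)
 *	{
 *		struct page *page;
 *		void *kaddr;
 *		long ret;
 *
 *		ret = get_user_pages(uaddr, 1, FOLL_WRITE | FOLL_GET, &page);
 *		if (ret != 1)
 *			return ret < 0 ? ret : -EFAULT;
 *
 *		kaddr = kmap_local_page(page);
 *		memcpy(kaddr, src, PAGE_SIZE);
 *		kunmap_local(kaddr);
 *
 *		set_page_dirty_lock(page);	(before put_page, as above)
 *		put_page(page);
 *		return 0;
 *	}
 */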

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      mmap_read_lock(mm);
 *      get_user_pages(start, nr_pages, gup_flags, pages);
 *      mmap_read_unlock(mm);
 *
 *  with:
 *
 *      get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	int locked = 0;

	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_TOUCH | FOLL_UNLOCKABLE))
		return -EINVAL;

	return __get_user_pages_locked(current->mm, start, nr_pages, pages,
				       &locked, gup_flags);
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up the pages containing page
 * tables (which may belong to more than one mm_user), then schedule an
 * rcu_sched callback to free those pages. Disabling interrupts will allow
 * the fast_gup walker to both block the rcu_sched callback, and an IPI
 * that we broadcast for splitting THPs (which is a relatively rare event).
 * The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is
 *  used to free pages containing page tables, or TLB flushing requires an
 *  IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_FAST_GUP

/*
 * Used in the GUP-fast path to determine whether a pin is permitted for a
 * specific folio.
 *
 * This call assumes the caller has pinned the folio, that the lowest page table
 * level still points to this folio, and that interrupts have been disabled.
 *
 * Writing to pinned file-backed dirty tracked folios is inherently problematic
 * (see comment describing the writable_file_mapping_allowed() function). We
 * therefore try to avoid the most egregious case of a long-term mapping doing
 * so.
 *
 * This function cannot be as thorough as that one as the VMA is not available
 * in the fast path, so instead we whitelist known good cases and if in doubt,
 * fall back to the slow path.
 */
static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
{
	struct address_space *mapping;
	unsigned long mapping_flags;

	/*
	 * If we aren't pinning then no problematic write can occur. A long term
	 * pin is the most egregious case so this is the one we disallow.
	 */
	if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
	    (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
		return true;

	/* The folio is pinned, so we can safely access folio fields. */

	if (WARN_ON_ONCE(folio_test_slab(folio)))
		return false;

	/* hugetlb mappings do not require dirty-tracking. */
	if (folio_test_hugetlb(folio))
		return true;

	/*
	 * GUP-fast disables IRQs. When IRQs are disabled, RCU grace periods
	 * cannot proceed, which means no actions performed under RCU can
	 * proceed either.
	 *
	 * inodes and thus their mappings are freed under RCU, which means the
	 * mapping cannot be freed beneath us and thus we can safely dereference
	 * it.
	 */
	lockdep_assert_irqs_disabled();

	/*
	 * However, there may be operations which _alter_ the mapping, so ensure
	 * we read it once and only once.
	 */
	mapping = READ_ONCE(folio->mapping);

	/*
	 * The mapping may have been truncated, in any case we cannot determine
	 * if this mapping is safe - fall back to slow path to determine how to
	 * proceed.
	 */
	if (!mapping)
		return false;

	/* Anonymous folios pose no problem. */
	mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
	if (mapping_flags)
		return mapping_flags & PAGE_MAPPING_ANON;

	/*
	 * At this point, we know the mapping is non-null and points to an
	 * address_space object. The only remaining whitelisted file system is
	 * shmem.
	 */
	return shmem_mapping(mapping);
}

static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
					    unsigned int flags,
					    struct page **pages)
{
	while ((*nr) - nr_start) {
		struct page *page = pages[--(*nr)];

		ClearPageReferenced(page);
		if (flags & FOLL_PIN)
			unpin_user_page(page);
		else
			put_page(page);
	}
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * Fast-gup relies on pte change detection to avoid concurrent pgtable
 * operations.
 *
 * To pin the page, fast-gup needs to do the following, in order:
 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
 *
 * For the rest of pgtable operations where pgtable updates can be racy
 * with fast-gup, we need to do (1) clear pte, then (2) check whether page
 * is pinned.
 *
 * Above will work for all pte-level operations, including THP split.
 *
 * For THP collapse, it's a bit more complicated because fast-gup may be
 * walking a pgtable page that is being freed (pte is still valid but pmd
 * can be cleared already).  To avoid race in such condition, we need to
 * also check pmd here to make sure pmd doesn't change (corresponds to
 * pmdp_collapse_flush() in the THP collapse code path).
 */
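/*
 * Sketch (editor's addition) of the updater-side counterpart described
 * above, loosely modelled on the THP-collapse style of sequence; the
 * restore path and locking are elided and the details are assumptions:
 *
 *	pteval = ptep_clear_flush(vma, addr, ptep);		(1) clear pte
 *	if (folio_maybe_dma_pinned(page_folio(pte_page(pteval))))
 *		(back off and restore the pte)			(2) check pin
 *
 * Because fast-gup pins first and re-checks the pte afterwards, one of
 * the two sides is guaranteed to observe the other.
 */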
static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
			 unsigned long end, unsigned int flags,
			 struct page **pages, int *nr)
{
	struct dev_pagemap *pgmap = NULL;
	int nr_start = *nr, ret = 0;
	pte_t *ptep, *ptem;

	ptem = ptep = pte_offset_map(&pmd, addr);
	if (!ptep)
		return 0;
	do {
		pte_t pte = ptep_get_lockless(ptep);
		struct page *page;
		struct folio *folio;

		/*
		 * Always fall back to ordinary GUP on PROT_NONE-mapped pages:
		 * pte_access_permitted() should reject these pages either way;
		 * otherwise, GUP-fast might succeed in cases where ordinary
		 * GUP would fail due to VMA access permissions.
		 */
		if (pte_protnone(pte))
			goto pte_unmap;

		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
			goto pte_unmap;

		if (pte_devmap(pte)) {
			if (unlikely(flags & FOLL_LONGTERM))
				goto pte_unmap;

			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
			if (unlikely(!pgmap)) {
				undo_dev_pagemap(nr, nr_start, flags, pages);
				goto pte_unmap;
			}
		} else if (pte_special(pte))
			goto pte_unmap;

		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);

		folio = try_grab_folio(page, 1, flags);
		if (!folio)
			goto pte_unmap;

		if (unlikely(folio_is_secretmem(folio))) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
		    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		if (!folio_fast_pin_allowed(folio, flags)) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
			gup_put_folio(folio, 1, flags);
			goto pte_unmap;
		}

		/*
		 * We need to make the page accessible if and only if we are
		 * going to access its content (the FOLL_PIN case).  Please
		 * see Documentation/core-api/pin_user_pages.rst for
		 * details.
		 */
		if (flags & FOLL_PIN) {
			ret = arch_make_page_accessible(page);
			if (ret) {
				gup_put_folio(folio, 1, flags);
				goto pte_unmap;
			}
		}
		folio_set_referenced(folio);
		pages[*nr] = page;
		(*nr)++;
	} while (ptep++, addr += PAGE_SIZE, addr != end);

	ret = 1;

pte_unmap:
	if (pgmap)
		put_dev_pagemap(pgmap);
	pte_unmap(ptem);
	return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
			 unsigned long end, unsigned int flags,
			 struct page **pages, int *nr)
{
	return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
			     unsigned long end, unsigned int flags,
			     struct page **pages, int *nr)
{
	int nr_start = *nr;
	struct dev_pagemap *pgmap = NULL;

	do {
		struct page *page = pfn_to_page(pfn);

		pgmap = get_dev_pagemap(pfn, pgmap);
		if (unlikely(!pgmap)) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			break;
		}

		if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			break;
		}

		SetPageReferenced(page);
		pages[*nr] = page;
		if (unlikely(try_grab_page(page, flags))) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			break;
		}
		(*nr)++;
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);

	put_dev_pagemap(pgmap);
	return addr == end;
}

static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned int flags,
				 struct page **pages, int *nr)
{
	unsigned long fault_pfn;
	int nr_start = *nr;

	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
		return 0;

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		undo_dev_pagemap(nr, nr_start, flags, pages);
		return 0;
	}
	return 1;
}

static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned int flags,
				 struct page **pages, int *nr)
{
	unsigned long fault_pfn;
	int nr_start = *nr;

	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		undo_dev_pagemap(nr, nr_start, flags, pages);
		return 0;
	}
	return 1;
}
#else
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned int flags,
				 struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}

static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned int flags,
				 struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}
#endif

static int record_subpages(struct page *page, unsigned long addr,
			   unsigned long end, struct page **pages)
{
	int nr;

	for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
		pages[nr] = nth_page(page, nr);

	return nr;
}

#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
				      unsigned long sz)
{
	unsigned long __boundary = (addr + sz) & ~(sz-1);
	return (__boundary - 1 < end - 1) ? __boundary : end;
}

static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
		       unsigned long end, unsigned int flags,
		       struct page **pages, int *nr)
{
	unsigned long pte_end;
	struct page *page;
	struct folio *folio;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = huge_ptep_get(ptep);

	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	folio = try_grab_folio(page, refs, flags);
	if (!folio)
		return 0;

	if (unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!folio_fast_pin_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
		unsigned int pdshift, unsigned long end, unsigned int flags,
		struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(hugepd);
	unsigned long next;

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		next = hugepte_addr_end(addr, end, sz);
		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
			return 0;
	} while (ptep++, addr = next, addr != end);

	return 1;
}
#else
static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
		unsigned int pdshift, unsigned long end, unsigned int flags,
		struct page **pages, int *nr)
{
	return 0;
}
#endif /* CONFIG_ARCH_HAS_HUGEPD */

static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
			unsigned long end, unsigned int flags,
			struct page **pages, int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pmd_devmap(orig)) {
		if (unlikely(flags & FOLL_LONGTERM))
			return 0;
		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
					     pages, nr);
	}

	page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	folio = try_grab_folio(page, refs, flags);
	if (!folio)
		return 0;

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!folio_fast_pin_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}
	if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
			unsigned long end, unsigned int flags,
			struct page **pages, int *nr)
{
	struct page *page;
	struct folio *folio;
	int refs;

	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pud_devmap(orig)) {
		if (unlikely(flags & FOLL_LONGTERM))
			return 0;
		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
					     pages, nr);
	}

	page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	folio = try_grab_folio(page, refs, flags);
	if (!folio)
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!folio_fast_pin_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
			unsigned long end, unsigned int flags,
			struct page **pages, int *nr)
{
	int refs;
	struct page *page;
	struct folio *folio;

	if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	BUILD_BUG_ON(pgd_devmap(orig));

	page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
	refs = record_subpages(page, addr, end, pages + *nr);

	folio = try_grab_folio(page, refs, flags);
	if (!folio)
		return 0;

	if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	if (!folio_fast_pin_allowed(folio, flags)) {
		gup_put_folio(folio, refs, flags);
		return 0;
	}

	*nr += refs;
	folio_set_referenced(folio);
	return 1;
}

static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset_lockless(pudp, pud, addr);
	do {
		pmd_t pmd = pmdp_get_lockless(pmdp);

		next = pmd_addr_end(addr, end);
		if (!pmd_present(pmd))
			return 0;

		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
			     pmd_devmap(pmd))) {
			/* See gup_pte_range() */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
				pages, nr))
				return 0;

		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
			/*
			 * Architectures can have a different pmd format for
			 * hugetlbfs than for THP.
			 */
301062306a36Sopenharmony_ci			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
301162306a36Sopenharmony_ci					 PMD_SHIFT, next, flags, pages, nr))
301262306a36Sopenharmony_ci				return 0;
301362306a36Sopenharmony_ci		} else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
301462306a36Sopenharmony_ci			return 0;
301562306a36Sopenharmony_ci	} while (pmdp++, addr = next, addr != end);
301662306a36Sopenharmony_ci
301762306a36Sopenharmony_ci	return 1;
301862306a36Sopenharmony_ci}
301962306a36Sopenharmony_ci
302062306a36Sopenharmony_cistatic int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
302162306a36Sopenharmony_ci			 unsigned int flags, struct page **pages, int *nr)
302262306a36Sopenharmony_ci{
302362306a36Sopenharmony_ci	unsigned long next;
302462306a36Sopenharmony_ci	pud_t *pudp;
302562306a36Sopenharmony_ci
302662306a36Sopenharmony_ci	pudp = pud_offset_lockless(p4dp, p4d, addr);
302762306a36Sopenharmony_ci	do {
302862306a36Sopenharmony_ci		pud_t pud = READ_ONCE(*pudp);
302962306a36Sopenharmony_ci
303062306a36Sopenharmony_ci		next = pud_addr_end(addr, end);
303162306a36Sopenharmony_ci		if (unlikely(!pud_present(pud)))
			return 0;
		if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
			if (!gup_huge_pud(pud, pudp, addr, next, flags,
					  pages, nr))
				return 0;
		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
					 PUD_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset_lockless(pgdp, pgd, addr);
	do {
		p4d_t p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);
		if (p4d_none(p4d))
			return 0;
		BUILD_BUG_ON(p4d_huge(p4d));
		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
					 P4D_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}

static void gup_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pgd_t *pgdp;

	pgdp = pgd_offset(current->mm, addr);
	do {
		pgd_t pgd = READ_ONCE(*pgdp);

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			return;
		if (unlikely(pgd_huge(pgd))) {
			if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
					  pages, nr))
				return;
		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
					 PGDIR_SHIFT, next, flags, pages, nr))
				return;
		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
			return;
	} while (pgdp++, addr = next, addr != end);
}
#else
static inline void gup_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_FAST_GUP */

#ifndef gup_fast_permitted
/*
 * Check whether it's allowed to use get_user_pages_fast_only() for the range,
 * or whether we need to fall back to the slow version:
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	return true;
}
#endif
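
/*
 * Illustrative sketch only, kept out of the build: an architecture that
 * cannot walk part of the address space locklessly can define its own
 * gup_fast_permitted() in its arch headers, overriding the permissive
 * default above. The 52-bit limit below is a made-up example value, not
 * taken from any real architecture.
 */
#if 0
#define gup_fast_permitted gup_fast_permitted
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
	/* Hypothetical: refuse fast GUP above a translatable window. */
	return end <= (1UL << 52);
}
#endif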

static unsigned long lockless_pages_from_mm(unsigned long start,
					    unsigned long end,
					    unsigned int gup_flags,
					    struct page **pages)
{
	unsigned long flags;
	int nr_pinned = 0;
	unsigned seq;

	if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
	    !gup_fast_permitted(start, end))
		return 0;

	if (gup_flags & FOLL_PIN) {
		seq = raw_read_seqcount(&current->mm->write_protect_seq);
		/* An odd count means a fork() write-protect is in progress. */
		if (seq & 1)
			return 0;
	}

	/*
	 * Disable interrupts. The nested form is used in order to allow full,
	 * general-purpose use of this routine.
	 *
	 * With interrupts disabled, we block page table pages from being freed
	 * from under us. See struct mmu_table_batch comments in
	 * include/asm-generic/tlb.h for more details.
	 *
	 * We do not adopt an rcu_read_lock() here, as we also want to block
	 * IPIs that come from THPs splitting.
	 */
	local_irq_save(flags);
	gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
	local_irq_restore(flags);

	/*
	 * When pinning pages for DMA, there could be a concurrent write
	 * protect from fork() via copy_page_range(); in that case, always
	 * fail fast GUP.
	 */
	if (gup_flags & FOLL_PIN) {
		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
			unpin_user_pages_lockless(pages, nr_pinned);
			return 0;
		} else {
			sanity_check_pinned_pages(pages, nr_pinned);
		}
	}
	return nr_pinned;
}

static int internal_get_user_pages_fast(unsigned long start,
					unsigned long nr_pages,
					unsigned int gup_flags,
					struct page **pages)
{
	unsigned long len, end;
	unsigned long nr_pinned;
	int locked = 0;
	int ret;

	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
				       FOLL_FAST_ONLY | FOLL_NOFAULT |
				       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
		return -EINVAL;

	if (gup_flags & FOLL_PIN)
		mm_set_has_pinned_flag(&current->mm->flags);

	if (!(gup_flags & FOLL_FAST_ONLY))
		might_lock_read(&current->mm->mmap_lock);

	start = untagged_addr(start) & PAGE_MASK;
	len = nr_pages << PAGE_SHIFT;
	if (check_add_overflow(start, len, &end))
		return -EOVERFLOW;
	if (end > TASK_SIZE_MAX)
		return -EFAULT;
	if (unlikely(!access_ok((void __user *)start, len)))
		return -EFAULT;

	nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
		return nr_pinned;

	/* Slow path: try to get the remaining pages with get_user_pages */
	start += nr_pinned << PAGE_SHIFT;
	pages += nr_pinned;
	ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
				    pages, &locked,
				    gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
	if (ret < 0) {
		/*
		 * The caller has to unpin the pages we already pinned, so
		 * returning -errno is not an option.
		 */
		if (nr_pinned)
			return nr_pinned;
		return ret;
	}
	return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	/*
	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
	 * because gup fast is always a "pin with a +1 page refcount" request.
	 *
	 * FOLL_FAST_ONLY is required in order to match the API description of
	 * this routine: no fall back to regular ("slow") GUP.
	 */
	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_GET | FOLL_FAST_ONLY))
		return -EINVAL;

	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
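
/*
 * A minimal usage sketch (hypothetical caller, not part of this file): since
 * get_user_pages_fast_only() never takes mmap_lock and never sleeps, it can
 * be attempted from atomic context; on a shortfall the caller backs off
 * rather than falling into slow GUP. References are plain FOLL_GET
 * references, so they are dropped with put_page().
 */
static int __maybe_unused example_try_fast_only(unsigned long start,
						int nr_pages,
						struct page **pages)
{
	int nr, i;

	nr = get_user_pages_fast_only(start, nr_pages, FOLL_WRITE, pages);
	if (nr < 0)
		return nr;
	if (nr < nr_pages) {
		/* Partial grab: drop what we got and let the caller retry. */
		for (i = 0; i < nr; i++)
			put_page(pages[i]);
		return -EAGAIN;
	}
	return 0;
}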

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	/*
	 * The caller may or may not have explicitly set FOLL_GET; either way is
	 * OK. However, internally (within mm/gup.c), gup fast variants must set
	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
	 * request.
	 */
	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
		return -EINVAL;
	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
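
/*
 * A minimal usage sketch (hypothetical caller): the classic get/use/release
 * pattern. Unlike the FOLL_PIN variants below, references taken here are
 * ordinary page references, so each one is dropped with put_page().
 */
static int __maybe_unused example_gup_fast(unsigned long start,
					   int nr_pages,
					   struct page **pages)
{
	int nr, i;

	nr = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
	if (nr < 0)
		return nr;

	/* ... read or write pages[0..nr-1] here ... */

	for (i = 0; i < nr; i++)
		put_page(pages[i]);
	return nr;
}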

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page() will not remove pins from it.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
		return -EINVAL;
	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
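
/*
 * A minimal usage sketch (hypothetical caller): the pin/DMA/unpin lifecycle.
 * Because FOLL_PIN is used, the pages must be released with
 * unpin_user_pages() (or unpin_user_page() one at a time), never put_page().
 */
static int __maybe_unused example_pin_for_dma(unsigned long start,
					      int nr_pages,
					      struct page **pages)
{
	int nr = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);

	if (nr < 0)
		return nr;

	/* ... set up and run DMA against pages[0..nr-1] here ... */

	unpin_user_pages(pages, nr);
	return nr;
}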

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	int local_locked = 1;

	if (!is_valid_gup_args(pages, locked, &gup_flags,
			       FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
		return 0;
	return __gup_longterm_locked(mm, start, nr_pages, pages,
				     locked ? locked : &local_locked,
				     gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);
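
/*
 * A minimal usage sketch (hypothetical caller): pinning pages of another
 * process. The target mm's mmap_lock must be held on entry and @locked set
 * to 1; if GUP had to drop the lock internally, it clears *locked and the
 * caller must not unlock again.
 */
static long __maybe_unused example_pin_remote(struct mm_struct *mm,
					      unsigned long start,
					      unsigned long nr_pages,
					      struct page **pages)
{
	int locked = 1;
	long nr;

	mmap_read_lock(mm);
	nr = pin_user_pages_remote(mm, start, nr_pages, FOLL_WRITE,
				   pages, &locked);
	if (locked)
		mmap_read_unlock(mm);

	if (nr > 0) {
		/* ... access the remote pages, then drop the pins ... */
		unpin_user_pages(pages, nr);
	}
	return nr;
}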

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages)
{
	int locked = 1;

	if (!is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
		return 0;
	return __gup_longterm_locked(current->mm, start, nr_pages,
				     pages, &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
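
/*
 * A minimal usage sketch (hypothetical caller): a long-lived pin on the
 * current mm, in the style of an RDMA memory registration. As with
 * get_user_pages(), the caller holds mmap_lock for read across the call;
 * FOLL_LONGTERM first migrates the pages out of ZONE_MOVABLE/CMA so the pin
 * cannot stall memory hot-unplug or compaction indefinitely.
 */
static long __maybe_unused example_pin_longterm(unsigned long start,
						unsigned long nr_pages,
						struct page **pages)
{
	long nr;

	mmap_read_lock(current->mm);
	nr = pin_user_pages(start, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			    pages);
	mmap_read_unlock(current->mm);

	if (nr <= 0)
		return nr;

	/* ... the pages stay pinned until the registration is torn down ... */

	unpin_user_pages(pages, nr);
	return nr;
}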

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	int locked = 0;

	if (!is_valid_gup_args(pages, NULL, &gup_flags,
			       FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
		return 0;

	return __gup_longterm_locked(current->mm, start, nr_pages, pages,
				     &locked, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
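
/*
 * A minimal usage sketch (hypothetical caller): same pin semantics as
 * pin_user_pages(), but for a caller that does not hold mmap_lock; GUP takes
 * and drops the lock internally. Note the argument order: @pages comes
 * before @gup_flags in this variant.
 */
static long __maybe_unused example_pin_unlocked(unsigned long start,
						unsigned long nr_pages,
						struct page **pages)
{
	long nr = pin_user_pages_unlocked(start, nr_pages, pages, FOLL_WRITE);

	if (nr > 0)
		unpin_user_pages(pages, nr);	/* drop the pins when done */
	return nr;
}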