162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci *  fs/userfaultfd.c
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
662306a36Sopenharmony_ci *  Copyright (C) 2008-2009 Red Hat, Inc.
762306a36Sopenharmony_ci *  Copyright (C) 2015  Red Hat, Inc.
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci *  Some part derived from fs/eventfd.c (anon inode setup) and
1062306a36Sopenharmony_ci *  mm/ksm.c (mm hashing).
1162306a36Sopenharmony_ci */
1262306a36Sopenharmony_ci
1362306a36Sopenharmony_ci#include <linux/list.h>
1462306a36Sopenharmony_ci#include <linux/hashtable.h>
1562306a36Sopenharmony_ci#include <linux/sched/signal.h>
1662306a36Sopenharmony_ci#include <linux/sched/mm.h>
1762306a36Sopenharmony_ci#include <linux/mm.h>
1862306a36Sopenharmony_ci#include <linux/mm_inline.h>
1962306a36Sopenharmony_ci#include <linux/mmu_notifier.h>
2062306a36Sopenharmony_ci#include <linux/poll.h>
2162306a36Sopenharmony_ci#include <linux/slab.h>
2262306a36Sopenharmony_ci#include <linux/seq_file.h>
2362306a36Sopenharmony_ci#include <linux/file.h>
2462306a36Sopenharmony_ci#include <linux/bug.h>
2562306a36Sopenharmony_ci#include <linux/anon_inodes.h>
2662306a36Sopenharmony_ci#include <linux/syscalls.h>
2762306a36Sopenharmony_ci#include <linux/userfaultfd_k.h>
2862306a36Sopenharmony_ci#include <linux/mempolicy.h>
2962306a36Sopenharmony_ci#include <linux/ioctl.h>
3062306a36Sopenharmony_ci#include <linux/security.h>
3162306a36Sopenharmony_ci#include <linux/hugetlb.h>
3262306a36Sopenharmony_ci#include <linux/swapops.h>
3362306a36Sopenharmony_ci#include <linux/miscdevice.h>
3462306a36Sopenharmony_ci
/*
 * When 0 (the default), only privileged users (CAP_SYS_PTRACE) may create
 * userfaultfds; exposed via /proc/sys/vm/unprivileged_userfaultfd below.
 */
static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
/* sysctl table: clamps the knob to the boolean range [0, 1]. */
static struct ctl_table vm_userfaultfd_table[] = {
	{
		.procname	= "unprivileged_userfaultfd",
		.data		= &sysctl_unprivileged_userfaultfd,
		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }	/* sentinel */
};
#endif

/* slab cache for struct userfaultfd_ctx allocations */
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
5362306a36Sopenharmony_ci
/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults (already read, not yet resolved) */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	atomic_t mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};
9262306a36Sopenharmony_ci
/* Pairs a parent context with the child context created on fork. */
struct userfaultfd_fork_ctx {
	/* context of the forking (parent) process */
	struct userfaultfd_ctx *orig;
	/* freshly duplicated context for the child */
	struct userfaultfd_ctx *new;
	/* linkage on the caller's local list of pending fork events */
	struct list_head list;
};
9862306a36Sopenharmony_ci
/* Records an unmapped range so a UFFD_EVENT_UNMAP can be delivered later. */
struct userfaultfd_unmap_ctx {
	/* context that must be notified about the unmap */
	struct userfaultfd_ctx *ctx;
	/* [start, end) virtual address range being unmapped */
	unsigned long start;
	unsigned long end;
	/* linkage on the caller's local list of pending unmap events */
	struct list_head list;
};
10562306a36Sopenharmony_ci
/*
 * Per-waiter state, typically living on the faulting task's kernel stack
 * while it is queued on one of the context's waitqueues.
 */
struct userfaultfd_wait_queue {
	/* message to be read by userland via read(2) on the uffd */
	struct uffd_msg msg;
	/* waitqueue entry linking this waiter into the wqh lists */
	wait_queue_entry_t wq;
	/* owning userfaultfd context */
	struct userfaultfd_ctx *ctx;
	/* set by the wake function before waking; read locklessly by waiter */
	bool waken;
};
11262306a36Sopenharmony_ci
/* Address range passed as the wake key; len == 0 means "wake everybody". */
struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};
11762306a36Sopenharmony_ci
/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED		(1u << 31)

/* Returns true once userland completed the UFFD_API handshake on @ctx. */
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
	return ctx->features & UFFD_FEATURE_INITIALIZED;
}
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci/*
12762306a36Sopenharmony_ci * Whether WP_UNPOPULATED is enabled on the uffd context.  It is only
12862306a36Sopenharmony_ci * meaningful when userfaultfd_wp()==true on the vma and when it's
12962306a36Sopenharmony_ci * anonymous.
13062306a36Sopenharmony_ci */
13162306a36Sopenharmony_cibool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
13262306a36Sopenharmony_ci{
13362306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
13462306a36Sopenharmony_ci
13562306a36Sopenharmony_ci	if (!ctx)
13662306a36Sopenharmony_ci		return false;
13762306a36Sopenharmony_ci
13862306a36Sopenharmony_ci	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
13962306a36Sopenharmony_ci}
14062306a36Sopenharmony_ci
14162306a36Sopenharmony_cistatic void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
14262306a36Sopenharmony_ci				     vm_flags_t flags)
14362306a36Sopenharmony_ci{
14462306a36Sopenharmony_ci	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	vm_flags_reset(vma, flags);
14762306a36Sopenharmony_ci	/*
14862306a36Sopenharmony_ci	 * For shared mappings, we want to enable writenotify while
14962306a36Sopenharmony_ci	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
15062306a36Sopenharmony_ci	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
15162306a36Sopenharmony_ci	 */
15262306a36Sopenharmony_ci	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
15362306a36Sopenharmony_ci		vma_set_page_prot(vma);
15462306a36Sopenharmony_ci}
15562306a36Sopenharmony_ci
/*
 * Wake callback for uwq entries queued on fault_pending_wqh/fault_wqh.
 * @key is a struct userfaultfd_wake_range * selecting which waiters to
 * wake; returns nonzero iff this waiter was actually woken (which also
 * stops __wake_up_common's walk for exclusive waiters).
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	/* skip waiters whose fault address falls outside [start, start+len) */
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the other
		 * CPUs, the waitqueue may disappear from under us, see the
		 * !list_empty_careful() in handle_userfault().
		 *
		 * try_to_wake_up() has an implicit smp_mb(), and the
		 * wq->private is read before calling the extern function
		 * "wake_up_state" (which in turns calls try_to_wake_up).
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci/**
19762306a36Sopenharmony_ci * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
19862306a36Sopenharmony_ci * context.
19962306a36Sopenharmony_ci * @ctx: [in] Pointer to the userfaultfd context.
20062306a36Sopenharmony_ci */
20162306a36Sopenharmony_cistatic void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
20262306a36Sopenharmony_ci{
20362306a36Sopenharmony_ci	refcount_inc(&ctx->refcount);
20462306a36Sopenharmony_ci}
20562306a36Sopenharmony_ci
/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().  The final put
 * frees the context; all waitqueues must be idle and unlocked by then.
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		/* no waiter may still be queued or hold a wqh lock */
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		/* drop the mm reference taken when the ctx was created */
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}
22962306a36Sopenharmony_ci
23062306a36Sopenharmony_cistatic inline void msg_init(struct uffd_msg *msg)
23162306a36Sopenharmony_ci{
23262306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
23362306a36Sopenharmony_ci	/*
23462306a36Sopenharmony_ci	 * Must use memset to zero out the paddings or kernel data is
23562306a36Sopenharmony_ci	 * leaked to userland.
23662306a36Sopenharmony_ci	 */
23762306a36Sopenharmony_ci	memset(msg, 0, sizeof(struct uffd_msg));
23862306a36Sopenharmony_ci}
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_cistatic inline struct uffd_msg userfault_msg(unsigned long address,
24162306a36Sopenharmony_ci					    unsigned long real_address,
24262306a36Sopenharmony_ci					    unsigned int flags,
24362306a36Sopenharmony_ci					    unsigned long reason,
24462306a36Sopenharmony_ci					    unsigned int features)
24562306a36Sopenharmony_ci{
24662306a36Sopenharmony_ci	struct uffd_msg msg;
24762306a36Sopenharmony_ci
24862306a36Sopenharmony_ci	msg_init(&msg);
24962306a36Sopenharmony_ci	msg.event = UFFD_EVENT_PAGEFAULT;
25062306a36Sopenharmony_ci
25162306a36Sopenharmony_ci	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
25262306a36Sopenharmony_ci				    real_address : address;
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci	/*
25562306a36Sopenharmony_ci	 * These flags indicate why the userfault occurred:
25662306a36Sopenharmony_ci	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
25762306a36Sopenharmony_ci	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
25862306a36Sopenharmony_ci	 * - Neither of these flags being set indicates a MISSING fault.
25962306a36Sopenharmony_ci	 *
26062306a36Sopenharmony_ci	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
26162306a36Sopenharmony_ci	 * fault. Otherwise, it was a read fault.
26262306a36Sopenharmony_ci	 */
26362306a36Sopenharmony_ci	if (flags & FAULT_FLAG_WRITE)
26462306a36Sopenharmony_ci		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
26562306a36Sopenharmony_ci	if (reason & VM_UFFD_WP)
26662306a36Sopenharmony_ci		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
26762306a36Sopenharmony_ci	if (reason & VM_UFFD_MINOR)
26862306a36Sopenharmony_ci		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
26962306a36Sopenharmony_ci	if (features & UFFD_FEATURE_THREAD_ID)
27062306a36Sopenharmony_ci		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
27162306a36Sopenharmony_ci	return msg;
27262306a36Sopenharmony_ci}
27362306a36Sopenharmony_ci
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.  Returns true if the fault is still unresolved and the
 * caller should go to sleep.  Requires the fault lock (asserted below) and,
 * per handle_userfault(), the hugetlb vma lock held for read.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_fault *vmf,
					      unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	pte_t *ptep, pte;
	bool ret = true;

	assert_fault_locked(vmf);

	/* no pagetable at all: the fault is certainly not resolved yet */
	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.  PTE markers should be handled the same as none
	 * ptes here.
	 */
	if (huge_pte_none_mostly(pte))
		ret = true;
	/* a still write-protected pte means a WP fault is unresolved */
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
/* !CONFIG_HUGETLB_PAGE: hugetlb vmas cannot exist, so this is unreachable. */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_fault *vmf,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */
31662306a36Sopenharmony_ci
/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 *
 * Returns true if the fault is still unresolved and the caller should
 * block.  The walk is entirely lockless (see the comment at the pte read).
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 struct vm_fault *vmf,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	unsigned long address = vmf->address;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pte_t ptent;
	bool ret = true;

	assert_fault_locked(vmf);

	/* descend the pagetable; any missing level means "still must wait" */
	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
again:
	_pmd = pmdp_get_lockless(pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd) || pmd_devmap(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		/* huge pmd installed but still write-protected: WP unresolved */
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	pte = pte_offset_map(pmd, address);
	if (!pte) {
		/* the pmd changed under us (e.g. THP collapse): re-read it */
		ret = true;
		goto again;
	}
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.  PTE markers should be handled the same as none
	 * ptes here.
	 */
	ptent = ptep_get(pte);
	if (pte_none_mostly(ptent))
		ret = true;
	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_cistatic inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
38762306a36Sopenharmony_ci{
38862306a36Sopenharmony_ci	if (flags & FAULT_FLAG_INTERRUPTIBLE)
38962306a36Sopenharmony_ci		return TASK_INTERRUPTIBLE;
39062306a36Sopenharmony_ci
39162306a36Sopenharmony_ci	if (flags & FAULT_FLAG_KILLABLE)
39262306a36Sopenharmony_ci		return TASK_KILLABLE;
39362306a36Sopenharmony_ci
39462306a36Sopenharmony_ci	return TASK_UNINTERRUPTIBLE;
39562306a36Sopenharmony_ci}
39662306a36Sopenharmony_ci
/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	unsigned int blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update.
	 *
	 * We also don't do userfault handling during
	 * coredumping. hugetlbfs has the special
	 * hugetlb_follow_page_mask() to skip missing pages in the
	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
	 * the no_page_table() helper in follow_page_mask(), but the
	 * shmem_vm_ops->fault method is invoked even during
	 * coredumping and it ends up here.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	assert_fault_locked(vmf);

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	/* Any unrecognized flag is a bug. */
	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
	VM_BUG_ON(!reason || (reason & (reason - 1)));

	/* SIGBUS mode: deliver the signal instead of a userfault message */
	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;
	/* kernel-originated faults are ignored in user-mode-only contexts */
	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
		goto out;

	/*
	 * If it's already released don't get it. This avoids to loop
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a non
		 * cooperative manager can close the uffd after the
		 * last UFFDIO_COPY, without risking to trigger an
		 * involuntary SIGBUS if the process was starting the
		 * userfaultfd while the userfaultfd was still armed
		 * (but after the last UFFDIO_COPY). If the uffd
		 * wasn't already closed when the userfault reached
		 * this point, that would normally be solved by
		 * userfaultfd_must_wait returning 'false'.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the non
		 * cooperative manager would be instead forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
				reason, ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	/*
	 * Take the vma lock now, in order to safely call
	 * userfaultfd_huge_must_wait() later. Since acquiring the
	 * (sleepable) vma lock can modify the current task state, that
	 * must be before explicitly calling set_current_state().
	 */
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_lock_read(vma);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vma))
		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_unlock_read(vma);
	release_fault_lock(vmf);

	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		/* notify readers/pollers of the uffd, then go to sleep */
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self. list_add also
	 * would never let any of the two pointers to point to
	 * self. So list_empty_careful won't risk to see both pointers
	 * pointing to self at any time during the list refile. The
	 * only case where list_del_init() is called is the full
	 * removal in the wake function and there we don't re-list_add
	 * and it's fine not to block on the spinlock. The uwq on this
	 * kernel stack can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}
59862306a36Sopenharmony_ci
59962306a36Sopenharmony_cistatic void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
60062306a36Sopenharmony_ci					      struct userfaultfd_wait_queue *ewq)
60162306a36Sopenharmony_ci{
60262306a36Sopenharmony_ci	struct userfaultfd_ctx *release_new_ctx;
60362306a36Sopenharmony_ci
60462306a36Sopenharmony_ci	if (WARN_ON_ONCE(current->flags & PF_EXITING))
60562306a36Sopenharmony_ci		goto out;
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci	ewq->ctx = ctx;
60862306a36Sopenharmony_ci	init_waitqueue_entry(&ewq->wq, current);
60962306a36Sopenharmony_ci	release_new_ctx = NULL;
61062306a36Sopenharmony_ci
61162306a36Sopenharmony_ci	spin_lock_irq(&ctx->event_wqh.lock);
61262306a36Sopenharmony_ci	/*
61362306a36Sopenharmony_ci	 * After the __add_wait_queue the uwq is visible to userland
61462306a36Sopenharmony_ci	 * through poll/read().
61562306a36Sopenharmony_ci	 */
61662306a36Sopenharmony_ci	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
61762306a36Sopenharmony_ci	for (;;) {
61862306a36Sopenharmony_ci		set_current_state(TASK_KILLABLE);
61962306a36Sopenharmony_ci		if (ewq->msg.event == 0)
62062306a36Sopenharmony_ci			break;
62162306a36Sopenharmony_ci		if (READ_ONCE(ctx->released) ||
62262306a36Sopenharmony_ci		    fatal_signal_pending(current)) {
62362306a36Sopenharmony_ci			/*
62462306a36Sopenharmony_ci			 * &ewq->wq may be queued in fork_event, but
62562306a36Sopenharmony_ci			 * __remove_wait_queue ignores the head
62662306a36Sopenharmony_ci			 * parameter. It would be a problem if it
62762306a36Sopenharmony_ci			 * didn't.
62862306a36Sopenharmony_ci			 */
62962306a36Sopenharmony_ci			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
63062306a36Sopenharmony_ci			if (ewq->msg.event == UFFD_EVENT_FORK) {
63162306a36Sopenharmony_ci				struct userfaultfd_ctx *new;
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci				new = (struct userfaultfd_ctx *)
63462306a36Sopenharmony_ci					(unsigned long)
63562306a36Sopenharmony_ci					ewq->msg.arg.reserved.reserved1;
63662306a36Sopenharmony_ci				release_new_ctx = new;
63762306a36Sopenharmony_ci			}
63862306a36Sopenharmony_ci			break;
63962306a36Sopenharmony_ci		}
64062306a36Sopenharmony_ci
64162306a36Sopenharmony_ci		spin_unlock_irq(&ctx->event_wqh.lock);
64262306a36Sopenharmony_ci
64362306a36Sopenharmony_ci		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
64462306a36Sopenharmony_ci		schedule();
64562306a36Sopenharmony_ci
64662306a36Sopenharmony_ci		spin_lock_irq(&ctx->event_wqh.lock);
64762306a36Sopenharmony_ci	}
64862306a36Sopenharmony_ci	__set_current_state(TASK_RUNNING);
64962306a36Sopenharmony_ci	spin_unlock_irq(&ctx->event_wqh.lock);
65062306a36Sopenharmony_ci
65162306a36Sopenharmony_ci	if (release_new_ctx) {
65262306a36Sopenharmony_ci		struct vm_area_struct *vma;
65362306a36Sopenharmony_ci		struct mm_struct *mm = release_new_ctx->mm;
65462306a36Sopenharmony_ci		VMA_ITERATOR(vmi, mm, 0);
65562306a36Sopenharmony_ci
65662306a36Sopenharmony_ci		/* the various vma->vm_userfaultfd_ctx still points to it */
65762306a36Sopenharmony_ci		mmap_write_lock(mm);
65862306a36Sopenharmony_ci		for_each_vma(vmi, vma) {
65962306a36Sopenharmony_ci			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
66062306a36Sopenharmony_ci				vma_start_write(vma);
66162306a36Sopenharmony_ci				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
66262306a36Sopenharmony_ci				userfaultfd_set_vm_flags(vma,
66362306a36Sopenharmony_ci							 vma->vm_flags & ~__VM_UFFD_FLAGS);
66462306a36Sopenharmony_ci			}
66562306a36Sopenharmony_ci		}
66662306a36Sopenharmony_ci		mmap_write_unlock(mm);
66762306a36Sopenharmony_ci
66862306a36Sopenharmony_ci		userfaultfd_ctx_put(release_new_ctx);
66962306a36Sopenharmony_ci	}
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	/*
67262306a36Sopenharmony_ci	 * ctx may go away after this if the userfault pseudo fd is
67362306a36Sopenharmony_ci	 * already released.
67462306a36Sopenharmony_ci	 */
67562306a36Sopenharmony_ciout:
67662306a36Sopenharmony_ci	atomic_dec(&ctx->mmap_changing);
67762306a36Sopenharmony_ci	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
67862306a36Sopenharmony_ci	userfaultfd_ctx_put(ctx);
67962306a36Sopenharmony_ci}
68062306a36Sopenharmony_ci
68162306a36Sopenharmony_cistatic void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
68262306a36Sopenharmony_ci				       struct userfaultfd_wait_queue *ewq)
68362306a36Sopenharmony_ci{
68462306a36Sopenharmony_ci	ewq->msg.event = 0;
68562306a36Sopenharmony_ci	wake_up_locked(&ctx->event_wqh);
68662306a36Sopenharmony_ci	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
68762306a36Sopenharmony_ci}
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ciint dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
69062306a36Sopenharmony_ci{
69162306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = NULL, *octx;
69262306a36Sopenharmony_ci	struct userfaultfd_fork_ctx *fctx;
69362306a36Sopenharmony_ci
69462306a36Sopenharmony_ci	octx = vma->vm_userfaultfd_ctx.ctx;
69562306a36Sopenharmony_ci	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
69662306a36Sopenharmony_ci		vma_start_write(vma);
69762306a36Sopenharmony_ci		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
69862306a36Sopenharmony_ci		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
69962306a36Sopenharmony_ci		return 0;
70062306a36Sopenharmony_ci	}
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci	list_for_each_entry(fctx, fcs, list)
70362306a36Sopenharmony_ci		if (fctx->orig == octx) {
70462306a36Sopenharmony_ci			ctx = fctx->new;
70562306a36Sopenharmony_ci			break;
70662306a36Sopenharmony_ci		}
70762306a36Sopenharmony_ci
70862306a36Sopenharmony_ci	if (!ctx) {
70962306a36Sopenharmony_ci		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
71062306a36Sopenharmony_ci		if (!fctx)
71162306a36Sopenharmony_ci			return -ENOMEM;
71262306a36Sopenharmony_ci
71362306a36Sopenharmony_ci		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
71462306a36Sopenharmony_ci		if (!ctx) {
71562306a36Sopenharmony_ci			kfree(fctx);
71662306a36Sopenharmony_ci			return -ENOMEM;
71762306a36Sopenharmony_ci		}
71862306a36Sopenharmony_ci
71962306a36Sopenharmony_ci		refcount_set(&ctx->refcount, 1);
72062306a36Sopenharmony_ci		ctx->flags = octx->flags;
72162306a36Sopenharmony_ci		ctx->features = octx->features;
72262306a36Sopenharmony_ci		ctx->released = false;
72362306a36Sopenharmony_ci		atomic_set(&ctx->mmap_changing, 0);
72462306a36Sopenharmony_ci		ctx->mm = vma->vm_mm;
72562306a36Sopenharmony_ci		mmgrab(ctx->mm);
72662306a36Sopenharmony_ci
72762306a36Sopenharmony_ci		userfaultfd_ctx_get(octx);
72862306a36Sopenharmony_ci		atomic_inc(&octx->mmap_changing);
72962306a36Sopenharmony_ci		fctx->orig = octx;
73062306a36Sopenharmony_ci		fctx->new = ctx;
73162306a36Sopenharmony_ci		list_add_tail(&fctx->list, fcs);
73262306a36Sopenharmony_ci	}
73362306a36Sopenharmony_ci
73462306a36Sopenharmony_ci	vma->vm_userfaultfd_ctx.ctx = ctx;
73562306a36Sopenharmony_ci	return 0;
73662306a36Sopenharmony_ci}
73762306a36Sopenharmony_ci
73862306a36Sopenharmony_cistatic void dup_fctx(struct userfaultfd_fork_ctx *fctx)
73962306a36Sopenharmony_ci{
74062306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = fctx->orig;
74162306a36Sopenharmony_ci	struct userfaultfd_wait_queue ewq;
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	msg_init(&ewq.msg);
74462306a36Sopenharmony_ci
74562306a36Sopenharmony_ci	ewq.msg.event = UFFD_EVENT_FORK;
74662306a36Sopenharmony_ci	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci	userfaultfd_event_wait_completion(ctx, &ewq);
74962306a36Sopenharmony_ci}
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_civoid dup_userfaultfd_complete(struct list_head *fcs)
75262306a36Sopenharmony_ci{
75362306a36Sopenharmony_ci	struct userfaultfd_fork_ctx *fctx, *n;
75462306a36Sopenharmony_ci
75562306a36Sopenharmony_ci	list_for_each_entry_safe(fctx, n, fcs, list) {
75662306a36Sopenharmony_ci		dup_fctx(fctx);
75762306a36Sopenharmony_ci		list_del(&fctx->list);
75862306a36Sopenharmony_ci		kfree(fctx);
75962306a36Sopenharmony_ci	}
76062306a36Sopenharmony_ci}
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_civoid mremap_userfaultfd_prep(struct vm_area_struct *vma,
76362306a36Sopenharmony_ci			     struct vm_userfaultfd_ctx *vm_ctx)
76462306a36Sopenharmony_ci{
76562306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx;
76662306a36Sopenharmony_ci
76762306a36Sopenharmony_ci	ctx = vma->vm_userfaultfd_ctx.ctx;
76862306a36Sopenharmony_ci
76962306a36Sopenharmony_ci	if (!ctx)
77062306a36Sopenharmony_ci		return;
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
77362306a36Sopenharmony_ci		vm_ctx->ctx = ctx;
77462306a36Sopenharmony_ci		userfaultfd_ctx_get(ctx);
77562306a36Sopenharmony_ci		atomic_inc(&ctx->mmap_changing);
77662306a36Sopenharmony_ci	} else {
77762306a36Sopenharmony_ci		/* Drop uffd context if remap feature not enabled */
77862306a36Sopenharmony_ci		vma_start_write(vma);
77962306a36Sopenharmony_ci		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
78062306a36Sopenharmony_ci		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
78162306a36Sopenharmony_ci	}
78262306a36Sopenharmony_ci}
78362306a36Sopenharmony_ci
78462306a36Sopenharmony_civoid mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
78562306a36Sopenharmony_ci				 unsigned long from, unsigned long to,
78662306a36Sopenharmony_ci				 unsigned long len)
78762306a36Sopenharmony_ci{
78862306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
78962306a36Sopenharmony_ci	struct userfaultfd_wait_queue ewq;
79062306a36Sopenharmony_ci
79162306a36Sopenharmony_ci	if (!ctx)
79262306a36Sopenharmony_ci		return;
79362306a36Sopenharmony_ci
79462306a36Sopenharmony_ci	if (to & ~PAGE_MASK) {
79562306a36Sopenharmony_ci		userfaultfd_ctx_put(ctx);
79662306a36Sopenharmony_ci		return;
79762306a36Sopenharmony_ci	}
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_ci	msg_init(&ewq.msg);
80062306a36Sopenharmony_ci
80162306a36Sopenharmony_ci	ewq.msg.event = UFFD_EVENT_REMAP;
80262306a36Sopenharmony_ci	ewq.msg.arg.remap.from = from;
80362306a36Sopenharmony_ci	ewq.msg.arg.remap.to = to;
80462306a36Sopenharmony_ci	ewq.msg.arg.remap.len = len;
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_ci	userfaultfd_event_wait_completion(ctx, &ewq);
80762306a36Sopenharmony_ci}
80862306a36Sopenharmony_ci
80962306a36Sopenharmony_cibool userfaultfd_remove(struct vm_area_struct *vma,
81062306a36Sopenharmony_ci			unsigned long start, unsigned long end)
81162306a36Sopenharmony_ci{
81262306a36Sopenharmony_ci	struct mm_struct *mm = vma->vm_mm;
81362306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx;
81462306a36Sopenharmony_ci	struct userfaultfd_wait_queue ewq;
81562306a36Sopenharmony_ci
81662306a36Sopenharmony_ci	ctx = vma->vm_userfaultfd_ctx.ctx;
81762306a36Sopenharmony_ci	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
81862306a36Sopenharmony_ci		return true;
81962306a36Sopenharmony_ci
82062306a36Sopenharmony_ci	userfaultfd_ctx_get(ctx);
82162306a36Sopenharmony_ci	atomic_inc(&ctx->mmap_changing);
82262306a36Sopenharmony_ci	mmap_read_unlock(mm);
82362306a36Sopenharmony_ci
82462306a36Sopenharmony_ci	msg_init(&ewq.msg);
82562306a36Sopenharmony_ci
82662306a36Sopenharmony_ci	ewq.msg.event = UFFD_EVENT_REMOVE;
82762306a36Sopenharmony_ci	ewq.msg.arg.remove.start = start;
82862306a36Sopenharmony_ci	ewq.msg.arg.remove.end = end;
82962306a36Sopenharmony_ci
83062306a36Sopenharmony_ci	userfaultfd_event_wait_completion(ctx, &ewq);
83162306a36Sopenharmony_ci
83262306a36Sopenharmony_ci	return false;
83362306a36Sopenharmony_ci}
83462306a36Sopenharmony_ci
83562306a36Sopenharmony_cistatic bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
83662306a36Sopenharmony_ci			  unsigned long start, unsigned long end)
83762306a36Sopenharmony_ci{
83862306a36Sopenharmony_ci	struct userfaultfd_unmap_ctx *unmap_ctx;
83962306a36Sopenharmony_ci
84062306a36Sopenharmony_ci	list_for_each_entry(unmap_ctx, unmaps, list)
84162306a36Sopenharmony_ci		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
84262306a36Sopenharmony_ci		    unmap_ctx->end == end)
84362306a36Sopenharmony_ci			return true;
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci	return false;
84662306a36Sopenharmony_ci}
84762306a36Sopenharmony_ci
84862306a36Sopenharmony_ciint userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
84962306a36Sopenharmony_ci			   unsigned long end, struct list_head *unmaps)
85062306a36Sopenharmony_ci{
85162306a36Sopenharmony_ci	struct userfaultfd_unmap_ctx *unmap_ctx;
85262306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
85362306a36Sopenharmony_ci
85462306a36Sopenharmony_ci	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
85562306a36Sopenharmony_ci	    has_unmap_ctx(ctx, unmaps, start, end))
85662306a36Sopenharmony_ci		return 0;
85762306a36Sopenharmony_ci
85862306a36Sopenharmony_ci	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
85962306a36Sopenharmony_ci	if (!unmap_ctx)
86062306a36Sopenharmony_ci		return -ENOMEM;
86162306a36Sopenharmony_ci
86262306a36Sopenharmony_ci	userfaultfd_ctx_get(ctx);
86362306a36Sopenharmony_ci	atomic_inc(&ctx->mmap_changing);
86462306a36Sopenharmony_ci	unmap_ctx->ctx = ctx;
86562306a36Sopenharmony_ci	unmap_ctx->start = start;
86662306a36Sopenharmony_ci	unmap_ctx->end = end;
86762306a36Sopenharmony_ci	list_add_tail(&unmap_ctx->list, unmaps);
86862306a36Sopenharmony_ci
86962306a36Sopenharmony_ci	return 0;
87062306a36Sopenharmony_ci}
87162306a36Sopenharmony_ci
87262306a36Sopenharmony_civoid userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
87362306a36Sopenharmony_ci{
87462306a36Sopenharmony_ci	struct userfaultfd_unmap_ctx *ctx, *n;
87562306a36Sopenharmony_ci	struct userfaultfd_wait_queue ewq;
87662306a36Sopenharmony_ci
87762306a36Sopenharmony_ci	list_for_each_entry_safe(ctx, n, uf, list) {
87862306a36Sopenharmony_ci		msg_init(&ewq.msg);
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci		ewq.msg.event = UFFD_EVENT_UNMAP;
88162306a36Sopenharmony_ci		ewq.msg.arg.remove.start = ctx->start;
88262306a36Sopenharmony_ci		ewq.msg.arg.remove.end = ctx->end;
88362306a36Sopenharmony_ci
88462306a36Sopenharmony_ci		userfaultfd_event_wait_completion(ctx->ctx, &ewq);
88562306a36Sopenharmony_ci
88662306a36Sopenharmony_ci		list_del(&ctx->list);
88762306a36Sopenharmony_ci		kfree(ctx);
88862306a36Sopenharmony_ci	}
88962306a36Sopenharmony_ci}
89062306a36Sopenharmony_ci
89162306a36Sopenharmony_cistatic int userfaultfd_release(struct inode *inode, struct file *file)
89262306a36Sopenharmony_ci{
89362306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = file->private_data;
89462306a36Sopenharmony_ci	struct mm_struct *mm = ctx->mm;
89562306a36Sopenharmony_ci	struct vm_area_struct *vma, *prev;
89662306a36Sopenharmony_ci	/* len == 0 means wake all */
89762306a36Sopenharmony_ci	struct userfaultfd_wake_range range = { .len = 0, };
89862306a36Sopenharmony_ci	unsigned long new_flags;
89962306a36Sopenharmony_ci	VMA_ITERATOR(vmi, mm, 0);
90062306a36Sopenharmony_ci
90162306a36Sopenharmony_ci	WRITE_ONCE(ctx->released, true);
90262306a36Sopenharmony_ci
90362306a36Sopenharmony_ci	if (!mmget_not_zero(mm))
90462306a36Sopenharmony_ci		goto wakeup;
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci	/*
90762306a36Sopenharmony_ci	 * Flush page faults out of all CPUs. NOTE: all page faults
90862306a36Sopenharmony_ci	 * must be retried without returning VM_FAULT_SIGBUS if
90962306a36Sopenharmony_ci	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
91062306a36Sopenharmony_ci	 * changes while handle_userfault released the mmap_lock. So
91162306a36Sopenharmony_ci	 * it's critical that released is set to true (above), before
91262306a36Sopenharmony_ci	 * taking the mmap_lock for writing.
91362306a36Sopenharmony_ci	 */
91462306a36Sopenharmony_ci	mmap_write_lock(mm);
91562306a36Sopenharmony_ci	prev = NULL;
91662306a36Sopenharmony_ci	for_each_vma(vmi, vma) {
91762306a36Sopenharmony_ci		cond_resched();
91862306a36Sopenharmony_ci		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
91962306a36Sopenharmony_ci		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
92062306a36Sopenharmony_ci		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
92162306a36Sopenharmony_ci			prev = vma;
92262306a36Sopenharmony_ci			continue;
92362306a36Sopenharmony_ci		}
92462306a36Sopenharmony_ci		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
92562306a36Sopenharmony_ci		prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
92662306a36Sopenharmony_ci				 new_flags, vma->anon_vma,
92762306a36Sopenharmony_ci				 vma->vm_file, vma->vm_pgoff,
92862306a36Sopenharmony_ci				 vma_policy(vma),
92962306a36Sopenharmony_ci				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
93062306a36Sopenharmony_ci		if (prev) {
93162306a36Sopenharmony_ci			vma = prev;
93262306a36Sopenharmony_ci		} else {
93362306a36Sopenharmony_ci			prev = vma;
93462306a36Sopenharmony_ci		}
93562306a36Sopenharmony_ci
93662306a36Sopenharmony_ci		vma_start_write(vma);
93762306a36Sopenharmony_ci		userfaultfd_set_vm_flags(vma, new_flags);
93862306a36Sopenharmony_ci		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
93962306a36Sopenharmony_ci	}
94062306a36Sopenharmony_ci	mmap_write_unlock(mm);
94162306a36Sopenharmony_ci	mmput(mm);
94262306a36Sopenharmony_ciwakeup:
94362306a36Sopenharmony_ci	/*
94462306a36Sopenharmony_ci	 * After no new page faults can wait on this fault_*wqh, flush
94562306a36Sopenharmony_ci	 * the last page faults that may have been already waiting on
94662306a36Sopenharmony_ci	 * the fault_*wqh.
94762306a36Sopenharmony_ci	 */
94862306a36Sopenharmony_ci	spin_lock_irq(&ctx->fault_pending_wqh.lock);
94962306a36Sopenharmony_ci	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
95062306a36Sopenharmony_ci	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
95162306a36Sopenharmony_ci	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_ci	/* Flush pending events that may still wait on event_wqh */
95462306a36Sopenharmony_ci	wake_up_all(&ctx->event_wqh);
95562306a36Sopenharmony_ci
95662306a36Sopenharmony_ci	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
95762306a36Sopenharmony_ci	userfaultfd_ctx_put(ctx);
95862306a36Sopenharmony_ci	return 0;
95962306a36Sopenharmony_ci}
96062306a36Sopenharmony_ci
96162306a36Sopenharmony_ci/* fault_pending_wqh.lock must be hold by the caller */
96262306a36Sopenharmony_cistatic inline struct userfaultfd_wait_queue *find_userfault_in(
96362306a36Sopenharmony_ci		wait_queue_head_t *wqh)
96462306a36Sopenharmony_ci{
96562306a36Sopenharmony_ci	wait_queue_entry_t *wq;
96662306a36Sopenharmony_ci	struct userfaultfd_wait_queue *uwq;
96762306a36Sopenharmony_ci
96862306a36Sopenharmony_ci	lockdep_assert_held(&wqh->lock);
96962306a36Sopenharmony_ci
97062306a36Sopenharmony_ci	uwq = NULL;
97162306a36Sopenharmony_ci	if (!waitqueue_active(wqh))
97262306a36Sopenharmony_ci		goto out;
97362306a36Sopenharmony_ci	/* walk in reverse to provide FIFO behavior to read userfaults */
97462306a36Sopenharmony_ci	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
97562306a36Sopenharmony_ci	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
97662306a36Sopenharmony_ciout:
97762306a36Sopenharmony_ci	return uwq;
97862306a36Sopenharmony_ci}
97962306a36Sopenharmony_ci
98062306a36Sopenharmony_cistatic inline struct userfaultfd_wait_queue *find_userfault(
98162306a36Sopenharmony_ci		struct userfaultfd_ctx *ctx)
98262306a36Sopenharmony_ci{
98362306a36Sopenharmony_ci	return find_userfault_in(&ctx->fault_pending_wqh);
98462306a36Sopenharmony_ci}
98562306a36Sopenharmony_ci
98662306a36Sopenharmony_cistatic inline struct userfaultfd_wait_queue *find_userfault_evt(
98762306a36Sopenharmony_ci		struct userfaultfd_ctx *ctx)
98862306a36Sopenharmony_ci{
98962306a36Sopenharmony_ci	return find_userfault_in(&ctx->event_wqh);
99062306a36Sopenharmony_ci}
99162306a36Sopenharmony_ci
99262306a36Sopenharmony_cistatic __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
99362306a36Sopenharmony_ci{
99462306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = file->private_data;
99562306a36Sopenharmony_ci	__poll_t ret;
99662306a36Sopenharmony_ci
99762306a36Sopenharmony_ci	poll_wait(file, &ctx->fd_wqh, wait);
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_ci	if (!userfaultfd_is_initialized(ctx))
100062306a36Sopenharmony_ci		return EPOLLERR;
100162306a36Sopenharmony_ci
100262306a36Sopenharmony_ci	/*
100362306a36Sopenharmony_ci	 * poll() never guarantees that read won't block.
100462306a36Sopenharmony_ci	 * userfaults can be waken before they're read().
100562306a36Sopenharmony_ci	 */
100662306a36Sopenharmony_ci	if (unlikely(!(file->f_flags & O_NONBLOCK)))
100762306a36Sopenharmony_ci		return EPOLLERR;
100862306a36Sopenharmony_ci	/*
100962306a36Sopenharmony_ci	 * lockless access to see if there are pending faults
101062306a36Sopenharmony_ci	 * __pollwait last action is the add_wait_queue but
101162306a36Sopenharmony_ci	 * the spin_unlock would allow the waitqueue_active to
101262306a36Sopenharmony_ci	 * pass above the actual list_add inside
101362306a36Sopenharmony_ci	 * add_wait_queue critical section. So use a full
101462306a36Sopenharmony_ci	 * memory barrier to serialize the list_add write of
101562306a36Sopenharmony_ci	 * add_wait_queue() with the waitqueue_active read
101662306a36Sopenharmony_ci	 * below.
101762306a36Sopenharmony_ci	 */
101862306a36Sopenharmony_ci	ret = 0;
101962306a36Sopenharmony_ci	smp_mb();
102062306a36Sopenharmony_ci	if (waitqueue_active(&ctx->fault_pending_wqh))
102162306a36Sopenharmony_ci		ret = EPOLLIN;
102262306a36Sopenharmony_ci	else if (waitqueue_active(&ctx->event_wqh))
102362306a36Sopenharmony_ci		ret = EPOLLIN;
102462306a36Sopenharmony_ci
102562306a36Sopenharmony_ci	return ret;
102662306a36Sopenharmony_ci}
102762306a36Sopenharmony_ci
102862306a36Sopenharmony_cistatic const struct file_operations userfaultfd_fops;
102962306a36Sopenharmony_ci
103062306a36Sopenharmony_cistatic int resolve_userfault_fork(struct userfaultfd_ctx *new,
103162306a36Sopenharmony_ci				  struct inode *inode,
103262306a36Sopenharmony_ci				  struct uffd_msg *msg)
103362306a36Sopenharmony_ci{
103462306a36Sopenharmony_ci	int fd;
103562306a36Sopenharmony_ci
103662306a36Sopenharmony_ci	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
103762306a36Sopenharmony_ci			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
103862306a36Sopenharmony_ci	if (fd < 0)
103962306a36Sopenharmony_ci		return fd;
104062306a36Sopenharmony_ci
104162306a36Sopenharmony_ci	msg->arg.reserved.reserved1 = 0;
104262306a36Sopenharmony_ci	msg->arg.fork.ufd = fd;
104362306a36Sopenharmony_ci	return 0;
104462306a36Sopenharmony_ci}
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_cistatic ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
104762306a36Sopenharmony_ci				    struct uffd_msg *msg, struct inode *inode)
104862306a36Sopenharmony_ci{
104962306a36Sopenharmony_ci	ssize_t ret;
105062306a36Sopenharmony_ci	DECLARE_WAITQUEUE(wait, current);
105162306a36Sopenharmony_ci	struct userfaultfd_wait_queue *uwq;
105262306a36Sopenharmony_ci	/*
105362306a36Sopenharmony_ci	 * Handling fork event requires sleeping operations, so
105462306a36Sopenharmony_ci	 * we drop the event_wqh lock, then do these ops, then
105562306a36Sopenharmony_ci	 * lock it back and wake up the waiter. While the lock is
105662306a36Sopenharmony_ci	 * dropped the ewq may go away so we keep track of it
105762306a36Sopenharmony_ci	 * carefully.
105862306a36Sopenharmony_ci	 */
105962306a36Sopenharmony_ci	LIST_HEAD(fork_event);
106062306a36Sopenharmony_ci	struct userfaultfd_ctx *fork_nctx = NULL;
106162306a36Sopenharmony_ci
106262306a36Sopenharmony_ci	/* always take the fd_wqh lock before the fault_pending_wqh lock */
106362306a36Sopenharmony_ci	spin_lock_irq(&ctx->fd_wqh.lock);
106462306a36Sopenharmony_ci	__add_wait_queue(&ctx->fd_wqh, &wait);
106562306a36Sopenharmony_ci	for (;;) {
106662306a36Sopenharmony_ci		set_current_state(TASK_INTERRUPTIBLE);
106762306a36Sopenharmony_ci		spin_lock(&ctx->fault_pending_wqh.lock);
106862306a36Sopenharmony_ci		uwq = find_userfault(ctx);
106962306a36Sopenharmony_ci		if (uwq) {
107062306a36Sopenharmony_ci			/*
107162306a36Sopenharmony_ci			 * Use a seqcount to repeat the lockless check
107262306a36Sopenharmony_ci			 * in wake_userfault() to avoid missing
107362306a36Sopenharmony_ci			 * wakeups because during the refile both
107462306a36Sopenharmony_ci			 * waitqueue could become empty if this is the
107562306a36Sopenharmony_ci			 * only userfault.
107662306a36Sopenharmony_ci			 */
107762306a36Sopenharmony_ci			write_seqcount_begin(&ctx->refile_seq);
107862306a36Sopenharmony_ci
107962306a36Sopenharmony_ci			/*
108062306a36Sopenharmony_ci			 * The fault_pending_wqh.lock prevents the uwq
108162306a36Sopenharmony_ci			 * to disappear from under us.
108262306a36Sopenharmony_ci			 *
108362306a36Sopenharmony_ci			 * Refile this userfault from
108462306a36Sopenharmony_ci			 * fault_pending_wqh to fault_wqh, it's not
108562306a36Sopenharmony_ci			 * pending anymore after we read it.
108662306a36Sopenharmony_ci			 *
108762306a36Sopenharmony_ci			 * Use list_del() by hand (as
108862306a36Sopenharmony_ci			 * userfaultfd_wake_function also uses
108962306a36Sopenharmony_ci			 * list_del_init() by hand) to be sure nobody
109062306a36Sopenharmony_ci			 * changes __remove_wait_queue() to use
109162306a36Sopenharmony_ci			 * list_del_init() in turn breaking the
109262306a36Sopenharmony_ci			 * !list_empty_careful() check in
109362306a36Sopenharmony_ci			 * handle_userfault(). The uwq->wq.head list
109462306a36Sopenharmony_ci			 * must never be empty at any time during the
109562306a36Sopenharmony_ci			 * refile, or the waitqueue could disappear
109662306a36Sopenharmony_ci			 * from under us. The "wait_queue_head_t"
109762306a36Sopenharmony_ci			 * parameter of __remove_wait_queue() is unused
109862306a36Sopenharmony_ci			 * anyway.
109962306a36Sopenharmony_ci			 */
110062306a36Sopenharmony_ci			list_del(&uwq->wq.entry);
110162306a36Sopenharmony_ci			add_wait_queue(&ctx->fault_wqh, &uwq->wq);
110262306a36Sopenharmony_ci
110362306a36Sopenharmony_ci			write_seqcount_end(&ctx->refile_seq);
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ci			/* careful to always initialize msg if ret == 0 */
110662306a36Sopenharmony_ci			*msg = uwq->msg;
110762306a36Sopenharmony_ci			spin_unlock(&ctx->fault_pending_wqh.lock);
110862306a36Sopenharmony_ci			ret = 0;
110962306a36Sopenharmony_ci			break;
111062306a36Sopenharmony_ci		}
111162306a36Sopenharmony_ci		spin_unlock(&ctx->fault_pending_wqh.lock);
111262306a36Sopenharmony_ci
111362306a36Sopenharmony_ci		spin_lock(&ctx->event_wqh.lock);
111462306a36Sopenharmony_ci		uwq = find_userfault_evt(ctx);
111562306a36Sopenharmony_ci		if (uwq) {
111662306a36Sopenharmony_ci			*msg = uwq->msg;
111762306a36Sopenharmony_ci
111862306a36Sopenharmony_ci			if (uwq->msg.event == UFFD_EVENT_FORK) {
111962306a36Sopenharmony_ci				fork_nctx = (struct userfaultfd_ctx *)
112062306a36Sopenharmony_ci					(unsigned long)
112162306a36Sopenharmony_ci					uwq->msg.arg.reserved.reserved1;
112262306a36Sopenharmony_ci				list_move(&uwq->wq.entry, &fork_event);
112362306a36Sopenharmony_ci				/*
112462306a36Sopenharmony_ci				 * fork_nctx can be freed as soon as
112562306a36Sopenharmony_ci				 * we drop the lock, unless we take a
112662306a36Sopenharmony_ci				 * reference on it.
112762306a36Sopenharmony_ci				 */
112862306a36Sopenharmony_ci				userfaultfd_ctx_get(fork_nctx);
112962306a36Sopenharmony_ci				spin_unlock(&ctx->event_wqh.lock);
113062306a36Sopenharmony_ci				ret = 0;
113162306a36Sopenharmony_ci				break;
113262306a36Sopenharmony_ci			}
113362306a36Sopenharmony_ci
113462306a36Sopenharmony_ci			userfaultfd_event_complete(ctx, uwq);
113562306a36Sopenharmony_ci			spin_unlock(&ctx->event_wqh.lock);
113662306a36Sopenharmony_ci			ret = 0;
113762306a36Sopenharmony_ci			break;
113862306a36Sopenharmony_ci		}
113962306a36Sopenharmony_ci		spin_unlock(&ctx->event_wqh.lock);
114062306a36Sopenharmony_ci
114162306a36Sopenharmony_ci		if (signal_pending(current)) {
114262306a36Sopenharmony_ci			ret = -ERESTARTSYS;
114362306a36Sopenharmony_ci			break;
114462306a36Sopenharmony_ci		}
114562306a36Sopenharmony_ci		if (no_wait) {
114662306a36Sopenharmony_ci			ret = -EAGAIN;
114762306a36Sopenharmony_ci			break;
114862306a36Sopenharmony_ci		}
114962306a36Sopenharmony_ci		spin_unlock_irq(&ctx->fd_wqh.lock);
115062306a36Sopenharmony_ci		schedule();
115162306a36Sopenharmony_ci		spin_lock_irq(&ctx->fd_wqh.lock);
115262306a36Sopenharmony_ci	}
115362306a36Sopenharmony_ci	__remove_wait_queue(&ctx->fd_wqh, &wait);
115462306a36Sopenharmony_ci	__set_current_state(TASK_RUNNING);
115562306a36Sopenharmony_ci	spin_unlock_irq(&ctx->fd_wqh.lock);
115662306a36Sopenharmony_ci
115762306a36Sopenharmony_ci	if (!ret && msg->event == UFFD_EVENT_FORK) {
115862306a36Sopenharmony_ci		ret = resolve_userfault_fork(fork_nctx, inode, msg);
115962306a36Sopenharmony_ci		spin_lock_irq(&ctx->event_wqh.lock);
116062306a36Sopenharmony_ci		if (!list_empty(&fork_event)) {
116162306a36Sopenharmony_ci			/*
116262306a36Sopenharmony_ci			 * The fork thread didn't abort, so we can
116362306a36Sopenharmony_ci			 * drop the temporary refcount.
116462306a36Sopenharmony_ci			 */
116562306a36Sopenharmony_ci			userfaultfd_ctx_put(fork_nctx);
116662306a36Sopenharmony_ci
116762306a36Sopenharmony_ci			uwq = list_first_entry(&fork_event,
116862306a36Sopenharmony_ci					       typeof(*uwq),
116962306a36Sopenharmony_ci					       wq.entry);
117062306a36Sopenharmony_ci			/*
117162306a36Sopenharmony_ci			 * If fork_event list wasn't empty and in turn
117262306a36Sopenharmony_ci			 * the event wasn't already released by fork
117362306a36Sopenharmony_ci			 * (the event is allocated on fork kernel
117462306a36Sopenharmony_ci			 * stack), put the event back to its place in
117562306a36Sopenharmony_ci			 * the event_wq. fork_event head will be freed
117662306a36Sopenharmony_ci			 * as soon as we return so the event cannot
117762306a36Sopenharmony_ci			 * stay queued there no matter the current
117862306a36Sopenharmony_ci			 * "ret" value.
117962306a36Sopenharmony_ci			 */
118062306a36Sopenharmony_ci			list_del(&uwq->wq.entry);
118162306a36Sopenharmony_ci			__add_wait_queue(&ctx->event_wqh, &uwq->wq);
118262306a36Sopenharmony_ci
118362306a36Sopenharmony_ci			/*
118462306a36Sopenharmony_ci			 * Leave the event in the waitqueue and report
118562306a36Sopenharmony_ci			 * error to userland if we failed to resolve
118662306a36Sopenharmony_ci			 * the userfault fork.
118762306a36Sopenharmony_ci			 */
118862306a36Sopenharmony_ci			if (likely(!ret))
118962306a36Sopenharmony_ci				userfaultfd_event_complete(ctx, uwq);
119062306a36Sopenharmony_ci		} else {
119162306a36Sopenharmony_ci			/*
119262306a36Sopenharmony_ci			 * Here the fork thread aborted and the
119362306a36Sopenharmony_ci			 * refcount from the fork thread on fork_nctx
119462306a36Sopenharmony_ci			 * has already been released. We still hold
119562306a36Sopenharmony_ci			 * the reference we took before releasing the
119662306a36Sopenharmony_ci			 * lock above. If resolve_userfault_fork
119762306a36Sopenharmony_ci			 * failed we've to drop it because the
119862306a36Sopenharmony_ci			 * fork_nctx has to be freed in such case. If
119962306a36Sopenharmony_ci			 * it succeeded we'll hold it because the new
120062306a36Sopenharmony_ci			 * uffd references it.
120162306a36Sopenharmony_ci			 */
120262306a36Sopenharmony_ci			if (ret)
120362306a36Sopenharmony_ci				userfaultfd_ctx_put(fork_nctx);
120462306a36Sopenharmony_ci		}
120562306a36Sopenharmony_ci		spin_unlock_irq(&ctx->event_wqh.lock);
120662306a36Sopenharmony_ci	}
120762306a36Sopenharmony_ci
120862306a36Sopenharmony_ci	return ret;
120962306a36Sopenharmony_ci}
121062306a36Sopenharmony_ci
/*
 * read(2) handler for a userfaultfd: dequeue pending fault/event
 * messages into the userspace buffer, one whole struct uffd_msg at a
 * time.  Only the very first message may block (and only without
 * O_NONBLOCK); once at least one message has been copied out, any
 * later error results in a short read instead.
 */
static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;
	struct inode *inode = file_inode(file);

	/* reads are rejected until the UFFDIO_API handshake completed */
	if (!userfaultfd_is_initialized(ctx))
		return -EINVAL;

	for (;;) {
		/* never return a partial uffd_msg */
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow to read more than one fault at time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}
124162306a36Sopenharmony_ci
/*
 * Wake every userfault waiter whose address falls inside @range, on
 * both queues: fault_pending_wqh (faults not yet read by userland,
 * woken with autoremove under the held lock) and fault_wqh (faults
 * already read, waiting for resolution).  Both wakeups are issued
 * under fault_pending_wqh.lock.
 */
static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}
125462306a36Sopenharmony_ci
/*
 * Fast-path wrapper around __wake_userfault(): avoid taking any
 * waitqueue lock when both waitqueues are observably empty.  The
 * refile_seq retry loop protects the lockless emptiness check —
 * NOTE(review): presumably refile_seq is bumped while a waiter is
 * moved between fault_pending_wqh and fault_wqh, during which both
 * queues could transiently look empty; confirm against the
 * write_seqcount users of refile_seq.
 */
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or mmap_read_unlock(mm) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we've userfaults to wake.
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}
128562306a36Sopenharmony_ci
128662306a36Sopenharmony_cistatic __always_inline int validate_unaligned_range(
128762306a36Sopenharmony_ci	struct mm_struct *mm, __u64 start, __u64 len)
128862306a36Sopenharmony_ci{
128962306a36Sopenharmony_ci	__u64 task_size = mm->task_size;
129062306a36Sopenharmony_ci
129162306a36Sopenharmony_ci	if (len & ~PAGE_MASK)
129262306a36Sopenharmony_ci		return -EINVAL;
129362306a36Sopenharmony_ci	if (!len)
129462306a36Sopenharmony_ci		return -EINVAL;
129562306a36Sopenharmony_ci	if (start < mmap_min_addr)
129662306a36Sopenharmony_ci		return -EINVAL;
129762306a36Sopenharmony_ci	if (start >= task_size)
129862306a36Sopenharmony_ci		return -EINVAL;
129962306a36Sopenharmony_ci	if (len > task_size - start)
130062306a36Sopenharmony_ci		return -EINVAL;
130162306a36Sopenharmony_ci	if (start + len <= start)
130262306a36Sopenharmony_ci		return -EINVAL;
130362306a36Sopenharmony_ci	return 0;
130462306a36Sopenharmony_ci}
130562306a36Sopenharmony_ci
130662306a36Sopenharmony_cistatic __always_inline int validate_range(struct mm_struct *mm,
130762306a36Sopenharmony_ci					  __u64 start, __u64 len)
130862306a36Sopenharmony_ci{
130962306a36Sopenharmony_ci	if (start & ~PAGE_MASK)
131062306a36Sopenharmony_ci		return -EINVAL;
131162306a36Sopenharmony_ci
131262306a36Sopenharmony_ci	return validate_unaligned_range(mm, start, len);
131362306a36Sopenharmony_ci}
131462306a36Sopenharmony_ci
/*
 * UFFDIO_REGISTER ioctl: attach this userfaultfd context to every vma
 * overlapping the userspace-supplied range, so missing/wp/minor
 * faults there get reported through this uffd.  Runs two passes under
 * the mmap write lock: first validate every overlapping vma, then
 * merge/split vmas as needed and stamp them with the context.  On
 * success, the bitmask of range ioctls guaranteed to work on the
 * range is written back to user_uffdio_register->ioctls.
 */
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;
	struct vma_iterator vmi;
	pgoff_t pgoff;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	/* don't copy in the last field "ioctls": it's output-only */
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
		goto out;
	/* translate the uffdio register modes into VM_UFFD_* vma flags */
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
		/* write-protect mode requires arch support */
		goto out;
#endif
		vm_flags |= VM_UFFD_WP;
	}
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		/* minor-fault mode requires arch support */
		goto out;
#endif
		vm_flags |= VM_UFFD_MINOR;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	mmap_write_lock(mm);
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	basic_ioctls = false;
	cur = vma;
	do {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/* check not compatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED, the process has write permission to the backing
		 * file. If VM_MAYWRITE is set it also enforces that on a
		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
		 * F_WRITE_SEAL can be taken until the vma is destroyed.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains ending address, and huge pages
		 * check alignment.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		/*
		 * NOTE(review): this condition looks unreachable — the
		 * unconditional VM_MAYWRITE check above already bailed
		 * out; kept as-is for safety.
		 */
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	/* second pass: actually register the whole range */
	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }),
				 anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the mas */
			vma = prev;
			goto next;
		}
		/* couldn't merge: split so only [start, vma_end) is touched */
		if (vma->vm_start < start) {
			ret = split_vma(&vmi, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(&vmi, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx.ctx = ctx;

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}
155162306a36Sopenharmony_ci
/*
 * UFFDIO_UNREGISTER ioctl: detach any userfaultfd context from the
 * vmas overlapping the userspace-supplied range.  Pending userfaults
 * on MISSING-tracked vmas are woken so they don't hang forever, and
 * write-protect markers are cleared before the VM_UFFD_* flags are
 * removed and adjacent vmas re-merged.
 */
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;
	struct vma_iterator vmi;
	pgoff_t pgoff;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	ret = -EINVAL;
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	cur = vma;
	do {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/*
		 * Check not compatible vmas, not strictly required
		 * here as not compatible vmas cannot have an
		 * userfaultfd_ctx registered on them, but this
		 * provides for more strict behavior to notice
		 * unregistration errors.
		 */
		if (!vma_can_userfault(cur, cur->vm_flags))
			goto out_unlock;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	/* second pass: actually strip the uffd context off the range */
	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));

		/*
		 * Nothing to do: this vma is not registered with any
		 * userfaultfd, so there is nothing to detach.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and it avoids userland to call
			 * UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		/* Reset ptes for the whole vma range if wr-protected */
		if (userfaultfd_wp(vma))
			uffd_wp_range(vma, start, vma_end - start, false);

		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the iterator */
			vma = prev;
			goto next;
		}
		/* couldn't merge: split so only [start, vma_end) is touched */
		if (vma->vm_start < start) {
			ret = split_vma(&vmi, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(&vmi, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
out:
	return ret;
}
170762306a36Sopenharmony_ci
170862306a36Sopenharmony_ci/*
170962306a36Sopenharmony_ci * userfaultfd_wake may be used in combination with the
171062306a36Sopenharmony_ci * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
171162306a36Sopenharmony_ci */
171262306a36Sopenharmony_cistatic int userfaultfd_wake(struct userfaultfd_ctx *ctx,
171362306a36Sopenharmony_ci			    unsigned long arg)
171462306a36Sopenharmony_ci{
171562306a36Sopenharmony_ci	int ret;
171662306a36Sopenharmony_ci	struct uffdio_range uffdio_wake;
171762306a36Sopenharmony_ci	struct userfaultfd_wake_range range;
171862306a36Sopenharmony_ci	const void __user *buf = (void __user *)arg;
171962306a36Sopenharmony_ci
172062306a36Sopenharmony_ci	ret = -EFAULT;
172162306a36Sopenharmony_ci	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
172262306a36Sopenharmony_ci		goto out;
172362306a36Sopenharmony_ci
172462306a36Sopenharmony_ci	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
172562306a36Sopenharmony_ci	if (ret)
172662306a36Sopenharmony_ci		goto out;
172762306a36Sopenharmony_ci
172862306a36Sopenharmony_ci	range.start = uffdio_wake.start;
172962306a36Sopenharmony_ci	range.len = uffdio_wake.len;
173062306a36Sopenharmony_ci
173162306a36Sopenharmony_ci	/*
173262306a36Sopenharmony_ci	 * len == 0 means wake all and we don't want to wake all here,
173362306a36Sopenharmony_ci	 * so check it again to be sure.
173462306a36Sopenharmony_ci	 */
173562306a36Sopenharmony_ci	VM_BUG_ON(!range.len);
173662306a36Sopenharmony_ci
173762306a36Sopenharmony_ci	wake_userfault(ctx, &range);
173862306a36Sopenharmony_ci	ret = 0;
173962306a36Sopenharmony_ci
174062306a36Sopenharmony_ciout:
174162306a36Sopenharmony_ci	return ret;
174262306a36Sopenharmony_ci}
174362306a36Sopenharmony_ci
/*
 * UFFDIO_COPY ioctl: atomically fill the registered dst range with a
 * copy of the userland src range, then (unless DONTWAKE) wake the
 * waiters covering the bytes that were filled.  The number of bytes
 * copied (or a negative errno) is written back to
 * user_uffdio_copy->copy; the ioctl itself returns 0 only when the
 * whole request was satisfied.
 */
static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;
	uffd_flags_t flags = 0;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;

	ret = -EAGAIN;
	/*
	 * mmap_changing non-zero means the address space is being
	 * modified concurrently (presumably fork/mremap/etc. — see the
	 * mmap_changing writers); userland must retry.
	 */
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out;

	/* src is validated without requiring page alignment of start */
	ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
				       uffdio_copy.len);
	if (ret)
		goto out;
	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
		goto out;
	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
		flags |= MFILL_ATOMIC_WP;
	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
					uffdio_copy.len, &ctx->mmap_changing,
					flags);
		mmput(ctx->mm);
	} else {
		/* the mm users already dropped to zero: nothing to fill */
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	/* a short copy tells userland to retry the remainder */
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}
180162306a36Sopenharmony_ci
/*
 * UFFDIO_ZEROPAGE ioctl: atomically map zero pages over the
 * registered range, then (unless DONTWAKE) wake the waiters covering
 * the bytes that were filled.  The number of bytes zeroed (or a
 * negative errno) is written back to user_uffdio_zeropage->zeropage;
 * the ioctl returns 0 only when the whole request was satisfied.
 */
static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

	ret = -EAGAIN;
	/* a concurrent address-space change is in flight: retry */
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
					   uffdio_zeropage.range.len,
					   &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		/* the mm users already dropped to zero: nothing to fill */
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	/* a short fill tells userland to retry the remainder */
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
}
185362306a36Sopenharmony_ci
185462306a36Sopenharmony_cistatic int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
185562306a36Sopenharmony_ci				    unsigned long arg)
185662306a36Sopenharmony_ci{
185762306a36Sopenharmony_ci	int ret;
185862306a36Sopenharmony_ci	struct uffdio_writeprotect uffdio_wp;
185962306a36Sopenharmony_ci	struct uffdio_writeprotect __user *user_uffdio_wp;
186062306a36Sopenharmony_ci	struct userfaultfd_wake_range range;
186162306a36Sopenharmony_ci	bool mode_wp, mode_dontwake;
186262306a36Sopenharmony_ci
186362306a36Sopenharmony_ci	if (atomic_read(&ctx->mmap_changing))
186462306a36Sopenharmony_ci		return -EAGAIN;
186562306a36Sopenharmony_ci
186662306a36Sopenharmony_ci	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;
186762306a36Sopenharmony_ci
186862306a36Sopenharmony_ci	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
186962306a36Sopenharmony_ci			   sizeof(struct uffdio_writeprotect)))
187062306a36Sopenharmony_ci		return -EFAULT;
187162306a36Sopenharmony_ci
187262306a36Sopenharmony_ci	ret = validate_range(ctx->mm, uffdio_wp.range.start,
187362306a36Sopenharmony_ci			     uffdio_wp.range.len);
187462306a36Sopenharmony_ci	if (ret)
187562306a36Sopenharmony_ci		return ret;
187662306a36Sopenharmony_ci
187762306a36Sopenharmony_ci	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
187862306a36Sopenharmony_ci			       UFFDIO_WRITEPROTECT_MODE_WP))
187962306a36Sopenharmony_ci		return -EINVAL;
188062306a36Sopenharmony_ci
188162306a36Sopenharmony_ci	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
188262306a36Sopenharmony_ci	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
188362306a36Sopenharmony_ci
188462306a36Sopenharmony_ci	if (mode_wp && mode_dontwake)
188562306a36Sopenharmony_ci		return -EINVAL;
188662306a36Sopenharmony_ci
188762306a36Sopenharmony_ci	if (mmget_not_zero(ctx->mm)) {
188862306a36Sopenharmony_ci		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
188962306a36Sopenharmony_ci					  uffdio_wp.range.len, mode_wp,
189062306a36Sopenharmony_ci					  &ctx->mmap_changing);
189162306a36Sopenharmony_ci		mmput(ctx->mm);
189262306a36Sopenharmony_ci	} else {
189362306a36Sopenharmony_ci		return -ESRCH;
189462306a36Sopenharmony_ci	}
189562306a36Sopenharmony_ci
189662306a36Sopenharmony_ci	if (ret)
189762306a36Sopenharmony_ci		return ret;
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_ci	if (!mode_wp && !mode_dontwake) {
190062306a36Sopenharmony_ci		range.start = uffdio_wp.range.start;
190162306a36Sopenharmony_ci		range.len = uffdio_wp.range.len;
190262306a36Sopenharmony_ci		wake_userfault(ctx, &range);
190362306a36Sopenharmony_ci	}
190462306a36Sopenharmony_ci	return ret;
190562306a36Sopenharmony_ci}
190662306a36Sopenharmony_ci
190762306a36Sopenharmony_cistatic int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
190862306a36Sopenharmony_ci{
190962306a36Sopenharmony_ci	__s64 ret;
191062306a36Sopenharmony_ci	struct uffdio_continue uffdio_continue;
191162306a36Sopenharmony_ci	struct uffdio_continue __user *user_uffdio_continue;
191262306a36Sopenharmony_ci	struct userfaultfd_wake_range range;
191362306a36Sopenharmony_ci	uffd_flags_t flags = 0;
191462306a36Sopenharmony_ci
191562306a36Sopenharmony_ci	user_uffdio_continue = (struct uffdio_continue __user *)arg;
191662306a36Sopenharmony_ci
191762306a36Sopenharmony_ci	ret = -EAGAIN;
191862306a36Sopenharmony_ci	if (atomic_read(&ctx->mmap_changing))
191962306a36Sopenharmony_ci		goto out;
192062306a36Sopenharmony_ci
192162306a36Sopenharmony_ci	ret = -EFAULT;
192262306a36Sopenharmony_ci	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
192362306a36Sopenharmony_ci			   /* don't copy the output fields */
192462306a36Sopenharmony_ci			   sizeof(uffdio_continue) - (sizeof(__s64))))
192562306a36Sopenharmony_ci		goto out;
192662306a36Sopenharmony_ci
192762306a36Sopenharmony_ci	ret = validate_range(ctx->mm, uffdio_continue.range.start,
192862306a36Sopenharmony_ci			     uffdio_continue.range.len);
192962306a36Sopenharmony_ci	if (ret)
193062306a36Sopenharmony_ci		goto out;
193162306a36Sopenharmony_ci
193262306a36Sopenharmony_ci	ret = -EINVAL;
193362306a36Sopenharmony_ci	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
193462306a36Sopenharmony_ci				     UFFDIO_CONTINUE_MODE_WP))
193562306a36Sopenharmony_ci		goto out;
193662306a36Sopenharmony_ci	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
193762306a36Sopenharmony_ci		flags |= MFILL_ATOMIC_WP;
193862306a36Sopenharmony_ci
193962306a36Sopenharmony_ci	if (mmget_not_zero(ctx->mm)) {
194062306a36Sopenharmony_ci		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
194162306a36Sopenharmony_ci					    uffdio_continue.range.len,
194262306a36Sopenharmony_ci					    &ctx->mmap_changing, flags);
194362306a36Sopenharmony_ci		mmput(ctx->mm);
194462306a36Sopenharmony_ci	} else {
194562306a36Sopenharmony_ci		return -ESRCH;
194662306a36Sopenharmony_ci	}
194762306a36Sopenharmony_ci
194862306a36Sopenharmony_ci	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
194962306a36Sopenharmony_ci		return -EFAULT;
195062306a36Sopenharmony_ci	if (ret < 0)
195162306a36Sopenharmony_ci		goto out;
195262306a36Sopenharmony_ci
195362306a36Sopenharmony_ci	/* len == 0 would wake all */
195462306a36Sopenharmony_ci	BUG_ON(!ret);
195562306a36Sopenharmony_ci	range.len = ret;
195662306a36Sopenharmony_ci	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
195762306a36Sopenharmony_ci		range.start = uffdio_continue.range.start;
195862306a36Sopenharmony_ci		wake_userfault(ctx, &range);
195962306a36Sopenharmony_ci	}
196062306a36Sopenharmony_ci	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
196162306a36Sopenharmony_ci
196262306a36Sopenharmony_ciout:
196362306a36Sopenharmony_ci	return ret;
196462306a36Sopenharmony_ci}
196562306a36Sopenharmony_ci
196662306a36Sopenharmony_cistatic inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
196762306a36Sopenharmony_ci{
196862306a36Sopenharmony_ci	__s64 ret;
196962306a36Sopenharmony_ci	struct uffdio_poison uffdio_poison;
197062306a36Sopenharmony_ci	struct uffdio_poison __user *user_uffdio_poison;
197162306a36Sopenharmony_ci	struct userfaultfd_wake_range range;
197262306a36Sopenharmony_ci
197362306a36Sopenharmony_ci	user_uffdio_poison = (struct uffdio_poison __user *)arg;
197462306a36Sopenharmony_ci
197562306a36Sopenharmony_ci	ret = -EAGAIN;
197662306a36Sopenharmony_ci	if (atomic_read(&ctx->mmap_changing))
197762306a36Sopenharmony_ci		goto out;
197862306a36Sopenharmony_ci
197962306a36Sopenharmony_ci	ret = -EFAULT;
198062306a36Sopenharmony_ci	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
198162306a36Sopenharmony_ci			   /* don't copy the output fields */
198262306a36Sopenharmony_ci			   sizeof(uffdio_poison) - (sizeof(__s64))))
198362306a36Sopenharmony_ci		goto out;
198462306a36Sopenharmony_ci
198562306a36Sopenharmony_ci	ret = validate_range(ctx->mm, uffdio_poison.range.start,
198662306a36Sopenharmony_ci			     uffdio_poison.range.len);
198762306a36Sopenharmony_ci	if (ret)
198862306a36Sopenharmony_ci		goto out;
198962306a36Sopenharmony_ci
199062306a36Sopenharmony_ci	ret = -EINVAL;
199162306a36Sopenharmony_ci	if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
199262306a36Sopenharmony_ci		goto out;
199362306a36Sopenharmony_ci
199462306a36Sopenharmony_ci	if (mmget_not_zero(ctx->mm)) {
199562306a36Sopenharmony_ci		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
199662306a36Sopenharmony_ci					  uffdio_poison.range.len,
199762306a36Sopenharmony_ci					  &ctx->mmap_changing, 0);
199862306a36Sopenharmony_ci		mmput(ctx->mm);
199962306a36Sopenharmony_ci	} else {
200062306a36Sopenharmony_ci		return -ESRCH;
200162306a36Sopenharmony_ci	}
200262306a36Sopenharmony_ci
200362306a36Sopenharmony_ci	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
200462306a36Sopenharmony_ci		return -EFAULT;
200562306a36Sopenharmony_ci	if (ret < 0)
200662306a36Sopenharmony_ci		goto out;
200762306a36Sopenharmony_ci
200862306a36Sopenharmony_ci	/* len == 0 would wake all */
200962306a36Sopenharmony_ci	BUG_ON(!ret);
201062306a36Sopenharmony_ci	range.len = ret;
201162306a36Sopenharmony_ci	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
201262306a36Sopenharmony_ci		range.start = uffdio_poison.range.start;
201362306a36Sopenharmony_ci		wake_userfault(ctx, &range);
201462306a36Sopenharmony_ci	}
201562306a36Sopenharmony_ci	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
201662306a36Sopenharmony_ci
201762306a36Sopenharmony_ciout:
201862306a36Sopenharmony_ci	return ret;
201962306a36Sopenharmony_ci}
202062306a36Sopenharmony_ci
202162306a36Sopenharmony_cistatic inline unsigned int uffd_ctx_features(__u64 user_features)
202262306a36Sopenharmony_ci{
202362306a36Sopenharmony_ci	/*
202462306a36Sopenharmony_ci	 * For the current set of features the bits just coincide. Set
202562306a36Sopenharmony_ci	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
202662306a36Sopenharmony_ci	 */
202762306a36Sopenharmony_ci	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
202862306a36Sopenharmony_ci}
202962306a36Sopenharmony_ci
203062306a36Sopenharmony_ci/*
203162306a36Sopenharmony_ci * userland asks for a certain API version and we return which bits
203262306a36Sopenharmony_ci * and ioctl commands are implemented in this kernel for such API
203362306a36Sopenharmony_ci * version or -EINVAL if unknown.
203462306a36Sopenharmony_ci */
203562306a36Sopenharmony_cistatic int userfaultfd_api(struct userfaultfd_ctx *ctx,
203662306a36Sopenharmony_ci			   unsigned long arg)
203762306a36Sopenharmony_ci{
203862306a36Sopenharmony_ci	struct uffdio_api uffdio_api;
203962306a36Sopenharmony_ci	void __user *buf = (void __user *)arg;
204062306a36Sopenharmony_ci	unsigned int ctx_features;
204162306a36Sopenharmony_ci	int ret;
204262306a36Sopenharmony_ci	__u64 features;
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci	ret = -EFAULT;
204562306a36Sopenharmony_ci	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
204662306a36Sopenharmony_ci		goto out;
204762306a36Sopenharmony_ci	features = uffdio_api.features;
204862306a36Sopenharmony_ci	ret = -EINVAL;
204962306a36Sopenharmony_ci	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
205062306a36Sopenharmony_ci		goto err_out;
205162306a36Sopenharmony_ci	ret = -EPERM;
205262306a36Sopenharmony_ci	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
205362306a36Sopenharmony_ci		goto err_out;
205462306a36Sopenharmony_ci	/* report all available features and ioctls to userland */
205562306a36Sopenharmony_ci	uffdio_api.features = UFFD_API_FEATURES;
205662306a36Sopenharmony_ci#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
205762306a36Sopenharmony_ci	uffdio_api.features &=
205862306a36Sopenharmony_ci		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
205962306a36Sopenharmony_ci#endif
206062306a36Sopenharmony_ci#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
206162306a36Sopenharmony_ci	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
206262306a36Sopenharmony_ci#endif
206362306a36Sopenharmony_ci#ifndef CONFIG_PTE_MARKER_UFFD_WP
206462306a36Sopenharmony_ci	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
206562306a36Sopenharmony_ci	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
206662306a36Sopenharmony_ci#endif
206762306a36Sopenharmony_ci	uffdio_api.ioctls = UFFD_API_IOCTLS;
206862306a36Sopenharmony_ci	ret = -EFAULT;
206962306a36Sopenharmony_ci	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
207062306a36Sopenharmony_ci		goto out;
207162306a36Sopenharmony_ci
207262306a36Sopenharmony_ci	/* only enable the requested features for this uffd context */
207362306a36Sopenharmony_ci	ctx_features = uffd_ctx_features(features);
207462306a36Sopenharmony_ci	ret = -EINVAL;
207562306a36Sopenharmony_ci	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
207662306a36Sopenharmony_ci		goto err_out;
207762306a36Sopenharmony_ci
207862306a36Sopenharmony_ci	ret = 0;
207962306a36Sopenharmony_ciout:
208062306a36Sopenharmony_ci	return ret;
208162306a36Sopenharmony_cierr_out:
208262306a36Sopenharmony_ci	memset(&uffdio_api, 0, sizeof(uffdio_api));
208362306a36Sopenharmony_ci	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
208462306a36Sopenharmony_ci		ret = -EFAULT;
208562306a36Sopenharmony_ci	goto out;
208662306a36Sopenharmony_ci}
208762306a36Sopenharmony_ci
208862306a36Sopenharmony_cistatic long userfaultfd_ioctl(struct file *file, unsigned cmd,
208962306a36Sopenharmony_ci			      unsigned long arg)
209062306a36Sopenharmony_ci{
209162306a36Sopenharmony_ci	int ret = -EINVAL;
209262306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = file->private_data;
209362306a36Sopenharmony_ci
209462306a36Sopenharmony_ci	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
209562306a36Sopenharmony_ci		return -EINVAL;
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ci	switch(cmd) {
209862306a36Sopenharmony_ci	case UFFDIO_API:
209962306a36Sopenharmony_ci		ret = userfaultfd_api(ctx, arg);
210062306a36Sopenharmony_ci		break;
210162306a36Sopenharmony_ci	case UFFDIO_REGISTER:
210262306a36Sopenharmony_ci		ret = userfaultfd_register(ctx, arg);
210362306a36Sopenharmony_ci		break;
210462306a36Sopenharmony_ci	case UFFDIO_UNREGISTER:
210562306a36Sopenharmony_ci		ret = userfaultfd_unregister(ctx, arg);
210662306a36Sopenharmony_ci		break;
210762306a36Sopenharmony_ci	case UFFDIO_WAKE:
210862306a36Sopenharmony_ci		ret = userfaultfd_wake(ctx, arg);
210962306a36Sopenharmony_ci		break;
211062306a36Sopenharmony_ci	case UFFDIO_COPY:
211162306a36Sopenharmony_ci		ret = userfaultfd_copy(ctx, arg);
211262306a36Sopenharmony_ci		break;
211362306a36Sopenharmony_ci	case UFFDIO_ZEROPAGE:
211462306a36Sopenharmony_ci		ret = userfaultfd_zeropage(ctx, arg);
211562306a36Sopenharmony_ci		break;
211662306a36Sopenharmony_ci	case UFFDIO_WRITEPROTECT:
211762306a36Sopenharmony_ci		ret = userfaultfd_writeprotect(ctx, arg);
211862306a36Sopenharmony_ci		break;
211962306a36Sopenharmony_ci	case UFFDIO_CONTINUE:
212062306a36Sopenharmony_ci		ret = userfaultfd_continue(ctx, arg);
212162306a36Sopenharmony_ci		break;
212262306a36Sopenharmony_ci	case UFFDIO_POISON:
212362306a36Sopenharmony_ci		ret = userfaultfd_poison(ctx, arg);
212462306a36Sopenharmony_ci		break;
212562306a36Sopenharmony_ci	}
212662306a36Sopenharmony_ci	return ret;
212762306a36Sopenharmony_ci}
212862306a36Sopenharmony_ci
212962306a36Sopenharmony_ci#ifdef CONFIG_PROC_FS
213062306a36Sopenharmony_cistatic void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
213162306a36Sopenharmony_ci{
213262306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = f->private_data;
213362306a36Sopenharmony_ci	wait_queue_entry_t *wq;
213462306a36Sopenharmony_ci	unsigned long pending = 0, total = 0;
213562306a36Sopenharmony_ci
213662306a36Sopenharmony_ci	spin_lock_irq(&ctx->fault_pending_wqh.lock);
213762306a36Sopenharmony_ci	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
213862306a36Sopenharmony_ci		pending++;
213962306a36Sopenharmony_ci		total++;
214062306a36Sopenharmony_ci	}
214162306a36Sopenharmony_ci	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
214262306a36Sopenharmony_ci		total++;
214362306a36Sopenharmony_ci	}
214462306a36Sopenharmony_ci	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
214562306a36Sopenharmony_ci
214662306a36Sopenharmony_ci	/*
214762306a36Sopenharmony_ci	 * If more protocols will be added, there will be all shown
214862306a36Sopenharmony_ci	 * separated by a space. Like this:
214962306a36Sopenharmony_ci	 *	protocols: aa:... bb:...
215062306a36Sopenharmony_ci	 */
215162306a36Sopenharmony_ci	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
215262306a36Sopenharmony_ci		   pending, total, UFFD_API, ctx->features,
215362306a36Sopenharmony_ci		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
215462306a36Sopenharmony_ci}
215562306a36Sopenharmony_ci#endif
215662306a36Sopenharmony_ci
215762306a36Sopenharmony_cistatic const struct file_operations userfaultfd_fops = {
215862306a36Sopenharmony_ci#ifdef CONFIG_PROC_FS
215962306a36Sopenharmony_ci	.show_fdinfo	= userfaultfd_show_fdinfo,
216062306a36Sopenharmony_ci#endif
216162306a36Sopenharmony_ci	.release	= userfaultfd_release,
216262306a36Sopenharmony_ci	.poll		= userfaultfd_poll,
216362306a36Sopenharmony_ci	.read		= userfaultfd_read,
216462306a36Sopenharmony_ci	.unlocked_ioctl = userfaultfd_ioctl,
216562306a36Sopenharmony_ci	.compat_ioctl	= compat_ptr_ioctl,
216662306a36Sopenharmony_ci	.llseek		= noop_llseek,
216762306a36Sopenharmony_ci};
216862306a36Sopenharmony_ci
216962306a36Sopenharmony_cistatic void init_once_userfaultfd_ctx(void *mem)
217062306a36Sopenharmony_ci{
217162306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;
217262306a36Sopenharmony_ci
217362306a36Sopenharmony_ci	init_waitqueue_head(&ctx->fault_pending_wqh);
217462306a36Sopenharmony_ci	init_waitqueue_head(&ctx->fault_wqh);
217562306a36Sopenharmony_ci	init_waitqueue_head(&ctx->event_wqh);
217662306a36Sopenharmony_ci	init_waitqueue_head(&ctx->fd_wqh);
217762306a36Sopenharmony_ci	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
217862306a36Sopenharmony_ci}
217962306a36Sopenharmony_ci
218062306a36Sopenharmony_cistatic int new_userfaultfd(int flags)
218162306a36Sopenharmony_ci{
218262306a36Sopenharmony_ci	struct userfaultfd_ctx *ctx;
218362306a36Sopenharmony_ci	int fd;
218462306a36Sopenharmony_ci
218562306a36Sopenharmony_ci	BUG_ON(!current->mm);
218662306a36Sopenharmony_ci
218762306a36Sopenharmony_ci	/* Check the UFFD_* constants for consistency.  */
218862306a36Sopenharmony_ci	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
218962306a36Sopenharmony_ci	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
219062306a36Sopenharmony_ci	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
219162306a36Sopenharmony_ci
219262306a36Sopenharmony_ci	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
219362306a36Sopenharmony_ci		return -EINVAL;
219462306a36Sopenharmony_ci
219562306a36Sopenharmony_ci	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
219662306a36Sopenharmony_ci	if (!ctx)
219762306a36Sopenharmony_ci		return -ENOMEM;
219862306a36Sopenharmony_ci
219962306a36Sopenharmony_ci	refcount_set(&ctx->refcount, 1);
220062306a36Sopenharmony_ci	ctx->flags = flags;
220162306a36Sopenharmony_ci	ctx->features = 0;
220262306a36Sopenharmony_ci	ctx->released = false;
220362306a36Sopenharmony_ci	atomic_set(&ctx->mmap_changing, 0);
220462306a36Sopenharmony_ci	ctx->mm = current->mm;
220562306a36Sopenharmony_ci	/* prevent the mm struct to be freed */
220662306a36Sopenharmony_ci	mmgrab(ctx->mm);
220762306a36Sopenharmony_ci
220862306a36Sopenharmony_ci	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
220962306a36Sopenharmony_ci			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
221062306a36Sopenharmony_ci	if (fd < 0) {
221162306a36Sopenharmony_ci		mmdrop(ctx->mm);
221262306a36Sopenharmony_ci		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
221362306a36Sopenharmony_ci	}
221462306a36Sopenharmony_ci	return fd;
221562306a36Sopenharmony_ci}
221662306a36Sopenharmony_ci
221762306a36Sopenharmony_cistatic inline bool userfaultfd_syscall_allowed(int flags)
221862306a36Sopenharmony_ci{
221962306a36Sopenharmony_ci	/* Userspace-only page faults are always allowed */
222062306a36Sopenharmony_ci	if (flags & UFFD_USER_MODE_ONLY)
222162306a36Sopenharmony_ci		return true;
222262306a36Sopenharmony_ci
222362306a36Sopenharmony_ci	/*
222462306a36Sopenharmony_ci	 * The user is requesting a userfaultfd which can handle kernel faults.
222562306a36Sopenharmony_ci	 * Privileged users are always allowed to do this.
222662306a36Sopenharmony_ci	 */
222762306a36Sopenharmony_ci	if (capable(CAP_SYS_PTRACE))
222862306a36Sopenharmony_ci		return true;
222962306a36Sopenharmony_ci
223062306a36Sopenharmony_ci	/* Otherwise, access to kernel fault handling is sysctl controlled. */
223162306a36Sopenharmony_ci	return sysctl_unprivileged_userfaultfd;
223262306a36Sopenharmony_ci}
223362306a36Sopenharmony_ci
223462306a36Sopenharmony_ciSYSCALL_DEFINE1(userfaultfd, int, flags)
223562306a36Sopenharmony_ci{
223662306a36Sopenharmony_ci	if (!userfaultfd_syscall_allowed(flags))
223762306a36Sopenharmony_ci		return -EPERM;
223862306a36Sopenharmony_ci
223962306a36Sopenharmony_ci	return new_userfaultfd(flags);
224062306a36Sopenharmony_ci}
224162306a36Sopenharmony_ci
224262306a36Sopenharmony_cistatic long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
224362306a36Sopenharmony_ci{
224462306a36Sopenharmony_ci	if (cmd != USERFAULTFD_IOC_NEW)
224562306a36Sopenharmony_ci		return -EINVAL;
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci	return new_userfaultfd(flags);
224862306a36Sopenharmony_ci}
224962306a36Sopenharmony_ci
225062306a36Sopenharmony_cistatic const struct file_operations userfaultfd_dev_fops = {
225162306a36Sopenharmony_ci	.unlocked_ioctl = userfaultfd_dev_ioctl,
225262306a36Sopenharmony_ci	.compat_ioctl = userfaultfd_dev_ioctl,
225362306a36Sopenharmony_ci	.owner = THIS_MODULE,
225462306a36Sopenharmony_ci	.llseek = noop_llseek,
225562306a36Sopenharmony_ci};
225662306a36Sopenharmony_ci
225762306a36Sopenharmony_cistatic struct miscdevice userfaultfd_misc = {
225862306a36Sopenharmony_ci	.minor = MISC_DYNAMIC_MINOR,
225962306a36Sopenharmony_ci	.name = "userfaultfd",
226062306a36Sopenharmony_ci	.fops = &userfaultfd_dev_fops
226162306a36Sopenharmony_ci};
226262306a36Sopenharmony_ci
226362306a36Sopenharmony_cistatic int __init userfaultfd_init(void)
226462306a36Sopenharmony_ci{
226562306a36Sopenharmony_ci	int ret;
226662306a36Sopenharmony_ci
226762306a36Sopenharmony_ci	ret = misc_register(&userfaultfd_misc);
226862306a36Sopenharmony_ci	if (ret)
226962306a36Sopenharmony_ci		return ret;
227062306a36Sopenharmony_ci
227162306a36Sopenharmony_ci	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
227262306a36Sopenharmony_ci						sizeof(struct userfaultfd_ctx),
227362306a36Sopenharmony_ci						0,
227462306a36Sopenharmony_ci						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
227562306a36Sopenharmony_ci						init_once_userfaultfd_ctx);
227662306a36Sopenharmony_ci#ifdef CONFIG_SYSCTL
227762306a36Sopenharmony_ci	register_sysctl_init("vm", vm_userfaultfd_table);
227862306a36Sopenharmony_ci#endif
227962306a36Sopenharmony_ci	return 0;
228062306a36Sopenharmony_ci}
228162306a36Sopenharmony_ci__initcall(userfaultfd_init);
2282