// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/userfaultfd.c
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Copyright (C) 2015 Red Hat, Inc.
 *
 * Some part derived from fs/eventfd.c (anon inode setup) and
 * mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/miscdevice.h>

/*
 * When 0 (the default), unprivileged users may not create userfaultfds;
 * toggled via the "vm.unprivileged_userfaultfd" sysctl below.
 */
static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
static struct ctl_table vm_userfaultfd_table[] = {
	{
		.procname	= "unprivileged_userfaultfd",
		.data		= &sysctl_unprivileged_userfaultfd,
		.maxlen		= sizeof(sysctl_unprivileged_userfaultfd),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		/* clamp writes to the boolean range [0, 1] */
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ }
};
#endif

/* slab cache for struct userfaultfd_ctx allocations */
static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	atomic_t mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

/* Pairs a parent ctx with the new ctx created for a forked child. */
struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

/* Records an address range being unmapped, for UFFD_EVENT_UNMAP delivery. */
struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

/* Per-waiter entry queued on the ctx waitqueues while a fault/event blocks. */
struct userfaultfd_wait_queue {
	/* message read by userland via the uffd file descriptor */
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	/* set by the wake function once this waiter has been woken */
	bool waken;
};

/* Range argument for wakeups; len == 0 means "wake all". */
struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED		(1u << 31)

/* True once the UFFD_API handshake has completed on this context. */
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
	return ctx->features & UFFD_FEATURE_INITIALIZED;
}
/*
 * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
 * meaningful when userfaultfd_wp()==true on the vma and when it's
 * anonymous.
 */
bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return false;

	return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
}

/*
 * Reset the vma's flags to @flags and, for shared mappings, refresh
 * vma->vm_page_prot whenever the uffd-wp state flips.
 */
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
				     vm_flags_t flags)
{
	const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;

	vm_flags_reset(vma, flags);
	/*
	 * For shared mappings, we want to enable writenotify while
	 * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
	 * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
	 */
	if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
		vma_set_page_prot(vma);
}

/*
 * Custom wake function for uffd waitqueues: wake only waiters whose
 * faulting address falls inside the key's range, and autoremove the
 * entry on a successful wakeup.
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the other
		 * CPUs, the waitqueue may disappear from under us, see the
		 * !list_empty_careful() in handle_userfault().
		 *
		 * try_to_wake_up() has an implicit smp_mb(), and the
		 * wq->private is read before calling the extern function
		 * "wake_up_state" (which in turns calls try_to_wake_up).
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	refcount_inc(&ctx->refcount);
}
/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 *
 * On the last reference drop, all waitqueues must already be empty and
 * unlocked; the mm reference is dropped and the ctx is freed back to the
 * slab cache.
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}

static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the paddings or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}

/*
 * Build the UFFD_EVENT_PAGEFAULT message delivered to userland for a
 * fault at @address (or @real_address if UFFD_FEATURE_EXACT_ADDRESS
 * was negotiated), tagged with the fault @reason and @flags.
 */
static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned long real_address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;

	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;

	msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
				    real_address : address;

	/*
	 * These flags indicate why the userfault occurred:
	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
	 * - Neither of these flags being set indicates a MISSING fault.
	 *
	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
	 * fault. Otherwise, it was a read fault.
	 */
	if (flags & FAULT_FLAG_WRITE)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (reason & VM_UFFD_MINOR)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_fault *vmf,
					      unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	pte_t *ptep, pte;
	bool ret = true;

	assert_fault_locked(vmf);

	ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us. PTE markers should be handled the same as none
	 * ptes here.
	 */
	if (huge_pte_none_mostly(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_fault *vmf,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */
/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 struct vm_fault *vmf,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	unsigned long address = vmf->address;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pte_t ptent;
	bool ret = true;

	assert_fault_locked(vmf);

	/* Walk the page table levels; a missing level means "must wait". */
	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
again:
	_pmd = pmdp_get_lockless(pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd) || pmd_devmap(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	pte = pte_offset_map(pmd, address);
	if (!pte) {
		/* pmd changed under us (e.g. THP collapse); re-read it */
		ret = true;
		goto again;
	}
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us. PTE markers should be handled the same as none
	 * ptes here.
	 */
	ptent = ptep_get(pte);
	if (pte_none_mostly(ptent))
		ret = true;
	if (!pte_write(ptent) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}

/*
 * Map the fault flags to the task state used while blocking in
 * handle_userfault(): interruptible > killable > uninterruptible.
 */
static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return TASK_INTERRUPTIBLE;

	if (flags & FAULT_FLAG_KILLABLE)
		return TASK_KILLABLE;

	return TASK_UNINTERRUPTIBLE;
}
/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct vm_area_struct *vma = vmf->vma;
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	unsigned int blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update.
	 *
	 * We also don't do userfault handling during
	 * coredumping. hugetlbfs has the special
	 * hugetlb_follow_page_mask() to skip missing pages in the
	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
	 * the no_page_table() helper in follow_page_mask(), but the
	 * shmem_vm_ops->fault method is invoked even during
	 * coredumping and it ends up here.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	assert_fault_locked(vmf);

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	/* Any unrecognized flag is a bug. */
	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
	VM_BUG_ON(!reason || (reason & (reason - 1)));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;
	if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
		goto out;

	/*
	 * If it's already released don't get it. This avoids to loop
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a non
		 * cooperative manager can close the uffd after the
		 * last UFFDIO_COPY, without risking to trigger an
		 * involuntary SIGBUS if the process was starting the
		 * userfaultfd while the userfaultfd was still armed
		 * (but after the last UFFDIO_COPY). If the uffd
		 * wasn't already closed when the userfault reached
		 * this point, that would normally be solved by
		 * userfaultfd_must_wait returning 'false'.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the non
		 * cooperative manager would be instead forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
				reason, ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	/*
	 * Take the vma lock now, in order to safely call
	 * userfaultfd_huge_must_wait() later. Since acquiring the
	 * (sleepable) vma lock can modify the current task state, that
	 * must be before explicitly calling set_current_state().
	 */
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_lock_read(vma);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vma))
		must_wait = userfaultfd_must_wait(ctx, vmf, reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
	if (is_vm_hugetlb_page(vma))
		hugetlb_vma_unlock_read(vma);
	release_fault_lock(vmf);

	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self. list_add also
	 * would never let any of the two pointers to point to
	 * self. So list_empty_careful won't risk to see both pointers
	 * pointing to self at any time during the list refile. The
	 * only case where list_del_init() is called is the full
	 * removal in the wake function and there we don't re-list_add
	 * and it's fine not to block on the spinlock. The uwq on this
	 * kernel stack can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}

/*
 * Queue a non-cooperative event (fork/remap/unmap/...) on the event
 * waitqueue and block until userland has read it, the ctx is released,
 * or a fatal signal arrives. Drops the mmap_changing count and the ctx
 * reference taken by the caller before returning.
 */
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock_irq(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		/* msg.event is zeroed by userfaultfd_event_complete() */
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter. It would be a problem if it
			 * didn't.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock_irq(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		/*
		 * The fork event was never read by userland: strip the
		 * child's vmas of the never-delivered uffd context.
		 */
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;
		VMA_ITERATOR(vmi, mm, 0);

		/* the various vma->vm_userfaultfd_ctx still points to it */
		mmap_write_lock(mm);
		for_each_vma(vmi, vma) {
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma_start_write(vma);
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				userfaultfd_set_vm_flags(vma,
							 vma->vm_flags & ~__VM_UFFD_FLAGS);
			}
		}
		mmap_write_unlock(mm);

		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	atomic_dec(&ctx->mmap_changing);
	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
	userfaultfd_ctx_put(ctx);
}
userfaultfd_ctx_put(release_new_ctx); 66962306a36Sopenharmony_ci } 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci /* 67262306a36Sopenharmony_ci * ctx may go away after this if the userfault pseudo fd is 67362306a36Sopenharmony_ci * already released. 67462306a36Sopenharmony_ci */ 67562306a36Sopenharmony_ciout: 67662306a36Sopenharmony_ci atomic_dec(&ctx->mmap_changing); 67762306a36Sopenharmony_ci VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); 67862306a36Sopenharmony_ci userfaultfd_ctx_put(ctx); 67962306a36Sopenharmony_ci} 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_cistatic void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, 68262306a36Sopenharmony_ci struct userfaultfd_wait_queue *ewq) 68362306a36Sopenharmony_ci{ 68462306a36Sopenharmony_ci ewq->msg.event = 0; 68562306a36Sopenharmony_ci wake_up_locked(&ctx->event_wqh); 68662306a36Sopenharmony_ci __remove_wait_queue(&ctx->event_wqh, &ewq->wq); 68762306a36Sopenharmony_ci} 68862306a36Sopenharmony_ci 68962306a36Sopenharmony_ciint dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) 69062306a36Sopenharmony_ci{ 69162306a36Sopenharmony_ci struct userfaultfd_ctx *ctx = NULL, *octx; 69262306a36Sopenharmony_ci struct userfaultfd_fork_ctx *fctx; 69362306a36Sopenharmony_ci 69462306a36Sopenharmony_ci octx = vma->vm_userfaultfd_ctx.ctx; 69562306a36Sopenharmony_ci if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { 69662306a36Sopenharmony_ci vma_start_write(vma); 69762306a36Sopenharmony_ci vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 69862306a36Sopenharmony_ci userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); 69962306a36Sopenharmony_ci return 0; 70062306a36Sopenharmony_ci } 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_ci list_for_each_entry(fctx, fcs, list) 70362306a36Sopenharmony_ci if (fctx->orig == octx) { 70462306a36Sopenharmony_ci ctx = fctx->new; 70562306a36Sopenharmony_ci break; 70662306a36Sopenharmony_ci } 70762306a36Sopenharmony_ci 
70862306a36Sopenharmony_ci if (!ctx) { 70962306a36Sopenharmony_ci fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); 71062306a36Sopenharmony_ci if (!fctx) 71162306a36Sopenharmony_ci return -ENOMEM; 71262306a36Sopenharmony_ci 71362306a36Sopenharmony_ci ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); 71462306a36Sopenharmony_ci if (!ctx) { 71562306a36Sopenharmony_ci kfree(fctx); 71662306a36Sopenharmony_ci return -ENOMEM; 71762306a36Sopenharmony_ci } 71862306a36Sopenharmony_ci 71962306a36Sopenharmony_ci refcount_set(&ctx->refcount, 1); 72062306a36Sopenharmony_ci ctx->flags = octx->flags; 72162306a36Sopenharmony_ci ctx->features = octx->features; 72262306a36Sopenharmony_ci ctx->released = false; 72362306a36Sopenharmony_ci atomic_set(&ctx->mmap_changing, 0); 72462306a36Sopenharmony_ci ctx->mm = vma->vm_mm; 72562306a36Sopenharmony_ci mmgrab(ctx->mm); 72662306a36Sopenharmony_ci 72762306a36Sopenharmony_ci userfaultfd_ctx_get(octx); 72862306a36Sopenharmony_ci atomic_inc(&octx->mmap_changing); 72962306a36Sopenharmony_ci fctx->orig = octx; 73062306a36Sopenharmony_ci fctx->new = ctx; 73162306a36Sopenharmony_ci list_add_tail(&fctx->list, fcs); 73262306a36Sopenharmony_ci } 73362306a36Sopenharmony_ci 73462306a36Sopenharmony_ci vma->vm_userfaultfd_ctx.ctx = ctx; 73562306a36Sopenharmony_ci return 0; 73662306a36Sopenharmony_ci} 73762306a36Sopenharmony_ci 73862306a36Sopenharmony_cistatic void dup_fctx(struct userfaultfd_fork_ctx *fctx) 73962306a36Sopenharmony_ci{ 74062306a36Sopenharmony_ci struct userfaultfd_ctx *ctx = fctx->orig; 74162306a36Sopenharmony_ci struct userfaultfd_wait_queue ewq; 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci msg_init(&ewq.msg); 74462306a36Sopenharmony_ci 74562306a36Sopenharmony_ci ewq.msg.event = UFFD_EVENT_FORK; 74662306a36Sopenharmony_ci ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci userfaultfd_event_wait_completion(ctx, &ewq); 74962306a36Sopenharmony_ci} 

/*
 * Second half of the fork path: deliver a UFFD_EVENT_FORK for every
 * fork ctx collected by dup_userfaultfd() and free the bookkeeping.
 */
void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *fctx, *n;

	list_for_each_entry_safe(fctx, n, fcs, list) {
		dup_fctx(fctx);
		list_del(&fctx->list);
		kfree(fctx);
	}
}

/*
 * Called before an mremap() moves @vma. If the ctx wants remap events,
 * pin it in @vm_ctx (and raise mmap_changing) so the event can be sent
 * once the move completed; otherwise strip the uffd state from the vma.
 */
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *ctx;

	ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return;

	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
		vm_ctx->ctx = ctx;
		userfaultfd_ctx_get(ctx);
		atomic_inc(&ctx->mmap_changing);
	} else {
		/* Drop uffd context if remap feature not enabled */
		vma_start_write(vma);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
	}
}

/*
 * Deliver the UFFD_EVENT_REMAP message pinned by
 * mremap_userfaultfd_prep(). A non-page-aligned @to means the mremap
 * failed before moving anything: just drop the reference taken in prep.
 */
void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				 unsigned long from, unsigned long to,
				 unsigned long len)
{
	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
	struct userfaultfd_wait_queue ewq;

	if (!ctx)
		return;

	if (to & ~PAGE_MASK) {
		userfaultfd_ctx_put(ctx);
		return;
	}

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMAP;
	ewq.msg.arg.remap.from = from;
	ewq.msg.arg.remap.to = to;
	ewq.msg.arg.remap.len = len;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

/*
 * Called with mmap_lock held for read when a range of @vma is about to be
 * removed (e.g. madvise). If the ctx wants remove events, drops the
 * mmap_lock, delivers UFFD_EVENT_REMOVE and returns false to tell the
 * caller the lock was released; returns true (lock still held) otherwise.
 */
bool userfaultfd_remove(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue ewq;

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
		return true;

	userfaultfd_ctx_get(ctx);
	atomic_inc(&ctx->mmap_changing);
	mmap_read_unlock(mm);

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMOVE;
	ewq.msg.arg.remove.start = start;
	ewq.msg.arg.remove.end = end;

	userfaultfd_event_wait_completion(ctx, &ewq);

	return false;
}

/* True if an identical (ctx, start, end) unmap event is already queued. */
static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
			  unsigned long start, unsigned long end)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;

	list_for_each_entry(unmap_ctx, unmaps, list)
		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
		    unmap_ctx->end == end)
			return true;

	return false;
}

/*
 * Record a pending UFFD_EVENT_UNMAP for [start, end) on @unmaps (the
 * event itself is delivered later by userfaultfd_unmap_complete(), after
 * the unmap). Pins the ctx and raises mmap_changing. Returns 0 or
 * -ENOMEM.
 */
int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
			   unsigned long end, struct list_head *unmaps)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;
	struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
	    has_unmap_ctx(ctx, unmaps, start, end))
		return 0;

	unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
	if (!unmap_ctx)
		return -ENOMEM;

	userfaultfd_ctx_get(ctx);
	atomic_inc(&ctx->mmap_changing);
	unmap_ctx->ctx = ctx;
	unmap_ctx->start = start;
	unmap_ctx->end = end;
	list_add_tail(&unmap_ctx->list, unmaps);

	return 0;
}

/* Deliver and free all unmap events queued by userfaultfd_unmap_prep(). */
void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
	struct userfaultfd_unmap_ctx *ctx, *n;
	struct userfaultfd_wait_queue ewq;

	list_for_each_entry_safe(ctx, n, uf, list) {
		msg_init(&ewq.msg);

		ewq.msg.event = UFFD_EVENT_UNMAP;
		ewq.msg.arg.remove.start = ctx->start;
		ewq.msg.arg.remove.end = ctx->end;

		userfaultfd_event_wait_completion(ctx->ctx, &ewq);

		list_del(&ctx->list);
		kfree(ctx);
	}
}

/*
 * ->release() of the userfaultfd file: detach the ctx from every vma of
 * the mm, then wake everything still waiting (faults, events, pollers)
 * so no sleeper is left behind, and drop the file's ctx reference.
 */
static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;
	VMA_ITERATOR(vmi, mm, 0);

	WRITE_ONCE(ctx->released, true);

	if (!mmget_not_zero(mm))
		goto wakeup;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for_each_vma(vmi, vma) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		/* Try to merge the now-uffd-free vma with its neighbours. */
		prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end,
				 new_flags, vma->anon_vma,
				 vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
		if (prev) {
			vma = prev;
		} else {
			prev = vma;
		}

		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	mmap_write_unlock(mm);
	mmput(mm);
wakeup:
	/*
	 * After no new page faults can wait on this fault_*wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_*wqh.
	 */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/* Flush pending events that may still wait on event_wqh */
	wake_up_all(&ctx->event_wqh);

	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_pending_wqh.lock must be hold by the caller */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;
	struct userfaultfd_wait_queue *uwq;

	lockdep_assert_held(&wqh->lock);

	uwq = NULL;
	if (!waitqueue_active(wqh))
		goto out;
	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
	return uwq;
}

/* Oldest pending page-fault userfault, or NULL. Caller holds the lock. */
static inline struct userfaultfd_wait_queue *find_userfault(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->fault_pending_wqh);
}

/* Oldest pending non-fault event, or NULL. Caller holds the lock. */
static inline struct userfaultfd_wait_queue *find_userfault_evt(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->event_wqh);
}

/*
 * ->poll() of the userfaultfd file. Reports EPOLLIN when a fault or event
 * is queued. Only meaningful on a non-blocking, initialized uffd;
 * otherwise EPOLLERR.
 */
static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	__poll_t ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	if (!userfaultfd_is_initialized(ctx))
		return EPOLLERR;

	/*
	 * poll() never guarantees that read won't block.
	 * userfaults can be waken before they're read().
	 */
	if (unlikely(!(file->f_flags & O_NONBLOCK)))
		return EPOLLERR;
	/*
	 * lockless access to see if there are pending faults
	 * __pollwait last action is the add_wait_queue but
	 * the spin_unlock would allow the waitqueue_active to
	 * pass above the actual list_add inside
	 * add_wait_queue critical section. So use a full
	 * memory barrier to serialize the list_add write of
	 * add_wait_queue() with the waitqueue_active read
	 * below.
	 */
	ret = 0;
	smp_mb();
	if (waitqueue_active(&ctx->fault_pending_wqh))
		ret = EPOLLIN;
	else if (waitqueue_active(&ctx->event_wqh))
		ret = EPOLLIN;

	return ret;
}

static const struct file_operations userfaultfd_fops;

/*
 * Install a new file descriptor for the child ctx carried by a
 * UFFD_EVENT_FORK message, and rewrite the message so userspace sees the
 * fd instead of the kernel pointer. Returns 0 or a negative errno.
 */
static int resolve_userfault_fork(struct userfaultfd_ctx *new,
				  struct inode *inode,
				  struct uffd_msg *msg)
{
	int fd;

	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
	if (fd < 0)
		return fd;

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = fd;
	return 0;
}

/*
 * Fetch one message into *msg: pending page faults take priority over
 * non-fault events; otherwise sleep (unless @no_wait) until one arrives
 * or a signal is pending. Returns 0 on success, -EAGAIN or -ERESTARTSYS.
 */
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg, struct inode *inode)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
			 * wakeups because during the refile both
			 * waitqueue could become empty if this is the
			 * only userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * The fault_pending_wqh.lock prevents the uwq
			 * to disappear from under us.
			 *
			 * Refile this userfault from
			 * fault_pending_wqh to fault_wqh, it's not
			 * pending anymore after we read it.
			 *
			 * Use list_del() by hand (as
			 * userfaultfd_wake_function also uses
			 * list_del_init() by hand) to be sure nobody
			 * changes __remove_wait_queue() to use
			 * list_del_init() in turn breaking the
			 * !list_empty_careful() check in
			 * handle_userfault(). The uwq->wq.head list
			 * must never be empty at any time during the
			 * refile, or the waitqueue could disappear
			 * from under us. The "wait_queue_head_t"
			 * parameter of __remove_wait_queue() is unused
			 * anyway.
			 */
			list_del(&uwq->wq.entry);
			add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(fork_nctx, inode, msg);
		spin_lock_irq(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we've to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock_irq(&ctx->event_wqh.lock);
	}

	return ret;
}

/*
 * ->read() of the userfaultfd file: copy out as many whole uffd_msg
 * structures as fit in @buf, blocking (unless O_NONBLOCK) only for the
 * first one. Returns bytes copied, or a negative errno if nothing was.
 */
static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;
	struct inode *inode = file_inode(file);

	if (!userfaultfd_is_initialized(ctx))
		return -EINVAL;

	for (;;) {
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow to read more than one fault at time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}

/* Wake faulting threads in @range on both fault waitqueues. */
static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

/*
 * Lockless fast path for __wake_userfault(): skip taking the wqh locks
 * when no waiter exists, re-checking under ctx->refile_seq so a waiter
 * being refiled between the two queues is never missed.
 */
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or mmap_read_unlock(mm) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we've userfaults to wake.
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}

/*
 * Validate a user-supplied [start, start+len) range against the mm:
 * page-aligned non-zero length, above mmap_min_addr, inside the task
 * size, no wraparound. @start itself need not be aligned here.
 */
static __always_inline int validate_unaligned_range(
	struct mm_struct *mm, __u64 start, __u64 len)
{
	__u64 task_size = mm->task_size;

	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (start < mmap_min_addr)
		return -EINVAL;
	if (start >= task_size)
		return -EINVAL;
	if (len > task_size - start)
		return -EINVAL;
	if (start + len <= start)
		return -EINVAL;
	return 0;
}

/* As validate_unaligned_range(), but @start must also be page-aligned. */
static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	if (start & ~PAGE_MASK)
		return -EINVAL;

	return validate_unaligned_range(mm, start, len);
}

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;
	struct vma_iterator vmi;
	pgoff_t pgoff;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode &
	    ~UFFD_API_REGISTER_MODES)
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
		/* WP mode requires architecture support. */
		goto out;
#endif
		vm_flags |= VM_UFFD_WP;
	}
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		/* MINOR mode requires architecture support. */
		goto out;
#endif
		vm_flags |= VM_UFFD_MINOR;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	ret = -EINVAL;
	mmap_write_lock(mm);
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.  First pass: validate only, so
	 * the range is either registered in full or not at all.
	 */
	found = false;
	basic_ioctls = false;
	cur = vma;
	do {
		cond_resched();

		/* A uffd context and the uffd vm_flags must come in pairs. */
		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/* check not compatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED, the process has write permission to the backing
		 * file. If VM_MAYWRITE is set it also enforces that on a
		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
		 * F_WRITE_SEAL can be taken until the vma is destroyed.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains ending address, and huge pages
		 * check alignment.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	/* Second pass: actually register each vma in the range. */
	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }),
				 anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the mas */
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(&vmi, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(&vmi, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx.ctx = ctx;

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctls methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}

/*
 * UFFDIO_UNREGISTER: detach userfaultfd tracking from every vma in the
 * requested range, waking any userfaults still pending on it.
 */
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;
	struct vma_iterator vmi;
	pgoff_t pgoff;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	ret = -EINVAL;
	vma_iter_init(&vmi, mm, start);
	vma = vma_find(&vmi, end);
	if (!vma)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for not compatible vmas.
	 */
	found = false;
	cur = vma;
	do {
		cond_resched();

		/* A uffd context and the uffd vm_flags must come in pairs. */
		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/*
		 * Check not compatible vmas, not strictly required
		 * here as not compatible vmas cannot have an
		 * userfaultfd_ctx registered on them, but this
		 * provides for more strict behavior to notice
		 * unregistration errors.
		 */
		if (!vma_can_userfault(cur, cur->vm_flags))
			goto out_unlock;

		found = true;
	} for_each_vma_range(vmi, cur, end);
	BUG_ON(!found);

	vma_iter_set(&vmi, start);
	prev = vma_prev(&vmi);
	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	for_each_vma_range(vmi, vma, end) {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));

		/*
		 * Nothing to do: this vma is not registered with any
		 * userfaultfd, so there is nothing to unregister on it.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and it avoids userland to call
			 * UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		/* Reset ptes for the whole vma range if wr-protected */
		if (userfaultfd_wp(vma))
			uffd_wp_range(vma, start, vma_end - start, false);

		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
		prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX, anon_vma_name(vma));
		if (prev) {
			/* vma_merge() invalidated the mas */
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(&vmi, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(&vmi, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma_start_write(vma);
		userfaultfd_set_vm_flags(vma, new_flags);
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
	}

out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
out:
	return ret;
}

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all
	 * here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}

/*
 * UFFDIO_COPY: atomically copy @len bytes from the @src address into the
 * registered @dst range, then wake the faulting threads unless
 * UFFDIO_COPY_MODE_DONTWAKE was requested.  The number of bytes actually
 * copied is reported back through uffdio_copy.copy.
 */
static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;
	uffd_flags_t flags = 0;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;

	/* The address space is being torn down or remapped: retry later. */
	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out;

	/* @src need not be page aligned, only the length is checked. */
	ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
				       uffdio_copy.len);
	if (ret)
		goto out;
	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	if (uffdio_copy.mode &
	    ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
		goto out;
	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
		flags |= MFILL_ATOMIC_WP;
	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
					uffdio_copy.len, &ctx->mmap_changing,
					flags);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	/* A short copy means the caller must retry the remainder. */
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}

/*
 * UFFDIO_ZEROPAGE: atomically map zero pages over the registered range,
 * then wake the faulting threads unless MODE_DONTWAKE was requested.
 */
static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

	/* The address space is being torn down or remapped: retry later. */
	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
					    uffdio_zeropage.range.len,
					    &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return
-ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	/* A short fill means the caller must retry the remainder. */
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
}

/*
 * UFFDIO_WRITEPROTECT: write-protect (mode WP set) or un-write-protect
 * (mode WP clear) the registered range.  Un-protecting also wakes the
 * waiters unless MODE_DONTWAKE is set; WP + DONTWAKE is rejected since
 * there is nothing to wake when protecting.
 */
static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
				    unsigned long arg)
{
	int ret;
	struct uffdio_writeprotect uffdio_wp;
	struct uffdio_writeprotect __user *user_uffdio_wp;
	struct userfaultfd_wake_range range;
	bool mode_wp, mode_dontwake;

	/* The address space is being torn down or remapped: retry later. */
	if (atomic_read(&ctx->mmap_changing))
		return -EAGAIN;

	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
			   sizeof(struct uffdio_writeprotect)))
		return -EFAULT;

	ret = validate_range(ctx->mm, uffdio_wp.range.start,
			     uffdio_wp.range.len);
	if (ret)
		return ret;

	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
			       UFFDIO_WRITEPROTECT_MODE_WP))
		return -EINVAL;

	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

	if (mode_wp && mode_dontwake)
		return -EINVAL;

	if (mmget_not_zero(ctx->mm)) {
		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
					  uffdio_wp.range.len, mode_wp,
					  &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (ret)
		return ret;

	if (!mode_wp && !mode_dontwake) {
		range.start = uffdio_wp.range.start;
		range.len = uffdio_wp.range.len;
		wake_userfault(ctx, &range);
	}
	return ret;
}

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_continue uffdio_continue;
	struct uffdio_continue __user *user_uffdio_continue;
	struct userfaultfd_wake_range range;
	uffd_flags_t flags = 0;

	user_uffdio_continue = (struct uffdio_continue __user *)arg;

	ret = -EAGAIN;
	/* The address space is being changed elsewhere; tell userspace to retry. */
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
			   /* don't copy the output fields */
			   sizeof(uffdio_continue) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_continue.range.start,
			     uffdio_continue.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	/* Reject unknown mode bits. */
	if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
				     UFFDIO_CONTINUE_MODE_WP))
		goto out;
	if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
		flags |= MFILL_ATOMIC_WP;

	/* Pin the mm so it cannot be freed while we operate on it. */
	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
					    uffdio_continue.range.len,
					    &ctx->mmap_changing, flags);
		mmput(ctx->mm);
	} else {
		/* The owning process already exited. */
		return -ESRCH;
	}

	/* Report bytes mapped (or the negative error) back to userspace. */
	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
		range.start = uffdio_continue.range.start;
		wake_userfault(ctx, &range);
	}
	/* Partial completion: ask userspace to retry the remainder. */
	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
	return ret;
}

/*
 * UFFDIO_POISON ioctl: apply mfill_atomic_poison() over the given range.
 * Resolves faults in the range by installing poison, so subsequent
 * accesses fail (behavior of the installed entries is defined by
 * mfill_atomic_poison(); see the userfaultfd documentation).
 */
static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_poison uffdio_poison;
	struct uffdio_poison __user *user_uffdio_poison;
	struct userfaultfd_wake_range range;

	user_uffdio_poison = (struct uffdio_poison __user *)arg;

	ret = -EAGAIN;
	/* The address space is being changed elsewhere; tell userspace to retry. */
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_poison, user_uffdio_poison,
			   /* don't copy the output fields */
			   sizeof(uffdio_poison) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_poison.range.start,
			     uffdio_poison.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	/* DONTWAKE is the only valid mode bit for UFFDIO_POISON. */
	if (uffdio_poison.mode &
~UFFDIO_POISON_MODE_DONTWAKE)
		goto out;

	/* Pin the mm so it cannot be freed while we operate on it. */
	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
					  uffdio_poison.range.len,
					  &ctx->mmap_changing, 0);
		mmput(ctx->mm);
	} else {
		/* The owning process already exited. */
		return -ESRCH;
	}

	/* Report bytes handled (or the negative error) back to userspace. */
	if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
		range.start = uffdio_poison.range.start;
		wake_userfault(ctx, &range);
	}
	/* Partial completion: ask userspace to retry the remainder. */
	ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
	return ret;
}

/*
 * Translate the feature bits userspace requested into the value stored
 * in ctx->features.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	/* UFFD_FEATURE_EVENT_FORK is gated behind CAP_SYS_PTRACE. */
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out;
	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
	/* Minor-fault handling requires architecture support. */
	uffdio_api.features &=
		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
	/* Write-protect mode requires architecture support. */
	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
	/* Without PTE markers these uffd-wp extensions are unavailable. */
	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
#endif
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;

	/* only enable the requested features for this uffd context */
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	/* The cmpxchg also guarantees UFFDIO_API succeeds at most once. */
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
err_out:
	/* On error, hand a zeroed uffdio_api back to userspace. */
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out;
}

/*
 * Dispatch the userfaultfd file ioctls.  Every command except UFFDIO_API
 * requires the context to have been initialized by a prior UFFDIO_API.
 */
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	case UFFDIO_POISON:
		ret = userfaultfd_poison(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
/*
 * /proc/<pid>/fdinfo output: counts of pending and total queued faults
 * plus the API version, enabled features and supported ioctls.
 */
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	/* fault_wqh is walked under fault_pending_wqh.lock here. */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols will be added, there will be all shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

/* File operations of a userfaultfd file descriptor. */
static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

/*
 * Slab constructor (passed to kmem_cache_create() below): initializes the
 * waitqueue heads and the refile seqcount once per slab object.
 */
static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

/*
 * Allocate a userfaultfd context bound to current->mm and return a file
 * descriptor for it.  Returns the fd on success, a negative error code
 * on failure.  Shared by the syscall and the /dev/userfaultfd paths.
 */
static int new_userfaultfd(int flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency.  */
	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->released = false;
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;
	/* prevent the mm struct to be freed */
	mmgrab(ctx->mm);

	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
	if (fd < 0) {
		/* Undo the mmgrab() and release the context on failure. */
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

/*
 * Decide whether the calling task may create a userfaultfd with the
 * given flags; handling kernel-originated faults is restricted.
 */
static inline bool userfaultfd_syscall_allowed(int flags)
{
	/* Userspace-only page faults are always allowed */
	if (flags & UFFD_USER_MODE_ONLY)
		return true;

	/*
	 * The user is requesting a userfaultfd which can handle kernel faults.
	 * Privileged users are always allowed to do this.
	 */
	if (capable(CAP_SYS_PTRACE))
		return true;

	/* Otherwise, access to kernel fault handling is sysctl controlled. */
	return sysctl_unprivileged_userfaultfd;
}

/* userfaultfd(2) syscall entry point. */
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	if (!userfaultfd_syscall_allowed(flags))
		return -EPERM;

	return new_userfaultfd(flags);
}

/*
 * /dev/userfaultfd ioctl: USERFAULTFD_IOC_NEW creates a new userfaultfd.
 * Note there is no userfaultfd_syscall_allowed() check on this path —
 * presumably access control is left to the device node's permissions.
 */
static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
	if (cmd != USERFAULTFD_IOC_NEW)
		return -EINVAL;

	return new_userfaultfd(flags);
}

static const struct file_operations userfaultfd_dev_fops = {
	.unlocked_ioctl = userfaultfd_dev_ioctl,
	.compat_ioctl = userfaultfd_dev_ioctl,
	.owner = THIS_MODULE,
	.llseek = noop_llseek,
};

/* The /dev/userfaultfd misc character device. */
static struct miscdevice userfaultfd_misc = {
	.minor = MISC_DYNAMIC_MINOR,
	.name = "userfaultfd",
	.fops = &userfaultfd_dev_fops
};

/*
 * Boot-time init: register the misc device, create the context slab
 * cache and (if CONFIG_SYSCTL) register vm.unprivileged_userfaultfd.
 */
static int __init userfaultfd_init(void)
{
	int ret;

	ret = misc_register(&userfaultfd_misc);
	if (ret)
		return ret;

	/* SLAB_PANIC: allocation failure panics, so no error check here. */
	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
	register_sysctl_init("vm", vm_userfaultfd_table);
#endif
	return 0;
}
__initcall(userfaultfd_init);