18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * fs/userfaultfd.c 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> 68c2ecf20Sopenharmony_ci * Copyright (C) 2008-2009 Red Hat, Inc. 78c2ecf20Sopenharmony_ci * Copyright (C) 2015 Red Hat, Inc. 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * Some part derived from fs/eventfd.c (anon inode setup) and 108c2ecf20Sopenharmony_ci * mm/ksm.c (mm hashing). 118c2ecf20Sopenharmony_ci */ 128c2ecf20Sopenharmony_ci 138c2ecf20Sopenharmony_ci#include <linux/list.h> 148c2ecf20Sopenharmony_ci#include <linux/hashtable.h> 158c2ecf20Sopenharmony_ci#include <linux/sched/signal.h> 168c2ecf20Sopenharmony_ci#include <linux/sched/mm.h> 178c2ecf20Sopenharmony_ci#include <linux/mm.h> 188c2ecf20Sopenharmony_ci#include <linux/mm_inline.h> 198c2ecf20Sopenharmony_ci#include <linux/poll.h> 208c2ecf20Sopenharmony_ci#include <linux/slab.h> 218c2ecf20Sopenharmony_ci#include <linux/seq_file.h> 228c2ecf20Sopenharmony_ci#include <linux/file.h> 238c2ecf20Sopenharmony_ci#include <linux/bug.h> 248c2ecf20Sopenharmony_ci#include <linux/anon_inodes.h> 258c2ecf20Sopenharmony_ci#include <linux/syscalls.h> 268c2ecf20Sopenharmony_ci#include <linux/userfaultfd_k.h> 278c2ecf20Sopenharmony_ci#include <linux/mempolicy.h> 288c2ecf20Sopenharmony_ci#include <linux/ioctl.h> 298c2ecf20Sopenharmony_ci#include <linux/security.h> 308c2ecf20Sopenharmony_ci#include <linux/hugetlb.h> 318c2ecf20Sopenharmony_ci 328c2ecf20Sopenharmony_ciint sysctl_unprivileged_userfaultfd __read_mostly = 1; 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_cistatic struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; 358c2ecf20Sopenharmony_ci 368c2ecf20Sopenharmony_ci/* 378c2ecf20Sopenharmony_ci * Start with fault_pending_wqh and fault_wqh so they're more likely 388c2ecf20Sopenharmony_ci * to be in the same cacheline. 
398c2ecf20Sopenharmony_ci * 408c2ecf20Sopenharmony_ci * Locking order: 418c2ecf20Sopenharmony_ci * fd_wqh.lock 428c2ecf20Sopenharmony_ci * fault_pending_wqh.lock 438c2ecf20Sopenharmony_ci * fault_wqh.lock 448c2ecf20Sopenharmony_ci * event_wqh.lock 458c2ecf20Sopenharmony_ci * 468c2ecf20Sopenharmony_ci * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, 478c2ecf20Sopenharmony_ci * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's 488c2ecf20Sopenharmony_ci * also taken in IRQ context. 498c2ecf20Sopenharmony_ci */ 508c2ecf20Sopenharmony_cistruct userfaultfd_ctx { 518c2ecf20Sopenharmony_ci /* waitqueue head for the pending (i.e. not read) userfaults */ 528c2ecf20Sopenharmony_ci wait_queue_head_t fault_pending_wqh; 538c2ecf20Sopenharmony_ci /* waitqueue head for the userfaults */ 548c2ecf20Sopenharmony_ci wait_queue_head_t fault_wqh; 558c2ecf20Sopenharmony_ci /* waitqueue head for the pseudo fd to wakeup poll/read */ 568c2ecf20Sopenharmony_ci wait_queue_head_t fd_wqh; 578c2ecf20Sopenharmony_ci /* waitqueue head for events */ 588c2ecf20Sopenharmony_ci wait_queue_head_t event_wqh; 598c2ecf20Sopenharmony_ci /* a refile sequence protected by fault_pending_wqh lock */ 608c2ecf20Sopenharmony_ci seqcount_spinlock_t refile_seq; 618c2ecf20Sopenharmony_ci /* pseudo fd refcounting */ 628c2ecf20Sopenharmony_ci refcount_t refcount; 638c2ecf20Sopenharmony_ci /* userfaultfd syscall flags */ 648c2ecf20Sopenharmony_ci unsigned int flags; 658c2ecf20Sopenharmony_ci /* features requested from the userspace */ 668c2ecf20Sopenharmony_ci unsigned int features; 678c2ecf20Sopenharmony_ci /* released */ 688c2ecf20Sopenharmony_ci bool released; 698c2ecf20Sopenharmony_ci /* memory mappings are changing because of non-cooperative event */ 708c2ecf20Sopenharmony_ci bool mmap_changing; 718c2ecf20Sopenharmony_ci /* mm with one ore more vmas attached to this userfaultfd_ctx */ 728c2ecf20Sopenharmony_ci struct mm_struct *mm; 738c2ecf20Sopenharmony_ci}; 
748c2ecf20Sopenharmony_ci 758c2ecf20Sopenharmony_cistruct userfaultfd_fork_ctx { 768c2ecf20Sopenharmony_ci struct userfaultfd_ctx *orig; 778c2ecf20Sopenharmony_ci struct userfaultfd_ctx *new; 788c2ecf20Sopenharmony_ci struct list_head list; 798c2ecf20Sopenharmony_ci}; 808c2ecf20Sopenharmony_ci 818c2ecf20Sopenharmony_cistruct userfaultfd_unmap_ctx { 828c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx; 838c2ecf20Sopenharmony_ci unsigned long start; 848c2ecf20Sopenharmony_ci unsigned long end; 858c2ecf20Sopenharmony_ci struct list_head list; 868c2ecf20Sopenharmony_ci}; 878c2ecf20Sopenharmony_ci 888c2ecf20Sopenharmony_cistruct userfaultfd_wait_queue { 898c2ecf20Sopenharmony_ci struct uffd_msg msg; 908c2ecf20Sopenharmony_ci wait_queue_entry_t wq; 918c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx; 928c2ecf20Sopenharmony_ci bool waken; 938c2ecf20Sopenharmony_ci}; 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_cistruct userfaultfd_wake_range { 968c2ecf20Sopenharmony_ci unsigned long start; 978c2ecf20Sopenharmony_ci unsigned long len; 988c2ecf20Sopenharmony_ci}; 998c2ecf20Sopenharmony_ci 1008c2ecf20Sopenharmony_ci/* internal indication that UFFD_API ioctl was successfully executed */ 1018c2ecf20Sopenharmony_ci#define UFFD_FEATURE_INITIALIZED (1u << 31) 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_cistatic bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) 1048c2ecf20Sopenharmony_ci{ 1058c2ecf20Sopenharmony_ci return ctx->features & UFFD_FEATURE_INITIALIZED; 1068c2ecf20Sopenharmony_ci} 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_cistatic int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, 1098c2ecf20Sopenharmony_ci int wake_flags, void *key) 1108c2ecf20Sopenharmony_ci{ 1118c2ecf20Sopenharmony_ci struct userfaultfd_wake_range *range = key; 1128c2ecf20Sopenharmony_ci int ret; 1138c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue *uwq; 1148c2ecf20Sopenharmony_ci unsigned long start, len; 1158c2ecf20Sopenharmony_ci 
1168c2ecf20Sopenharmony_ci uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 1178c2ecf20Sopenharmony_ci ret = 0; 1188c2ecf20Sopenharmony_ci /* len == 0 means wake all */ 1198c2ecf20Sopenharmony_ci start = range->start; 1208c2ecf20Sopenharmony_ci len = range->len; 1218c2ecf20Sopenharmony_ci if (len && (start > uwq->msg.arg.pagefault.address || 1228c2ecf20Sopenharmony_ci start + len <= uwq->msg.arg.pagefault.address)) 1238c2ecf20Sopenharmony_ci goto out; 1248c2ecf20Sopenharmony_ci WRITE_ONCE(uwq->waken, true); 1258c2ecf20Sopenharmony_ci /* 1268c2ecf20Sopenharmony_ci * The Program-Order guarantees provided by the scheduler 1278c2ecf20Sopenharmony_ci * ensure uwq->waken is visible before the task is woken. 1288c2ecf20Sopenharmony_ci */ 1298c2ecf20Sopenharmony_ci ret = wake_up_state(wq->private, mode); 1308c2ecf20Sopenharmony_ci if (ret) { 1318c2ecf20Sopenharmony_ci /* 1328c2ecf20Sopenharmony_ci * Wake only once, autoremove behavior. 1338c2ecf20Sopenharmony_ci * 1348c2ecf20Sopenharmony_ci * After the effect of list_del_init is visible to the other 1358c2ecf20Sopenharmony_ci * CPUs, the waitqueue may disappear from under us, see the 1368c2ecf20Sopenharmony_ci * !list_empty_careful() in handle_userfault(). 1378c2ecf20Sopenharmony_ci * 1388c2ecf20Sopenharmony_ci * try_to_wake_up() has an implicit smp_mb(), and the 1398c2ecf20Sopenharmony_ci * wq->private is read before calling the extern function 1408c2ecf20Sopenharmony_ci * "wake_up_state" (which in turns calls try_to_wake_up). 1418c2ecf20Sopenharmony_ci */ 1428c2ecf20Sopenharmony_ci list_del_init(&wq->entry); 1438c2ecf20Sopenharmony_ci } 1448c2ecf20Sopenharmony_ciout: 1458c2ecf20Sopenharmony_ci return ret; 1468c2ecf20Sopenharmony_ci} 1478c2ecf20Sopenharmony_ci 1488c2ecf20Sopenharmony_ci/** 1498c2ecf20Sopenharmony_ci * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd 1508c2ecf20Sopenharmony_ci * context. 1518c2ecf20Sopenharmony_ci * @ctx: [in] Pointer to the userfaultfd context. 
1528c2ecf20Sopenharmony_ci */ 1538c2ecf20Sopenharmony_cistatic void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) 1548c2ecf20Sopenharmony_ci{ 1558c2ecf20Sopenharmony_ci refcount_inc(&ctx->refcount); 1568c2ecf20Sopenharmony_ci} 1578c2ecf20Sopenharmony_ci 1588c2ecf20Sopenharmony_ci/** 1598c2ecf20Sopenharmony_ci * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd 1608c2ecf20Sopenharmony_ci * context. 1618c2ecf20Sopenharmony_ci * @ctx: [in] Pointer to userfaultfd context. 1628c2ecf20Sopenharmony_ci * 1638c2ecf20Sopenharmony_ci * The userfaultfd context reference must have been previously acquired either 1648c2ecf20Sopenharmony_ci * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). 1658c2ecf20Sopenharmony_ci */ 1668c2ecf20Sopenharmony_cistatic void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) 1678c2ecf20Sopenharmony_ci{ 1688c2ecf20Sopenharmony_ci if (refcount_dec_and_test(&ctx->refcount)) { 1698c2ecf20Sopenharmony_ci VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); 1708c2ecf20Sopenharmony_ci VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); 1718c2ecf20Sopenharmony_ci VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); 1728c2ecf20Sopenharmony_ci VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); 1738c2ecf20Sopenharmony_ci VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock)); 1748c2ecf20Sopenharmony_ci VM_BUG_ON(waitqueue_active(&ctx->event_wqh)); 1758c2ecf20Sopenharmony_ci VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); 1768c2ecf20Sopenharmony_ci VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); 1778c2ecf20Sopenharmony_ci mmdrop(ctx->mm); 1788c2ecf20Sopenharmony_ci kmem_cache_free(userfaultfd_ctx_cachep, ctx); 1798c2ecf20Sopenharmony_ci } 1808c2ecf20Sopenharmony_ci} 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_cistatic inline void msg_init(struct uffd_msg *msg) 1838c2ecf20Sopenharmony_ci{ 1848c2ecf20Sopenharmony_ci BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); 1858c2ecf20Sopenharmony_ci /* 1868c2ecf20Sopenharmony_ci * Must use memset to zero 
out the paddings or kernel data is 1878c2ecf20Sopenharmony_ci * leaked to userland. 1888c2ecf20Sopenharmony_ci */ 1898c2ecf20Sopenharmony_ci memset(msg, 0, sizeof(struct uffd_msg)); 1908c2ecf20Sopenharmony_ci} 1918c2ecf20Sopenharmony_ci 1928c2ecf20Sopenharmony_cistatic inline struct uffd_msg userfault_msg(unsigned long address, 1938c2ecf20Sopenharmony_ci unsigned int flags, 1948c2ecf20Sopenharmony_ci unsigned long reason, 1958c2ecf20Sopenharmony_ci unsigned int features) 1968c2ecf20Sopenharmony_ci{ 1978c2ecf20Sopenharmony_ci struct uffd_msg msg; 1988c2ecf20Sopenharmony_ci msg_init(&msg); 1998c2ecf20Sopenharmony_ci msg.event = UFFD_EVENT_PAGEFAULT; 2008c2ecf20Sopenharmony_ci msg.arg.pagefault.address = address; 2018c2ecf20Sopenharmony_ci if (flags & FAULT_FLAG_WRITE) 2028c2ecf20Sopenharmony_ci /* 2038c2ecf20Sopenharmony_ci * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the 2048c2ecf20Sopenharmony_ci * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE 2058c2ecf20Sopenharmony_ci * was not set in a UFFD_EVENT_PAGEFAULT, it means it 2068c2ecf20Sopenharmony_ci * was a read fault, otherwise if set it means it's 2078c2ecf20Sopenharmony_ci * a write fault. 2088c2ecf20Sopenharmony_ci */ 2098c2ecf20Sopenharmony_ci msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; 2108c2ecf20Sopenharmony_ci if (reason & VM_UFFD_WP) 2118c2ecf20Sopenharmony_ci /* 2128c2ecf20Sopenharmony_ci * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the 2138c2ecf20Sopenharmony_ci * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was 2148c2ecf20Sopenharmony_ci * not set in a UFFD_EVENT_PAGEFAULT, it means it was 2158c2ecf20Sopenharmony_ci * a missing fault, otherwise if set it means it's a 2168c2ecf20Sopenharmony_ci * write protect fault. 
2178c2ecf20Sopenharmony_ci */ 2188c2ecf20Sopenharmony_ci msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; 2198c2ecf20Sopenharmony_ci if (features & UFFD_FEATURE_THREAD_ID) 2208c2ecf20Sopenharmony_ci msg.arg.pagefault.feat.ptid = task_pid_vnr(current); 2218c2ecf20Sopenharmony_ci return msg; 2228c2ecf20Sopenharmony_ci} 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_ci#ifdef CONFIG_HUGETLB_PAGE 2258c2ecf20Sopenharmony_ci/* 2268c2ecf20Sopenharmony_ci * Same functionality as userfaultfd_must_wait below with modifications for 2278c2ecf20Sopenharmony_ci * hugepmd ranges. 2288c2ecf20Sopenharmony_ci */ 2298c2ecf20Sopenharmony_cistatic inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, 2308c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 2318c2ecf20Sopenharmony_ci unsigned long address, 2328c2ecf20Sopenharmony_ci unsigned long flags, 2338c2ecf20Sopenharmony_ci unsigned long reason) 2348c2ecf20Sopenharmony_ci{ 2358c2ecf20Sopenharmony_ci struct mm_struct *mm = ctx->mm; 2368c2ecf20Sopenharmony_ci pte_t *ptep, pte; 2378c2ecf20Sopenharmony_ci bool ret = true; 2388c2ecf20Sopenharmony_ci 2398c2ecf20Sopenharmony_ci mmap_assert_locked(mm); 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); 2428c2ecf20Sopenharmony_ci 2438c2ecf20Sopenharmony_ci if (!ptep) 2448c2ecf20Sopenharmony_ci goto out; 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci ret = false; 2478c2ecf20Sopenharmony_ci pte = huge_ptep_get(ptep); 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_ci /* 2508c2ecf20Sopenharmony_ci * Lockless access: we're in a wait_event so it's ok if it 2518c2ecf20Sopenharmony_ci * changes under us. 
2528c2ecf20Sopenharmony_ci */ 2538c2ecf20Sopenharmony_ci if (huge_pte_none(pte)) 2548c2ecf20Sopenharmony_ci ret = true; 2558c2ecf20Sopenharmony_ci if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) 2568c2ecf20Sopenharmony_ci ret = true; 2578c2ecf20Sopenharmony_ciout: 2588c2ecf20Sopenharmony_ci return ret; 2598c2ecf20Sopenharmony_ci} 2608c2ecf20Sopenharmony_ci#else 2618c2ecf20Sopenharmony_cistatic inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, 2628c2ecf20Sopenharmony_ci struct vm_area_struct *vma, 2638c2ecf20Sopenharmony_ci unsigned long address, 2648c2ecf20Sopenharmony_ci unsigned long flags, 2658c2ecf20Sopenharmony_ci unsigned long reason) 2668c2ecf20Sopenharmony_ci{ 2678c2ecf20Sopenharmony_ci return false; /* should never get here */ 2688c2ecf20Sopenharmony_ci} 2698c2ecf20Sopenharmony_ci#endif /* CONFIG_HUGETLB_PAGE */ 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci/* 2728c2ecf20Sopenharmony_ci * Verify the pagetables are still not ok after having reigstered into 2738c2ecf20Sopenharmony_ci * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any 2748c2ecf20Sopenharmony_ci * userfault that has already been resolved, if userfaultfd_read and 2758c2ecf20Sopenharmony_ci * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different 2768c2ecf20Sopenharmony_ci * threads. 
2778c2ecf20Sopenharmony_ci */ 2788c2ecf20Sopenharmony_cistatic inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, 2798c2ecf20Sopenharmony_ci unsigned long address, 2808c2ecf20Sopenharmony_ci unsigned long flags, 2818c2ecf20Sopenharmony_ci unsigned long reason) 2828c2ecf20Sopenharmony_ci{ 2838c2ecf20Sopenharmony_ci struct mm_struct *mm = ctx->mm; 2848c2ecf20Sopenharmony_ci pgd_t *pgd; 2858c2ecf20Sopenharmony_ci p4d_t *p4d; 2868c2ecf20Sopenharmony_ci pud_t *pud; 2878c2ecf20Sopenharmony_ci pmd_t *pmd, _pmd; 2888c2ecf20Sopenharmony_ci pte_t *pte; 2898c2ecf20Sopenharmony_ci bool ret = true; 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci mmap_assert_locked(mm); 2928c2ecf20Sopenharmony_ci 2938c2ecf20Sopenharmony_ci pgd = pgd_offset(mm, address); 2948c2ecf20Sopenharmony_ci if (!pgd_present(*pgd)) 2958c2ecf20Sopenharmony_ci goto out; 2968c2ecf20Sopenharmony_ci p4d = p4d_offset(pgd, address); 2978c2ecf20Sopenharmony_ci if (!p4d_present(*p4d)) 2988c2ecf20Sopenharmony_ci goto out; 2998c2ecf20Sopenharmony_ci pud = pud_offset(p4d, address); 3008c2ecf20Sopenharmony_ci if (!pud_present(*pud)) 3018c2ecf20Sopenharmony_ci goto out; 3028c2ecf20Sopenharmony_ci pmd = pmd_offset(pud, address); 3038c2ecf20Sopenharmony_ci /* 3048c2ecf20Sopenharmony_ci * READ_ONCE must function as a barrier with narrower scope 3058c2ecf20Sopenharmony_ci * and it must be equivalent to: 3068c2ecf20Sopenharmony_ci * _pmd = *pmd; barrier(); 3078c2ecf20Sopenharmony_ci * 3088c2ecf20Sopenharmony_ci * This is to deal with the instability (as in 3098c2ecf20Sopenharmony_ci * pmd_trans_unstable) of the pmd. 
3108c2ecf20Sopenharmony_ci */ 3118c2ecf20Sopenharmony_ci _pmd = READ_ONCE(*pmd); 3128c2ecf20Sopenharmony_ci if (pmd_none(_pmd)) 3138c2ecf20Sopenharmony_ci goto out; 3148c2ecf20Sopenharmony_ci 3158c2ecf20Sopenharmony_ci ret = false; 3168c2ecf20Sopenharmony_ci if (!pmd_present(_pmd)) 3178c2ecf20Sopenharmony_ci goto out; 3188c2ecf20Sopenharmony_ci 3198c2ecf20Sopenharmony_ci if (pmd_trans_huge(_pmd)) { 3208c2ecf20Sopenharmony_ci if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) 3218c2ecf20Sopenharmony_ci ret = true; 3228c2ecf20Sopenharmony_ci goto out; 3238c2ecf20Sopenharmony_ci } 3248c2ecf20Sopenharmony_ci 3258c2ecf20Sopenharmony_ci /* 3268c2ecf20Sopenharmony_ci * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it 3278c2ecf20Sopenharmony_ci * and use the standard pte_offset_map() instead of parsing _pmd. 3288c2ecf20Sopenharmony_ci */ 3298c2ecf20Sopenharmony_ci pte = pte_offset_map(pmd, address); 3308c2ecf20Sopenharmony_ci /* 3318c2ecf20Sopenharmony_ci * Lockless access: we're in a wait_event so it's ok if it 3328c2ecf20Sopenharmony_ci * changes under us. 
3338c2ecf20Sopenharmony_ci */ 3348c2ecf20Sopenharmony_ci if (pte_none(*pte)) 3358c2ecf20Sopenharmony_ci ret = true; 3368c2ecf20Sopenharmony_ci if (!pte_write(*pte) && (reason & VM_UFFD_WP)) 3378c2ecf20Sopenharmony_ci ret = true; 3388c2ecf20Sopenharmony_ci pte_unmap(pte); 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_ciout: 3418c2ecf20Sopenharmony_ci return ret; 3428c2ecf20Sopenharmony_ci} 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_cistatic inline long userfaultfd_get_blocking_state(unsigned int flags) 3458c2ecf20Sopenharmony_ci{ 3468c2ecf20Sopenharmony_ci if (flags & FAULT_FLAG_INTERRUPTIBLE) 3478c2ecf20Sopenharmony_ci return TASK_INTERRUPTIBLE; 3488c2ecf20Sopenharmony_ci 3498c2ecf20Sopenharmony_ci if (flags & FAULT_FLAG_KILLABLE) 3508c2ecf20Sopenharmony_ci return TASK_KILLABLE; 3518c2ecf20Sopenharmony_ci 3528c2ecf20Sopenharmony_ci return TASK_UNINTERRUPTIBLE; 3538c2ecf20Sopenharmony_ci} 3548c2ecf20Sopenharmony_ci 3558c2ecf20Sopenharmony_ci/* 3568c2ecf20Sopenharmony_ci * The locking rules involved in returning VM_FAULT_RETRY depending on 3578c2ecf20Sopenharmony_ci * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and 3588c2ecf20Sopenharmony_ci * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" 3598c2ecf20Sopenharmony_ci * recommendation in __lock_page_or_retry is not an understatement. 3608c2ecf20Sopenharmony_ci * 3618c2ecf20Sopenharmony_ci * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released 3628c2ecf20Sopenharmony_ci * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is 3638c2ecf20Sopenharmony_ci * not set. 3648c2ecf20Sopenharmony_ci * 3658c2ecf20Sopenharmony_ci * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not 3668c2ecf20Sopenharmony_ci * set, VM_FAULT_RETRY can still be returned if and only if there are 3678c2ecf20Sopenharmony_ci * fatal_signal_pending()s, and the mmap_lock must be released before 3688c2ecf20Sopenharmony_ci * returning it. 
3698c2ecf20Sopenharmony_ci */ 3708c2ecf20Sopenharmony_civm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) 3718c2ecf20Sopenharmony_ci{ 3728c2ecf20Sopenharmony_ci struct mm_struct *mm = vmf->vma->vm_mm; 3738c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx; 3748c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue uwq; 3758c2ecf20Sopenharmony_ci vm_fault_t ret = VM_FAULT_SIGBUS; 3768c2ecf20Sopenharmony_ci bool must_wait; 3778c2ecf20Sopenharmony_ci long blocking_state; 3788c2ecf20Sopenharmony_ci 3798c2ecf20Sopenharmony_ci /* 3808c2ecf20Sopenharmony_ci * We don't do userfault handling for the final child pid update. 3818c2ecf20Sopenharmony_ci * 3828c2ecf20Sopenharmony_ci * We also don't do userfault handling during 3838c2ecf20Sopenharmony_ci * coredumping. hugetlbfs has the special 3848c2ecf20Sopenharmony_ci * follow_hugetlb_page() to skip missing pages in the 3858c2ecf20Sopenharmony_ci * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with 3868c2ecf20Sopenharmony_ci * the no_page_table() helper in follow_page_mask(), but the 3878c2ecf20Sopenharmony_ci * shmem_vm_ops->fault method is invoked even during 3888c2ecf20Sopenharmony_ci * coredumping without mmap_lock and it ends up here. 3898c2ecf20Sopenharmony_ci */ 3908c2ecf20Sopenharmony_ci if (current->flags & (PF_EXITING|PF_DUMPCORE)) 3918c2ecf20Sopenharmony_ci goto out; 3928c2ecf20Sopenharmony_ci 3938c2ecf20Sopenharmony_ci /* 3948c2ecf20Sopenharmony_ci * Coredumping runs without mmap_lock so we can only check that 3958c2ecf20Sopenharmony_ci * the mmap_lock is held, if PF_DUMPCORE was not set. 
3968c2ecf20Sopenharmony_ci */ 3978c2ecf20Sopenharmony_ci mmap_assert_locked(mm); 3988c2ecf20Sopenharmony_ci 3998c2ecf20Sopenharmony_ci ctx = vmf->vma->vm_userfaultfd_ctx.ctx; 4008c2ecf20Sopenharmony_ci if (!ctx) 4018c2ecf20Sopenharmony_ci goto out; 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci BUG_ON(ctx->mm != mm); 4048c2ecf20Sopenharmony_ci 4058c2ecf20Sopenharmony_ci VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); 4068c2ecf20Sopenharmony_ci VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); 4078c2ecf20Sopenharmony_ci 4088c2ecf20Sopenharmony_ci if (ctx->features & UFFD_FEATURE_SIGBUS) 4098c2ecf20Sopenharmony_ci goto out; 4108c2ecf20Sopenharmony_ci 4118c2ecf20Sopenharmony_ci /* 4128c2ecf20Sopenharmony_ci * If it's already released don't get it. This avoids to loop 4138c2ecf20Sopenharmony_ci * in __get_user_pages if userfaultfd_release waits on the 4148c2ecf20Sopenharmony_ci * caller of handle_userfault to release the mmap_lock. 4158c2ecf20Sopenharmony_ci */ 4168c2ecf20Sopenharmony_ci if (unlikely(READ_ONCE(ctx->released))) { 4178c2ecf20Sopenharmony_ci /* 4188c2ecf20Sopenharmony_ci * Don't return VM_FAULT_SIGBUS in this case, so a non 4198c2ecf20Sopenharmony_ci * cooperative manager can close the uffd after the 4208c2ecf20Sopenharmony_ci * last UFFDIO_COPY, without risking to trigger an 4218c2ecf20Sopenharmony_ci * involuntary SIGBUS if the process was starting the 4228c2ecf20Sopenharmony_ci * userfaultfd while the userfaultfd was still armed 4238c2ecf20Sopenharmony_ci * (but after the last UFFDIO_COPY). If the uffd 4248c2ecf20Sopenharmony_ci * wasn't already closed when the userfault reached 4258c2ecf20Sopenharmony_ci * this point, that would normally be solved by 4268c2ecf20Sopenharmony_ci * userfaultfd_must_wait returning 'false'. 
4278c2ecf20Sopenharmony_ci * 4288c2ecf20Sopenharmony_ci * If we were to return VM_FAULT_SIGBUS here, the non 4298c2ecf20Sopenharmony_ci * cooperative manager would be instead forced to 4308c2ecf20Sopenharmony_ci * always call UFFDIO_UNREGISTER before it can safely 4318c2ecf20Sopenharmony_ci * close the uffd. 4328c2ecf20Sopenharmony_ci */ 4338c2ecf20Sopenharmony_ci ret = VM_FAULT_NOPAGE; 4348c2ecf20Sopenharmony_ci goto out; 4358c2ecf20Sopenharmony_ci } 4368c2ecf20Sopenharmony_ci 4378c2ecf20Sopenharmony_ci /* 4388c2ecf20Sopenharmony_ci * Check that we can return VM_FAULT_RETRY. 4398c2ecf20Sopenharmony_ci * 4408c2ecf20Sopenharmony_ci * NOTE: it should become possible to return VM_FAULT_RETRY 4418c2ecf20Sopenharmony_ci * even if FAULT_FLAG_TRIED is set without leading to gup() 4428c2ecf20Sopenharmony_ci * -EBUSY failures, if the userfaultfd is to be extended for 4438c2ecf20Sopenharmony_ci * VM_UFFD_WP tracking and we intend to arm the userfault 4448c2ecf20Sopenharmony_ci * without first stopping userland access to the memory. For 4458c2ecf20Sopenharmony_ci * VM_UFFD_MISSING userfaults this is enough for now. 4468c2ecf20Sopenharmony_ci */ 4478c2ecf20Sopenharmony_ci if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { 4488c2ecf20Sopenharmony_ci /* 4498c2ecf20Sopenharmony_ci * Validate the invariant that nowait must allow retry 4508c2ecf20Sopenharmony_ci * to be sure not to return SIGBUS erroneously on 4518c2ecf20Sopenharmony_ci * nowait invocations. 
4528c2ecf20Sopenharmony_ci */ 4538c2ecf20Sopenharmony_ci BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); 4548c2ecf20Sopenharmony_ci#ifdef CONFIG_DEBUG_VM 4558c2ecf20Sopenharmony_ci if (printk_ratelimit()) { 4568c2ecf20Sopenharmony_ci printk(KERN_WARNING 4578c2ecf20Sopenharmony_ci "FAULT_FLAG_ALLOW_RETRY missing %x\n", 4588c2ecf20Sopenharmony_ci vmf->flags); 4598c2ecf20Sopenharmony_ci dump_stack(); 4608c2ecf20Sopenharmony_ci } 4618c2ecf20Sopenharmony_ci#endif 4628c2ecf20Sopenharmony_ci goto out; 4638c2ecf20Sopenharmony_ci } 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci /* 4668c2ecf20Sopenharmony_ci * Handle nowait, not much to do other than tell it to retry 4678c2ecf20Sopenharmony_ci * and wait. 4688c2ecf20Sopenharmony_ci */ 4698c2ecf20Sopenharmony_ci ret = VM_FAULT_RETRY; 4708c2ecf20Sopenharmony_ci if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) 4718c2ecf20Sopenharmony_ci goto out; 4728c2ecf20Sopenharmony_ci 4738c2ecf20Sopenharmony_ci /* take the reference before dropping the mmap_lock */ 4748c2ecf20Sopenharmony_ci userfaultfd_ctx_get(ctx); 4758c2ecf20Sopenharmony_ci 4768c2ecf20Sopenharmony_ci init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 4778c2ecf20Sopenharmony_ci uwq.wq.private = current; 4788c2ecf20Sopenharmony_ci uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, 4798c2ecf20Sopenharmony_ci ctx->features); 4808c2ecf20Sopenharmony_ci uwq.ctx = ctx; 4818c2ecf20Sopenharmony_ci uwq.waken = false; 4828c2ecf20Sopenharmony_ci 4838c2ecf20Sopenharmony_ci blocking_state = userfaultfd_get_blocking_state(vmf->flags); 4848c2ecf20Sopenharmony_ci 4858c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fault_pending_wqh.lock); 4868c2ecf20Sopenharmony_ci /* 4878c2ecf20Sopenharmony_ci * After the __add_wait_queue the uwq is visible to userland 4888c2ecf20Sopenharmony_ci * through poll/read(). 
4898c2ecf20Sopenharmony_ci */ 4908c2ecf20Sopenharmony_ci __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); 4918c2ecf20Sopenharmony_ci /* 4928c2ecf20Sopenharmony_ci * The smp_mb() after __set_current_state prevents the reads 4938c2ecf20Sopenharmony_ci * following the spin_unlock to happen before the list_add in 4948c2ecf20Sopenharmony_ci * __add_wait_queue. 4958c2ecf20Sopenharmony_ci */ 4968c2ecf20Sopenharmony_ci set_current_state(blocking_state); 4978c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fault_pending_wqh.lock); 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_ci if (!is_vm_hugetlb_page(vmf->vma)) 5008c2ecf20Sopenharmony_ci must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, 5018c2ecf20Sopenharmony_ci reason); 5028c2ecf20Sopenharmony_ci else 5038c2ecf20Sopenharmony_ci must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, 5048c2ecf20Sopenharmony_ci vmf->address, 5058c2ecf20Sopenharmony_ci vmf->flags, reason); 5068c2ecf20Sopenharmony_ci mmap_read_unlock(mm); 5078c2ecf20Sopenharmony_ci 5088c2ecf20Sopenharmony_ci if (likely(must_wait && !READ_ONCE(ctx->released))) { 5098c2ecf20Sopenharmony_ci wake_up_poll(&ctx->fd_wqh, EPOLLIN); 5108c2ecf20Sopenharmony_ci schedule(); 5118c2ecf20Sopenharmony_ci } 5128c2ecf20Sopenharmony_ci 5138c2ecf20Sopenharmony_ci __set_current_state(TASK_RUNNING); 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_ci /* 5168c2ecf20Sopenharmony_ci * Here we race with the list_del; list_add in 5178c2ecf20Sopenharmony_ci * userfaultfd_ctx_read(), however because we don't ever run 5188c2ecf20Sopenharmony_ci * list_del_init() to refile across the two lists, the prev 5198c2ecf20Sopenharmony_ci * and next pointers will never point to self. list_add also 5208c2ecf20Sopenharmony_ci * would never let any of the two pointers to point to 5218c2ecf20Sopenharmony_ci * self. So list_empty_careful won't risk to see both pointers 5228c2ecf20Sopenharmony_ci * pointing to self at any time during the list refile. 
The 5238c2ecf20Sopenharmony_ci * only case where list_del_init() is called is the full 5248c2ecf20Sopenharmony_ci * removal in the wake function and there we don't re-list_add 5258c2ecf20Sopenharmony_ci * and it's fine not to block on the spinlock. The uwq on this 5268c2ecf20Sopenharmony_ci * kernel stack can be released after the list_del_init. 5278c2ecf20Sopenharmony_ci */ 5288c2ecf20Sopenharmony_ci if (!list_empty_careful(&uwq.wq.entry)) { 5298c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fault_pending_wqh.lock); 5308c2ecf20Sopenharmony_ci /* 5318c2ecf20Sopenharmony_ci * No need of list_del_init(), the uwq on the stack 5328c2ecf20Sopenharmony_ci * will be freed shortly anyway. 5338c2ecf20Sopenharmony_ci */ 5348c2ecf20Sopenharmony_ci list_del(&uwq.wq.entry); 5358c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fault_pending_wqh.lock); 5368c2ecf20Sopenharmony_ci } 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_ci /* 5398c2ecf20Sopenharmony_ci * ctx may go away after this if the userfault pseudo fd is 5408c2ecf20Sopenharmony_ci * already released. 
5418c2ecf20Sopenharmony_ci */ 5428c2ecf20Sopenharmony_ci userfaultfd_ctx_put(ctx); 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ciout: 5458c2ecf20Sopenharmony_ci return ret; 5468c2ecf20Sopenharmony_ci} 5478c2ecf20Sopenharmony_ci 5488c2ecf20Sopenharmony_cistatic void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, 5498c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue *ewq) 5508c2ecf20Sopenharmony_ci{ 5518c2ecf20Sopenharmony_ci struct userfaultfd_ctx *release_new_ctx; 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_ci if (WARN_ON_ONCE(current->flags & PF_EXITING)) 5548c2ecf20Sopenharmony_ci goto out; 5558c2ecf20Sopenharmony_ci 5568c2ecf20Sopenharmony_ci ewq->ctx = ctx; 5578c2ecf20Sopenharmony_ci init_waitqueue_entry(&ewq->wq, current); 5588c2ecf20Sopenharmony_ci release_new_ctx = NULL; 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->event_wqh.lock); 5618c2ecf20Sopenharmony_ci /* 5628c2ecf20Sopenharmony_ci * After the __add_wait_queue the uwq is visible to userland 5638c2ecf20Sopenharmony_ci * through poll/read(). 5648c2ecf20Sopenharmony_ci */ 5658c2ecf20Sopenharmony_ci __add_wait_queue(&ctx->event_wqh, &ewq->wq); 5668c2ecf20Sopenharmony_ci for (;;) { 5678c2ecf20Sopenharmony_ci set_current_state(TASK_KILLABLE); 5688c2ecf20Sopenharmony_ci if (ewq->msg.event == 0) 5698c2ecf20Sopenharmony_ci break; 5708c2ecf20Sopenharmony_ci if (READ_ONCE(ctx->released) || 5718c2ecf20Sopenharmony_ci fatal_signal_pending(current)) { 5728c2ecf20Sopenharmony_ci /* 5738c2ecf20Sopenharmony_ci * &ewq->wq may be queued in fork_event, but 5748c2ecf20Sopenharmony_ci * __remove_wait_queue ignores the head 5758c2ecf20Sopenharmony_ci * parameter. It would be a problem if it 5768c2ecf20Sopenharmony_ci * didn't. 
5778c2ecf20Sopenharmony_ci */ 5788c2ecf20Sopenharmony_ci __remove_wait_queue(&ctx->event_wqh, &ewq->wq); 5798c2ecf20Sopenharmony_ci if (ewq->msg.event == UFFD_EVENT_FORK) { 5808c2ecf20Sopenharmony_ci struct userfaultfd_ctx *new; 5818c2ecf20Sopenharmony_ci 5828c2ecf20Sopenharmony_ci new = (struct userfaultfd_ctx *) 5838c2ecf20Sopenharmony_ci (unsigned long) 5848c2ecf20Sopenharmony_ci ewq->msg.arg.reserved.reserved1; 5858c2ecf20Sopenharmony_ci release_new_ctx = new; 5868c2ecf20Sopenharmony_ci } 5878c2ecf20Sopenharmony_ci break; 5888c2ecf20Sopenharmony_ci } 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->event_wqh.lock); 5918c2ecf20Sopenharmony_ci 5928c2ecf20Sopenharmony_ci wake_up_poll(&ctx->fd_wqh, EPOLLIN); 5938c2ecf20Sopenharmony_ci schedule(); 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->event_wqh.lock); 5968c2ecf20Sopenharmony_ci } 5978c2ecf20Sopenharmony_ci __set_current_state(TASK_RUNNING); 5988c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->event_wqh.lock); 5998c2ecf20Sopenharmony_ci 6008c2ecf20Sopenharmony_ci if (release_new_ctx) { 6018c2ecf20Sopenharmony_ci struct vm_area_struct *vma; 6028c2ecf20Sopenharmony_ci struct mm_struct *mm = release_new_ctx->mm; 6038c2ecf20Sopenharmony_ci 6048c2ecf20Sopenharmony_ci /* the various vma->vm_userfaultfd_ctx still points to it */ 6058c2ecf20Sopenharmony_ci mmap_write_lock(mm); 6068c2ecf20Sopenharmony_ci for (vma = mm->mmap; vma; vma = vma->vm_next) 6078c2ecf20Sopenharmony_ci if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { 6088c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 6098c2ecf20Sopenharmony_ci vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); 6108c2ecf20Sopenharmony_ci } 6118c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 6128c2ecf20Sopenharmony_ci 6138c2ecf20Sopenharmony_ci userfaultfd_ctx_put(release_new_ctx); 6148c2ecf20Sopenharmony_ci } 6158c2ecf20Sopenharmony_ci 6168c2ecf20Sopenharmony_ci /* 6178c2ecf20Sopenharmony_ci * ctx may go 
away after this if the userfault pseudo fd is 6188c2ecf20Sopenharmony_ci * already released. 6198c2ecf20Sopenharmony_ci */ 6208c2ecf20Sopenharmony_ciout: 6218c2ecf20Sopenharmony_ci WRITE_ONCE(ctx->mmap_changing, false); 6228c2ecf20Sopenharmony_ci userfaultfd_ctx_put(ctx); 6238c2ecf20Sopenharmony_ci} 6248c2ecf20Sopenharmony_ci 6258c2ecf20Sopenharmony_cistatic void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, 6268c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue *ewq) 6278c2ecf20Sopenharmony_ci{ 6288c2ecf20Sopenharmony_ci ewq->msg.event = 0; 6298c2ecf20Sopenharmony_ci wake_up_locked(&ctx->event_wqh); 6308c2ecf20Sopenharmony_ci __remove_wait_queue(&ctx->event_wqh, &ewq->wq); 6318c2ecf20Sopenharmony_ci} 6328c2ecf20Sopenharmony_ci 6338c2ecf20Sopenharmony_ciint dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) 6348c2ecf20Sopenharmony_ci{ 6358c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = NULL, *octx; 6368c2ecf20Sopenharmony_ci struct userfaultfd_fork_ctx *fctx; 6378c2ecf20Sopenharmony_ci 6388c2ecf20Sopenharmony_ci octx = vma->vm_userfaultfd_ctx.ctx; 6398c2ecf20Sopenharmony_ci if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { 6408c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 6418c2ecf20Sopenharmony_ci vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); 6428c2ecf20Sopenharmony_ci return 0; 6438c2ecf20Sopenharmony_ci } 6448c2ecf20Sopenharmony_ci 6458c2ecf20Sopenharmony_ci list_for_each_entry(fctx, fcs, list) 6468c2ecf20Sopenharmony_ci if (fctx->orig == octx) { 6478c2ecf20Sopenharmony_ci ctx = fctx->new; 6488c2ecf20Sopenharmony_ci break; 6498c2ecf20Sopenharmony_ci } 6508c2ecf20Sopenharmony_ci 6518c2ecf20Sopenharmony_ci if (!ctx) { 6528c2ecf20Sopenharmony_ci fctx = kmalloc(sizeof(*fctx), GFP_KERNEL); 6538c2ecf20Sopenharmony_ci if (!fctx) 6548c2ecf20Sopenharmony_ci return -ENOMEM; 6558c2ecf20Sopenharmony_ci 6568c2ecf20Sopenharmony_ci ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); 
6578c2ecf20Sopenharmony_ci if (!ctx) { 6588c2ecf20Sopenharmony_ci kfree(fctx); 6598c2ecf20Sopenharmony_ci return -ENOMEM; 6608c2ecf20Sopenharmony_ci } 6618c2ecf20Sopenharmony_ci 6628c2ecf20Sopenharmony_ci refcount_set(&ctx->refcount, 1); 6638c2ecf20Sopenharmony_ci ctx->flags = octx->flags; 6648c2ecf20Sopenharmony_ci ctx->features = octx->features; 6658c2ecf20Sopenharmony_ci ctx->released = false; 6668c2ecf20Sopenharmony_ci ctx->mmap_changing = false; 6678c2ecf20Sopenharmony_ci ctx->mm = vma->vm_mm; 6688c2ecf20Sopenharmony_ci mmgrab(ctx->mm); 6698c2ecf20Sopenharmony_ci 6708c2ecf20Sopenharmony_ci userfaultfd_ctx_get(octx); 6718c2ecf20Sopenharmony_ci WRITE_ONCE(octx->mmap_changing, true); 6728c2ecf20Sopenharmony_ci fctx->orig = octx; 6738c2ecf20Sopenharmony_ci fctx->new = ctx; 6748c2ecf20Sopenharmony_ci list_add_tail(&fctx->list, fcs); 6758c2ecf20Sopenharmony_ci } 6768c2ecf20Sopenharmony_ci 6778c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx.ctx = ctx; 6788c2ecf20Sopenharmony_ci return 0; 6798c2ecf20Sopenharmony_ci} 6808c2ecf20Sopenharmony_ci 6818c2ecf20Sopenharmony_cistatic void dup_fctx(struct userfaultfd_fork_ctx *fctx) 6828c2ecf20Sopenharmony_ci{ 6838c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = fctx->orig; 6848c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue ewq; 6858c2ecf20Sopenharmony_ci 6868c2ecf20Sopenharmony_ci msg_init(&ewq.msg); 6878c2ecf20Sopenharmony_ci 6888c2ecf20Sopenharmony_ci ewq.msg.event = UFFD_EVENT_FORK; 6898c2ecf20Sopenharmony_ci ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; 6908c2ecf20Sopenharmony_ci 6918c2ecf20Sopenharmony_ci userfaultfd_event_wait_completion(ctx, &ewq); 6928c2ecf20Sopenharmony_ci} 6938c2ecf20Sopenharmony_ci 6948c2ecf20Sopenharmony_civoid dup_userfaultfd_complete(struct list_head *fcs) 6958c2ecf20Sopenharmony_ci{ 6968c2ecf20Sopenharmony_ci struct userfaultfd_fork_ctx *fctx, *n; 6978c2ecf20Sopenharmony_ci 6988c2ecf20Sopenharmony_ci list_for_each_entry_safe(fctx, n, fcs, list) { 6998c2ecf20Sopenharmony_ci 
dup_fctx(fctx); 7008c2ecf20Sopenharmony_ci list_del(&fctx->list); 7018c2ecf20Sopenharmony_ci kfree(fctx); 7028c2ecf20Sopenharmony_ci } 7038c2ecf20Sopenharmony_ci} 7048c2ecf20Sopenharmony_ci 7058c2ecf20Sopenharmony_civoid mremap_userfaultfd_prep(struct vm_area_struct *vma, 7068c2ecf20Sopenharmony_ci struct vm_userfaultfd_ctx *vm_ctx) 7078c2ecf20Sopenharmony_ci{ 7088c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx; 7098c2ecf20Sopenharmony_ci 7108c2ecf20Sopenharmony_ci ctx = vma->vm_userfaultfd_ctx.ctx; 7118c2ecf20Sopenharmony_ci 7128c2ecf20Sopenharmony_ci if (!ctx) 7138c2ecf20Sopenharmony_ci return; 7148c2ecf20Sopenharmony_ci 7158c2ecf20Sopenharmony_ci if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { 7168c2ecf20Sopenharmony_ci vm_ctx->ctx = ctx; 7178c2ecf20Sopenharmony_ci userfaultfd_ctx_get(ctx); 7188c2ecf20Sopenharmony_ci WRITE_ONCE(ctx->mmap_changing, true); 7198c2ecf20Sopenharmony_ci } else { 7208c2ecf20Sopenharmony_ci /* Drop uffd context if remap feature not enabled */ 7218c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 7228c2ecf20Sopenharmony_ci vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); 7238c2ecf20Sopenharmony_ci } 7248c2ecf20Sopenharmony_ci} 7258c2ecf20Sopenharmony_ci 7268c2ecf20Sopenharmony_civoid mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, 7278c2ecf20Sopenharmony_ci unsigned long from, unsigned long to, 7288c2ecf20Sopenharmony_ci unsigned long len) 7298c2ecf20Sopenharmony_ci{ 7308c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = vm_ctx->ctx; 7318c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue ewq; 7328c2ecf20Sopenharmony_ci 7338c2ecf20Sopenharmony_ci if (!ctx) 7348c2ecf20Sopenharmony_ci return; 7358c2ecf20Sopenharmony_ci 7368c2ecf20Sopenharmony_ci if (to & ~PAGE_MASK) { 7378c2ecf20Sopenharmony_ci userfaultfd_ctx_put(ctx); 7388c2ecf20Sopenharmony_ci return; 7398c2ecf20Sopenharmony_ci } 7408c2ecf20Sopenharmony_ci 7418c2ecf20Sopenharmony_ci msg_init(&ewq.msg); 7428c2ecf20Sopenharmony_ci 
7438c2ecf20Sopenharmony_ci ewq.msg.event = UFFD_EVENT_REMAP; 7448c2ecf20Sopenharmony_ci ewq.msg.arg.remap.from = from; 7458c2ecf20Sopenharmony_ci ewq.msg.arg.remap.to = to; 7468c2ecf20Sopenharmony_ci ewq.msg.arg.remap.len = len; 7478c2ecf20Sopenharmony_ci 7488c2ecf20Sopenharmony_ci userfaultfd_event_wait_completion(ctx, &ewq); 7498c2ecf20Sopenharmony_ci} 7508c2ecf20Sopenharmony_ci 7518c2ecf20Sopenharmony_cibool userfaultfd_remove(struct vm_area_struct *vma, 7528c2ecf20Sopenharmony_ci unsigned long start, unsigned long end) 7538c2ecf20Sopenharmony_ci{ 7548c2ecf20Sopenharmony_ci struct mm_struct *mm = vma->vm_mm; 7558c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx; 7568c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue ewq; 7578c2ecf20Sopenharmony_ci 7588c2ecf20Sopenharmony_ci ctx = vma->vm_userfaultfd_ctx.ctx; 7598c2ecf20Sopenharmony_ci if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) 7608c2ecf20Sopenharmony_ci return true; 7618c2ecf20Sopenharmony_ci 7628c2ecf20Sopenharmony_ci userfaultfd_ctx_get(ctx); 7638c2ecf20Sopenharmony_ci WRITE_ONCE(ctx->mmap_changing, true); 7648c2ecf20Sopenharmony_ci mmap_read_unlock(mm); 7658c2ecf20Sopenharmony_ci 7668c2ecf20Sopenharmony_ci msg_init(&ewq.msg); 7678c2ecf20Sopenharmony_ci 7688c2ecf20Sopenharmony_ci ewq.msg.event = UFFD_EVENT_REMOVE; 7698c2ecf20Sopenharmony_ci ewq.msg.arg.remove.start = start; 7708c2ecf20Sopenharmony_ci ewq.msg.arg.remove.end = end; 7718c2ecf20Sopenharmony_ci 7728c2ecf20Sopenharmony_ci userfaultfd_event_wait_completion(ctx, &ewq); 7738c2ecf20Sopenharmony_ci 7748c2ecf20Sopenharmony_ci return false; 7758c2ecf20Sopenharmony_ci} 7768c2ecf20Sopenharmony_ci 7778c2ecf20Sopenharmony_cistatic bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, 7788c2ecf20Sopenharmony_ci unsigned long start, unsigned long end) 7798c2ecf20Sopenharmony_ci{ 7808c2ecf20Sopenharmony_ci struct userfaultfd_unmap_ctx *unmap_ctx; 7818c2ecf20Sopenharmony_ci 7828c2ecf20Sopenharmony_ci 
list_for_each_entry(unmap_ctx, unmaps, list) 7838c2ecf20Sopenharmony_ci if (unmap_ctx->ctx == ctx && unmap_ctx->start == start && 7848c2ecf20Sopenharmony_ci unmap_ctx->end == end) 7858c2ecf20Sopenharmony_ci return true; 7868c2ecf20Sopenharmony_ci 7878c2ecf20Sopenharmony_ci return false; 7888c2ecf20Sopenharmony_ci} 7898c2ecf20Sopenharmony_ci 7908c2ecf20Sopenharmony_ciint userfaultfd_unmap_prep(struct vm_area_struct *vma, 7918c2ecf20Sopenharmony_ci unsigned long start, unsigned long end, 7928c2ecf20Sopenharmony_ci struct list_head *unmaps) 7938c2ecf20Sopenharmony_ci{ 7948c2ecf20Sopenharmony_ci for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { 7958c2ecf20Sopenharmony_ci struct userfaultfd_unmap_ctx *unmap_ctx; 7968c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; 7978c2ecf20Sopenharmony_ci 7988c2ecf20Sopenharmony_ci if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) || 7998c2ecf20Sopenharmony_ci has_unmap_ctx(ctx, unmaps, start, end)) 8008c2ecf20Sopenharmony_ci continue; 8018c2ecf20Sopenharmony_ci 8028c2ecf20Sopenharmony_ci unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL); 8038c2ecf20Sopenharmony_ci if (!unmap_ctx) 8048c2ecf20Sopenharmony_ci return -ENOMEM; 8058c2ecf20Sopenharmony_ci 8068c2ecf20Sopenharmony_ci userfaultfd_ctx_get(ctx); 8078c2ecf20Sopenharmony_ci WRITE_ONCE(ctx->mmap_changing, true); 8088c2ecf20Sopenharmony_ci unmap_ctx->ctx = ctx; 8098c2ecf20Sopenharmony_ci unmap_ctx->start = start; 8108c2ecf20Sopenharmony_ci unmap_ctx->end = end; 8118c2ecf20Sopenharmony_ci list_add_tail(&unmap_ctx->list, unmaps); 8128c2ecf20Sopenharmony_ci } 8138c2ecf20Sopenharmony_ci 8148c2ecf20Sopenharmony_ci return 0; 8158c2ecf20Sopenharmony_ci} 8168c2ecf20Sopenharmony_ci 8178c2ecf20Sopenharmony_civoid userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) 8188c2ecf20Sopenharmony_ci{ 8198c2ecf20Sopenharmony_ci struct userfaultfd_unmap_ctx *ctx, *n; 8208c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue ewq; 
8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci list_for_each_entry_safe(ctx, n, uf, list) { 8238c2ecf20Sopenharmony_ci msg_init(&ewq.msg); 8248c2ecf20Sopenharmony_ci 8258c2ecf20Sopenharmony_ci ewq.msg.event = UFFD_EVENT_UNMAP; 8268c2ecf20Sopenharmony_ci ewq.msg.arg.remove.start = ctx->start; 8278c2ecf20Sopenharmony_ci ewq.msg.arg.remove.end = ctx->end; 8288c2ecf20Sopenharmony_ci 8298c2ecf20Sopenharmony_ci userfaultfd_event_wait_completion(ctx->ctx, &ewq); 8308c2ecf20Sopenharmony_ci 8318c2ecf20Sopenharmony_ci list_del(&ctx->list); 8328c2ecf20Sopenharmony_ci kfree(ctx); 8338c2ecf20Sopenharmony_ci } 8348c2ecf20Sopenharmony_ci} 8358c2ecf20Sopenharmony_ci 8368c2ecf20Sopenharmony_cistatic int userfaultfd_release(struct inode *inode, struct file *file) 8378c2ecf20Sopenharmony_ci{ 8388c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = file->private_data; 8398c2ecf20Sopenharmony_ci struct mm_struct *mm = ctx->mm; 8408c2ecf20Sopenharmony_ci struct vm_area_struct *vma, *prev; 8418c2ecf20Sopenharmony_ci /* len == 0 means wake all */ 8428c2ecf20Sopenharmony_ci struct userfaultfd_wake_range range = { .len = 0, }; 8438c2ecf20Sopenharmony_ci unsigned long new_flags; 8448c2ecf20Sopenharmony_ci 8458c2ecf20Sopenharmony_ci WRITE_ONCE(ctx->released, true); 8468c2ecf20Sopenharmony_ci 8478c2ecf20Sopenharmony_ci if (!mmget_not_zero(mm)) 8488c2ecf20Sopenharmony_ci goto wakeup; 8498c2ecf20Sopenharmony_ci 8508c2ecf20Sopenharmony_ci /* 8518c2ecf20Sopenharmony_ci * Flush page faults out of all CPUs. NOTE: all page faults 8528c2ecf20Sopenharmony_ci * must be retried without returning VM_FAULT_SIGBUS if 8538c2ecf20Sopenharmony_ci * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx 8548c2ecf20Sopenharmony_ci * changes while handle_userfault released the mmap_lock. So 8558c2ecf20Sopenharmony_ci * it's critical that released is set to true (above), before 8568c2ecf20Sopenharmony_ci * taking the mmap_lock for writing. 
8578c2ecf20Sopenharmony_ci */ 8588c2ecf20Sopenharmony_ci mmap_write_lock(mm); 8598c2ecf20Sopenharmony_ci prev = NULL; 8608c2ecf20Sopenharmony_ci for (vma = mm->mmap; vma; vma = vma->vm_next) { 8618c2ecf20Sopenharmony_ci cond_resched(); 8628c2ecf20Sopenharmony_ci BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ 8638c2ecf20Sopenharmony_ci !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); 8648c2ecf20Sopenharmony_ci if (vma->vm_userfaultfd_ctx.ctx != ctx) { 8658c2ecf20Sopenharmony_ci prev = vma; 8668c2ecf20Sopenharmony_ci continue; 8678c2ecf20Sopenharmony_ci } 8688c2ecf20Sopenharmony_ci new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); 8698c2ecf20Sopenharmony_ci prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, 8708c2ecf20Sopenharmony_ci new_flags, vma->anon_vma, 8718c2ecf20Sopenharmony_ci vma->vm_file, vma->vm_pgoff, 8728c2ecf20Sopenharmony_ci vma_policy(vma), 8738c2ecf20Sopenharmony_ci NULL_VM_UFFD_CTX, anon_vma_name(vma)); 8748c2ecf20Sopenharmony_ci if (prev) 8758c2ecf20Sopenharmony_ci vma = prev; 8768c2ecf20Sopenharmony_ci else 8778c2ecf20Sopenharmony_ci prev = vma; 8788c2ecf20Sopenharmony_ci vma->vm_flags = new_flags; 8798c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 8808c2ecf20Sopenharmony_ci } 8818c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 8828c2ecf20Sopenharmony_ci mmput(mm); 8838c2ecf20Sopenharmony_ciwakeup: 8848c2ecf20Sopenharmony_ci /* 8858c2ecf20Sopenharmony_ci * After no new page faults can wait on this fault_*wqh, flush 8868c2ecf20Sopenharmony_ci * the last page faults that may have been already waiting on 8878c2ecf20Sopenharmony_ci * the fault_*wqh. 
8888c2ecf20Sopenharmony_ci */ 8898c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fault_pending_wqh.lock); 8908c2ecf20Sopenharmony_ci __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); 8918c2ecf20Sopenharmony_ci __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); 8928c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fault_pending_wqh.lock); 8938c2ecf20Sopenharmony_ci 8948c2ecf20Sopenharmony_ci /* Flush pending events that may still wait on event_wqh */ 8958c2ecf20Sopenharmony_ci wake_up_all(&ctx->event_wqh); 8968c2ecf20Sopenharmony_ci 8978c2ecf20Sopenharmony_ci wake_up_poll(&ctx->fd_wqh, EPOLLHUP); 8988c2ecf20Sopenharmony_ci userfaultfd_ctx_put(ctx); 8998c2ecf20Sopenharmony_ci return 0; 9008c2ecf20Sopenharmony_ci} 9018c2ecf20Sopenharmony_ci 9028c2ecf20Sopenharmony_ci/* fault_pending_wqh.lock must be hold by the caller */ 9038c2ecf20Sopenharmony_cistatic inline struct userfaultfd_wait_queue *find_userfault_in( 9048c2ecf20Sopenharmony_ci wait_queue_head_t *wqh) 9058c2ecf20Sopenharmony_ci{ 9068c2ecf20Sopenharmony_ci wait_queue_entry_t *wq; 9078c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue *uwq; 9088c2ecf20Sopenharmony_ci 9098c2ecf20Sopenharmony_ci lockdep_assert_held(&wqh->lock); 9108c2ecf20Sopenharmony_ci 9118c2ecf20Sopenharmony_ci uwq = NULL; 9128c2ecf20Sopenharmony_ci if (!waitqueue_active(wqh)) 9138c2ecf20Sopenharmony_ci goto out; 9148c2ecf20Sopenharmony_ci /* walk in reverse to provide FIFO behavior to read userfaults */ 9158c2ecf20Sopenharmony_ci wq = list_last_entry(&wqh->head, typeof(*wq), entry); 9168c2ecf20Sopenharmony_ci uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 9178c2ecf20Sopenharmony_ciout: 9188c2ecf20Sopenharmony_ci return uwq; 9198c2ecf20Sopenharmony_ci} 9208c2ecf20Sopenharmony_ci 9218c2ecf20Sopenharmony_cistatic inline struct userfaultfd_wait_queue *find_userfault( 9228c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx) 9238c2ecf20Sopenharmony_ci{ 9248c2ecf20Sopenharmony_ci return 
find_userfault_in(&ctx->fault_pending_wqh); 9258c2ecf20Sopenharmony_ci} 9268c2ecf20Sopenharmony_ci 9278c2ecf20Sopenharmony_cistatic inline struct userfaultfd_wait_queue *find_userfault_evt( 9288c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx) 9298c2ecf20Sopenharmony_ci{ 9308c2ecf20Sopenharmony_ci return find_userfault_in(&ctx->event_wqh); 9318c2ecf20Sopenharmony_ci} 9328c2ecf20Sopenharmony_ci 9338c2ecf20Sopenharmony_cistatic __poll_t userfaultfd_poll(struct file *file, poll_table *wait) 9348c2ecf20Sopenharmony_ci{ 9358c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = file->private_data; 9368c2ecf20Sopenharmony_ci __poll_t ret; 9378c2ecf20Sopenharmony_ci 9388c2ecf20Sopenharmony_ci poll_wait(file, &ctx->fd_wqh, wait); 9398c2ecf20Sopenharmony_ci 9408c2ecf20Sopenharmony_ci if (!userfaultfd_is_initialized(ctx)) 9418c2ecf20Sopenharmony_ci return EPOLLERR; 9428c2ecf20Sopenharmony_ci 9438c2ecf20Sopenharmony_ci /* 9448c2ecf20Sopenharmony_ci * poll() never guarantees that read won't block. 9458c2ecf20Sopenharmony_ci * userfaults can be waken before they're read(). 9468c2ecf20Sopenharmony_ci */ 9478c2ecf20Sopenharmony_ci if (unlikely(!(file->f_flags & O_NONBLOCK))) 9488c2ecf20Sopenharmony_ci return EPOLLERR; 9498c2ecf20Sopenharmony_ci /* 9508c2ecf20Sopenharmony_ci * lockless access to see if there are pending faults 9518c2ecf20Sopenharmony_ci * __pollwait last action is the add_wait_queue but 9528c2ecf20Sopenharmony_ci * the spin_unlock would allow the waitqueue_active to 9538c2ecf20Sopenharmony_ci * pass above the actual list_add inside 9548c2ecf20Sopenharmony_ci * add_wait_queue critical section. So use a full 9558c2ecf20Sopenharmony_ci * memory barrier to serialize the list_add write of 9568c2ecf20Sopenharmony_ci * add_wait_queue() with the waitqueue_active read 9578c2ecf20Sopenharmony_ci * below. 
9588c2ecf20Sopenharmony_ci */ 9598c2ecf20Sopenharmony_ci ret = 0; 9608c2ecf20Sopenharmony_ci smp_mb(); 9618c2ecf20Sopenharmony_ci if (waitqueue_active(&ctx->fault_pending_wqh)) 9628c2ecf20Sopenharmony_ci ret = EPOLLIN; 9638c2ecf20Sopenharmony_ci else if (waitqueue_active(&ctx->event_wqh)) 9648c2ecf20Sopenharmony_ci ret = EPOLLIN; 9658c2ecf20Sopenharmony_ci 9668c2ecf20Sopenharmony_ci return ret; 9678c2ecf20Sopenharmony_ci} 9688c2ecf20Sopenharmony_ci 9698c2ecf20Sopenharmony_cistatic const struct file_operations userfaultfd_fops; 9708c2ecf20Sopenharmony_ci 9718c2ecf20Sopenharmony_cistatic int resolve_userfault_fork(struct userfaultfd_ctx *ctx, 9728c2ecf20Sopenharmony_ci struct userfaultfd_ctx *new, 9738c2ecf20Sopenharmony_ci struct uffd_msg *msg) 9748c2ecf20Sopenharmony_ci{ 9758c2ecf20Sopenharmony_ci int fd; 9768c2ecf20Sopenharmony_ci 9778c2ecf20Sopenharmony_ci fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, 9788c2ecf20Sopenharmony_ci O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); 9798c2ecf20Sopenharmony_ci if (fd < 0) 9808c2ecf20Sopenharmony_ci return fd; 9818c2ecf20Sopenharmony_ci 9828c2ecf20Sopenharmony_ci msg->arg.reserved.reserved1 = 0; 9838c2ecf20Sopenharmony_ci msg->arg.fork.ufd = fd; 9848c2ecf20Sopenharmony_ci return 0; 9858c2ecf20Sopenharmony_ci} 9868c2ecf20Sopenharmony_ci 9878c2ecf20Sopenharmony_cistatic ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, 9888c2ecf20Sopenharmony_ci struct uffd_msg *msg) 9898c2ecf20Sopenharmony_ci{ 9908c2ecf20Sopenharmony_ci ssize_t ret; 9918c2ecf20Sopenharmony_ci DECLARE_WAITQUEUE(wait, current); 9928c2ecf20Sopenharmony_ci struct userfaultfd_wait_queue *uwq; 9938c2ecf20Sopenharmony_ci /* 9948c2ecf20Sopenharmony_ci * Handling fork event requires sleeping operations, so 9958c2ecf20Sopenharmony_ci * we drop the event_wqh lock, then do these ops, then 9968c2ecf20Sopenharmony_ci * lock it back and wake up the waiter. 
While the lock is 9978c2ecf20Sopenharmony_ci * dropped the ewq may go away so we keep track of it 9988c2ecf20Sopenharmony_ci * carefully. 9998c2ecf20Sopenharmony_ci */ 10008c2ecf20Sopenharmony_ci LIST_HEAD(fork_event); 10018c2ecf20Sopenharmony_ci struct userfaultfd_ctx *fork_nctx = NULL; 10028c2ecf20Sopenharmony_ci 10038c2ecf20Sopenharmony_ci /* always take the fd_wqh lock before the fault_pending_wqh lock */ 10048c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fd_wqh.lock); 10058c2ecf20Sopenharmony_ci __add_wait_queue(&ctx->fd_wqh, &wait); 10068c2ecf20Sopenharmony_ci for (;;) { 10078c2ecf20Sopenharmony_ci set_current_state(TASK_INTERRUPTIBLE); 10088c2ecf20Sopenharmony_ci spin_lock(&ctx->fault_pending_wqh.lock); 10098c2ecf20Sopenharmony_ci uwq = find_userfault(ctx); 10108c2ecf20Sopenharmony_ci if (uwq) { 10118c2ecf20Sopenharmony_ci /* 10128c2ecf20Sopenharmony_ci * Use a seqcount to repeat the lockless check 10138c2ecf20Sopenharmony_ci * in wake_userfault() to avoid missing 10148c2ecf20Sopenharmony_ci * wakeups because during the refile both 10158c2ecf20Sopenharmony_ci * waitqueue could become empty if this is the 10168c2ecf20Sopenharmony_ci * only userfault. 10178c2ecf20Sopenharmony_ci */ 10188c2ecf20Sopenharmony_ci write_seqcount_begin(&ctx->refile_seq); 10198c2ecf20Sopenharmony_ci 10208c2ecf20Sopenharmony_ci /* 10218c2ecf20Sopenharmony_ci * The fault_pending_wqh.lock prevents the uwq 10228c2ecf20Sopenharmony_ci * to disappear from under us. 10238c2ecf20Sopenharmony_ci * 10248c2ecf20Sopenharmony_ci * Refile this userfault from 10258c2ecf20Sopenharmony_ci * fault_pending_wqh to fault_wqh, it's not 10268c2ecf20Sopenharmony_ci * pending anymore after we read it. 
10278c2ecf20Sopenharmony_ci * 10288c2ecf20Sopenharmony_ci * Use list_del() by hand (as 10298c2ecf20Sopenharmony_ci * userfaultfd_wake_function also uses 10308c2ecf20Sopenharmony_ci * list_del_init() by hand) to be sure nobody 10318c2ecf20Sopenharmony_ci * changes __remove_wait_queue() to use 10328c2ecf20Sopenharmony_ci * list_del_init() in turn breaking the 10338c2ecf20Sopenharmony_ci * !list_empty_careful() check in 10348c2ecf20Sopenharmony_ci * handle_userfault(). The uwq->wq.head list 10358c2ecf20Sopenharmony_ci * must never be empty at any time during the 10368c2ecf20Sopenharmony_ci * refile, or the waitqueue could disappear 10378c2ecf20Sopenharmony_ci * from under us. The "wait_queue_head_t" 10388c2ecf20Sopenharmony_ci * parameter of __remove_wait_queue() is unused 10398c2ecf20Sopenharmony_ci * anyway. 10408c2ecf20Sopenharmony_ci */ 10418c2ecf20Sopenharmony_ci list_del(&uwq->wq.entry); 10428c2ecf20Sopenharmony_ci add_wait_queue(&ctx->fault_wqh, &uwq->wq); 10438c2ecf20Sopenharmony_ci 10448c2ecf20Sopenharmony_ci write_seqcount_end(&ctx->refile_seq); 10458c2ecf20Sopenharmony_ci 10468c2ecf20Sopenharmony_ci /* careful to always initialize msg if ret == 0 */ 10478c2ecf20Sopenharmony_ci *msg = uwq->msg; 10488c2ecf20Sopenharmony_ci spin_unlock(&ctx->fault_pending_wqh.lock); 10498c2ecf20Sopenharmony_ci ret = 0; 10508c2ecf20Sopenharmony_ci break; 10518c2ecf20Sopenharmony_ci } 10528c2ecf20Sopenharmony_ci spin_unlock(&ctx->fault_pending_wqh.lock); 10538c2ecf20Sopenharmony_ci 10548c2ecf20Sopenharmony_ci spin_lock(&ctx->event_wqh.lock); 10558c2ecf20Sopenharmony_ci uwq = find_userfault_evt(ctx); 10568c2ecf20Sopenharmony_ci if (uwq) { 10578c2ecf20Sopenharmony_ci *msg = uwq->msg; 10588c2ecf20Sopenharmony_ci 10598c2ecf20Sopenharmony_ci if (uwq->msg.event == UFFD_EVENT_FORK) { 10608c2ecf20Sopenharmony_ci fork_nctx = (struct userfaultfd_ctx *) 10618c2ecf20Sopenharmony_ci (unsigned long) 10628c2ecf20Sopenharmony_ci uwq->msg.arg.reserved.reserved1; 10638c2ecf20Sopenharmony_ci 
list_move(&uwq->wq.entry, &fork_event); 10648c2ecf20Sopenharmony_ci /* 10658c2ecf20Sopenharmony_ci * fork_nctx can be freed as soon as 10668c2ecf20Sopenharmony_ci * we drop the lock, unless we take a 10678c2ecf20Sopenharmony_ci * reference on it. 10688c2ecf20Sopenharmony_ci */ 10698c2ecf20Sopenharmony_ci userfaultfd_ctx_get(fork_nctx); 10708c2ecf20Sopenharmony_ci spin_unlock(&ctx->event_wqh.lock); 10718c2ecf20Sopenharmony_ci ret = 0; 10728c2ecf20Sopenharmony_ci break; 10738c2ecf20Sopenharmony_ci } 10748c2ecf20Sopenharmony_ci 10758c2ecf20Sopenharmony_ci userfaultfd_event_complete(ctx, uwq); 10768c2ecf20Sopenharmony_ci spin_unlock(&ctx->event_wqh.lock); 10778c2ecf20Sopenharmony_ci ret = 0; 10788c2ecf20Sopenharmony_ci break; 10798c2ecf20Sopenharmony_ci } 10808c2ecf20Sopenharmony_ci spin_unlock(&ctx->event_wqh.lock); 10818c2ecf20Sopenharmony_ci 10828c2ecf20Sopenharmony_ci if (signal_pending(current)) { 10838c2ecf20Sopenharmony_ci ret = -ERESTARTSYS; 10848c2ecf20Sopenharmony_ci break; 10858c2ecf20Sopenharmony_ci } 10868c2ecf20Sopenharmony_ci if (no_wait) { 10878c2ecf20Sopenharmony_ci ret = -EAGAIN; 10888c2ecf20Sopenharmony_ci break; 10898c2ecf20Sopenharmony_ci } 10908c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fd_wqh.lock); 10918c2ecf20Sopenharmony_ci schedule(); 10928c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fd_wqh.lock); 10938c2ecf20Sopenharmony_ci } 10948c2ecf20Sopenharmony_ci __remove_wait_queue(&ctx->fd_wqh, &wait); 10958c2ecf20Sopenharmony_ci __set_current_state(TASK_RUNNING); 10968c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fd_wqh.lock); 10978c2ecf20Sopenharmony_ci 10988c2ecf20Sopenharmony_ci if (!ret && msg->event == UFFD_EVENT_FORK) { 10998c2ecf20Sopenharmony_ci ret = resolve_userfault_fork(ctx, fork_nctx, msg); 11008c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->event_wqh.lock); 11018c2ecf20Sopenharmony_ci if (!list_empty(&fork_event)) { 11028c2ecf20Sopenharmony_ci /* 11038c2ecf20Sopenharmony_ci * The fork thread didn't abort, so we can 
11048c2ecf20Sopenharmony_ci * drop the temporary refcount. 11058c2ecf20Sopenharmony_ci */ 11068c2ecf20Sopenharmony_ci userfaultfd_ctx_put(fork_nctx); 11078c2ecf20Sopenharmony_ci 11088c2ecf20Sopenharmony_ci uwq = list_first_entry(&fork_event, 11098c2ecf20Sopenharmony_ci typeof(*uwq), 11108c2ecf20Sopenharmony_ci wq.entry); 11118c2ecf20Sopenharmony_ci /* 11128c2ecf20Sopenharmony_ci * If fork_event list wasn't empty and in turn 11138c2ecf20Sopenharmony_ci * the event wasn't already released by fork 11148c2ecf20Sopenharmony_ci * (the event is allocated on fork kernel 11158c2ecf20Sopenharmony_ci * stack), put the event back to its place in 11168c2ecf20Sopenharmony_ci * the event_wq. fork_event head will be freed 11178c2ecf20Sopenharmony_ci * as soon as we return so the event cannot 11188c2ecf20Sopenharmony_ci * stay queued there no matter the current 11198c2ecf20Sopenharmony_ci * "ret" value. 11208c2ecf20Sopenharmony_ci */ 11218c2ecf20Sopenharmony_ci list_del(&uwq->wq.entry); 11228c2ecf20Sopenharmony_ci __add_wait_queue(&ctx->event_wqh, &uwq->wq); 11238c2ecf20Sopenharmony_ci 11248c2ecf20Sopenharmony_ci /* 11258c2ecf20Sopenharmony_ci * Leave the event in the waitqueue and report 11268c2ecf20Sopenharmony_ci * error to userland if we failed to resolve 11278c2ecf20Sopenharmony_ci * the userfault fork. 11288c2ecf20Sopenharmony_ci */ 11298c2ecf20Sopenharmony_ci if (likely(!ret)) 11308c2ecf20Sopenharmony_ci userfaultfd_event_complete(ctx, uwq); 11318c2ecf20Sopenharmony_ci } else { 11328c2ecf20Sopenharmony_ci /* 11338c2ecf20Sopenharmony_ci * Here the fork thread aborted and the 11348c2ecf20Sopenharmony_ci * refcount from the fork thread on fork_nctx 11358c2ecf20Sopenharmony_ci * has already been released. We still hold 11368c2ecf20Sopenharmony_ci * the reference we took before releasing the 11378c2ecf20Sopenharmony_ci * lock above. 
If resolve_userfault_fork 11388c2ecf20Sopenharmony_ci * failed we've to drop it because the 11398c2ecf20Sopenharmony_ci * fork_nctx has to be freed in such case. If 11408c2ecf20Sopenharmony_ci * it succeeded we'll hold it because the new 11418c2ecf20Sopenharmony_ci * uffd references it. 11428c2ecf20Sopenharmony_ci */ 11438c2ecf20Sopenharmony_ci if (ret) 11448c2ecf20Sopenharmony_ci userfaultfd_ctx_put(fork_nctx); 11458c2ecf20Sopenharmony_ci } 11468c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->event_wqh.lock); 11478c2ecf20Sopenharmony_ci } 11488c2ecf20Sopenharmony_ci 11498c2ecf20Sopenharmony_ci return ret; 11508c2ecf20Sopenharmony_ci} 11518c2ecf20Sopenharmony_ci 11528c2ecf20Sopenharmony_cistatic ssize_t userfaultfd_read(struct file *file, char __user *buf, 11538c2ecf20Sopenharmony_ci size_t count, loff_t *ppos) 11548c2ecf20Sopenharmony_ci{ 11558c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = file->private_data; 11568c2ecf20Sopenharmony_ci ssize_t _ret, ret = 0; 11578c2ecf20Sopenharmony_ci struct uffd_msg msg; 11588c2ecf20Sopenharmony_ci int no_wait = file->f_flags & O_NONBLOCK; 11598c2ecf20Sopenharmony_ci 11608c2ecf20Sopenharmony_ci if (!userfaultfd_is_initialized(ctx)) 11618c2ecf20Sopenharmony_ci return -EINVAL; 11628c2ecf20Sopenharmony_ci 11638c2ecf20Sopenharmony_ci for (;;) { 11648c2ecf20Sopenharmony_ci if (count < sizeof(msg)) 11658c2ecf20Sopenharmony_ci return ret ? ret : -EINVAL; 11668c2ecf20Sopenharmony_ci _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); 11678c2ecf20Sopenharmony_ci if (_ret < 0) 11688c2ecf20Sopenharmony_ci return ret ? ret : _ret; 11698c2ecf20Sopenharmony_ci if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) 11708c2ecf20Sopenharmony_ci return ret ? 
ret : -EFAULT; 11718c2ecf20Sopenharmony_ci ret += sizeof(msg); 11728c2ecf20Sopenharmony_ci buf += sizeof(msg); 11738c2ecf20Sopenharmony_ci count -= sizeof(msg); 11748c2ecf20Sopenharmony_ci /* 11758c2ecf20Sopenharmony_ci * Allow to read more than one fault at time but only 11768c2ecf20Sopenharmony_ci * block if waiting for the very first one. 11778c2ecf20Sopenharmony_ci */ 11788c2ecf20Sopenharmony_ci no_wait = O_NONBLOCK; 11798c2ecf20Sopenharmony_ci } 11808c2ecf20Sopenharmony_ci} 11818c2ecf20Sopenharmony_ci 11828c2ecf20Sopenharmony_cistatic void __wake_userfault(struct userfaultfd_ctx *ctx, 11838c2ecf20Sopenharmony_ci struct userfaultfd_wake_range *range) 11848c2ecf20Sopenharmony_ci{ 11858c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fault_pending_wqh.lock); 11868c2ecf20Sopenharmony_ci /* wake all in the range and autoremove */ 11878c2ecf20Sopenharmony_ci if (waitqueue_active(&ctx->fault_pending_wqh)) 11888c2ecf20Sopenharmony_ci __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 11898c2ecf20Sopenharmony_ci range); 11908c2ecf20Sopenharmony_ci if (waitqueue_active(&ctx->fault_wqh)) 11918c2ecf20Sopenharmony_ci __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); 11928c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fault_pending_wqh.lock); 11938c2ecf20Sopenharmony_ci} 11948c2ecf20Sopenharmony_ci 11958c2ecf20Sopenharmony_cistatic __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, 11968c2ecf20Sopenharmony_ci struct userfaultfd_wake_range *range) 11978c2ecf20Sopenharmony_ci{ 11988c2ecf20Sopenharmony_ci unsigned seq; 11998c2ecf20Sopenharmony_ci bool need_wakeup; 12008c2ecf20Sopenharmony_ci 12018c2ecf20Sopenharmony_ci /* 12028c2ecf20Sopenharmony_ci * To be sure waitqueue_active() is not reordered by the CPU 12038c2ecf20Sopenharmony_ci * before the pagetable update, use an explicit SMP memory 12048c2ecf20Sopenharmony_ci * barrier here. 
PT lock release or mmap_read_unlock(mm) still 12058c2ecf20Sopenharmony_ci * have release semantics that can allow the 12068c2ecf20Sopenharmony_ci * waitqueue_active() to be reordered before the pte update. 12078c2ecf20Sopenharmony_ci */ 12088c2ecf20Sopenharmony_ci smp_mb(); 12098c2ecf20Sopenharmony_ci 12108c2ecf20Sopenharmony_ci /* 12118c2ecf20Sopenharmony_ci * Use waitqueue_active because it's very frequent to 12128c2ecf20Sopenharmony_ci * change the address space atomically even if there are no 12138c2ecf20Sopenharmony_ci * userfaults yet. So we take the spinlock only when we're 12148c2ecf20Sopenharmony_ci * sure we've userfaults to wake. 12158c2ecf20Sopenharmony_ci */ 12168c2ecf20Sopenharmony_ci do { 12178c2ecf20Sopenharmony_ci seq = read_seqcount_begin(&ctx->refile_seq); 12188c2ecf20Sopenharmony_ci need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || 12198c2ecf20Sopenharmony_ci waitqueue_active(&ctx->fault_wqh); 12208c2ecf20Sopenharmony_ci cond_resched(); 12218c2ecf20Sopenharmony_ci } while (read_seqcount_retry(&ctx->refile_seq, seq)); 12228c2ecf20Sopenharmony_ci if (need_wakeup) 12238c2ecf20Sopenharmony_ci __wake_userfault(ctx, range); 12248c2ecf20Sopenharmony_ci} 12258c2ecf20Sopenharmony_ci 12268c2ecf20Sopenharmony_cistatic __always_inline int validate_range(struct mm_struct *mm, 12278c2ecf20Sopenharmony_ci __u64 start, __u64 len) 12288c2ecf20Sopenharmony_ci{ 12298c2ecf20Sopenharmony_ci __u64 task_size = mm->task_size; 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci if (start & ~PAGE_MASK) 12328c2ecf20Sopenharmony_ci return -EINVAL; 12338c2ecf20Sopenharmony_ci if (len & ~PAGE_MASK) 12348c2ecf20Sopenharmony_ci return -EINVAL; 12358c2ecf20Sopenharmony_ci if (!len) 12368c2ecf20Sopenharmony_ci return -EINVAL; 12378c2ecf20Sopenharmony_ci if (start < mmap_min_addr) 12388c2ecf20Sopenharmony_ci return -EINVAL; 12398c2ecf20Sopenharmony_ci if (start >= task_size) 12408c2ecf20Sopenharmony_ci return -EINVAL; 12418c2ecf20Sopenharmony_ci if (len > task_size - 
start) 12428c2ecf20Sopenharmony_ci return -EINVAL; 12438c2ecf20Sopenharmony_ci return 0; 12448c2ecf20Sopenharmony_ci} 12458c2ecf20Sopenharmony_ci 12468c2ecf20Sopenharmony_cistatic inline bool vma_can_userfault(struct vm_area_struct *vma, 12478c2ecf20Sopenharmony_ci unsigned long vm_flags) 12488c2ecf20Sopenharmony_ci{ 12498c2ecf20Sopenharmony_ci /* FIXME: add WP support to hugetlbfs and shmem */ 12508c2ecf20Sopenharmony_ci return vma_is_anonymous(vma) || 12518c2ecf20Sopenharmony_ci ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && 12528c2ecf20Sopenharmony_ci !(vm_flags & VM_UFFD_WP)); 12538c2ecf20Sopenharmony_ci} 12548c2ecf20Sopenharmony_ci 12558c2ecf20Sopenharmony_cistatic int userfaultfd_register(struct userfaultfd_ctx *ctx, 12568c2ecf20Sopenharmony_ci unsigned long arg) 12578c2ecf20Sopenharmony_ci{ 12588c2ecf20Sopenharmony_ci struct mm_struct *mm = ctx->mm; 12598c2ecf20Sopenharmony_ci struct vm_area_struct *vma, *prev, *cur; 12608c2ecf20Sopenharmony_ci int ret; 12618c2ecf20Sopenharmony_ci struct uffdio_register uffdio_register; 12628c2ecf20Sopenharmony_ci struct uffdio_register __user *user_uffdio_register; 12638c2ecf20Sopenharmony_ci unsigned long vm_flags, new_flags; 12648c2ecf20Sopenharmony_ci bool found; 12658c2ecf20Sopenharmony_ci bool basic_ioctls; 12668c2ecf20Sopenharmony_ci unsigned long start, end, vma_end; 12678c2ecf20Sopenharmony_ci 12688c2ecf20Sopenharmony_ci user_uffdio_register = (struct uffdio_register __user *) arg; 12698c2ecf20Sopenharmony_ci 12708c2ecf20Sopenharmony_ci ret = -EFAULT; 12718c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_register, user_uffdio_register, 12728c2ecf20Sopenharmony_ci sizeof(uffdio_register)-sizeof(__u64))) 12738c2ecf20Sopenharmony_ci goto out; 12748c2ecf20Sopenharmony_ci 12758c2ecf20Sopenharmony_ci ret = -EINVAL; 12768c2ecf20Sopenharmony_ci if (!uffdio_register.mode) 12778c2ecf20Sopenharmony_ci goto out; 12788c2ecf20Sopenharmony_ci if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| 
12798c2ecf20Sopenharmony_ci UFFDIO_REGISTER_MODE_WP)) 12808c2ecf20Sopenharmony_ci goto out; 12818c2ecf20Sopenharmony_ci vm_flags = 0; 12828c2ecf20Sopenharmony_ci if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) 12838c2ecf20Sopenharmony_ci vm_flags |= VM_UFFD_MISSING; 12848c2ecf20Sopenharmony_ci if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) 12858c2ecf20Sopenharmony_ci vm_flags |= VM_UFFD_WP; 12868c2ecf20Sopenharmony_ci 12878c2ecf20Sopenharmony_ci ret = validate_range(mm, uffdio_register.range.start, 12888c2ecf20Sopenharmony_ci uffdio_register.range.len); 12898c2ecf20Sopenharmony_ci if (ret) 12908c2ecf20Sopenharmony_ci goto out; 12918c2ecf20Sopenharmony_ci 12928c2ecf20Sopenharmony_ci start = uffdio_register.range.start; 12938c2ecf20Sopenharmony_ci end = start + uffdio_register.range.len; 12948c2ecf20Sopenharmony_ci 12958c2ecf20Sopenharmony_ci ret = -ENOMEM; 12968c2ecf20Sopenharmony_ci if (!mmget_not_zero(mm)) 12978c2ecf20Sopenharmony_ci goto out; 12988c2ecf20Sopenharmony_ci 12998c2ecf20Sopenharmony_ci mmap_write_lock(mm); 13008c2ecf20Sopenharmony_ci vma = find_vma_prev(mm, start, &prev); 13018c2ecf20Sopenharmony_ci if (!vma) 13028c2ecf20Sopenharmony_ci goto out_unlock; 13038c2ecf20Sopenharmony_ci 13048c2ecf20Sopenharmony_ci /* check that there's at least one vma in the range */ 13058c2ecf20Sopenharmony_ci ret = -EINVAL; 13068c2ecf20Sopenharmony_ci if (vma->vm_start >= end) 13078c2ecf20Sopenharmony_ci goto out_unlock; 13088c2ecf20Sopenharmony_ci 13098c2ecf20Sopenharmony_ci /* 13108c2ecf20Sopenharmony_ci * If the first vma contains huge pages, make sure start address 13118c2ecf20Sopenharmony_ci * is aligned to huge page size. 
13128c2ecf20Sopenharmony_ci */ 13138c2ecf20Sopenharmony_ci if (is_vm_hugetlb_page(vma)) { 13148c2ecf20Sopenharmony_ci unsigned long vma_hpagesize = vma_kernel_pagesize(vma); 13158c2ecf20Sopenharmony_ci 13168c2ecf20Sopenharmony_ci if (start & (vma_hpagesize - 1)) 13178c2ecf20Sopenharmony_ci goto out_unlock; 13188c2ecf20Sopenharmony_ci } 13198c2ecf20Sopenharmony_ci 13208c2ecf20Sopenharmony_ci /* 13218c2ecf20Sopenharmony_ci * Search for not compatible vmas. 13228c2ecf20Sopenharmony_ci */ 13238c2ecf20Sopenharmony_ci found = false; 13248c2ecf20Sopenharmony_ci basic_ioctls = false; 13258c2ecf20Sopenharmony_ci for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { 13268c2ecf20Sopenharmony_ci cond_resched(); 13278c2ecf20Sopenharmony_ci 13288c2ecf20Sopenharmony_ci BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ 13298c2ecf20Sopenharmony_ci !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); 13308c2ecf20Sopenharmony_ci 13318c2ecf20Sopenharmony_ci /* check not compatible vmas */ 13328c2ecf20Sopenharmony_ci ret = -EINVAL; 13338c2ecf20Sopenharmony_ci if (!vma_can_userfault(cur, vm_flags)) 13348c2ecf20Sopenharmony_ci goto out_unlock; 13358c2ecf20Sopenharmony_ci 13368c2ecf20Sopenharmony_ci /* 13378c2ecf20Sopenharmony_ci * UFFDIO_COPY will fill file holes even without 13388c2ecf20Sopenharmony_ci * PROT_WRITE. This check enforces that if this is a 13398c2ecf20Sopenharmony_ci * MAP_SHARED, the process has write permission to the backing 13408c2ecf20Sopenharmony_ci * file. If VM_MAYWRITE is set it also enforces that on a 13418c2ecf20Sopenharmony_ci * MAP_SHARED vma: there is no F_WRITE_SEAL and no further 13428c2ecf20Sopenharmony_ci * F_WRITE_SEAL can be taken until the vma is destroyed. 
13438c2ecf20Sopenharmony_ci */ 13448c2ecf20Sopenharmony_ci ret = -EPERM; 13458c2ecf20Sopenharmony_ci if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) 13468c2ecf20Sopenharmony_ci goto out_unlock; 13478c2ecf20Sopenharmony_ci 13488c2ecf20Sopenharmony_ci /* 13498c2ecf20Sopenharmony_ci * If this is a hugetlb vma that contains the end address of the 13508c2ecf20Sopenharmony_ci * range, the end must be aligned to the huge page size. 13518c2ecf20Sopenharmony_ci */ 13528c2ecf20Sopenharmony_ci if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && 13538c2ecf20Sopenharmony_ci end > cur->vm_start) { 13548c2ecf20Sopenharmony_ci unsigned long vma_hpagesize = vma_kernel_pagesize(cur); 13558c2ecf20Sopenharmony_ci 13568c2ecf20Sopenharmony_ci ret = -EINVAL; 13578c2ecf20Sopenharmony_ci 13588c2ecf20Sopenharmony_ci if (end & (vma_hpagesize - 1)) 13598c2ecf20Sopenharmony_ci goto out_unlock; 13608c2ecf20Sopenharmony_ci } 13618c2ecf20Sopenharmony_ci /* NOTE(review): this test looks unreachable: the VM_MAYWRITE check above has already jumped to out_unlock whenever that bit is clear. */ if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) 13628c2ecf20Sopenharmony_ci goto out_unlock; 13638c2ecf20Sopenharmony_ci 13648c2ecf20Sopenharmony_ci /* 13658c2ecf20Sopenharmony_ci * Check that this vma isn't already owned by a 13668c2ecf20Sopenharmony_ci * different userfaultfd. We can't allow more than one 13678c2ecf20Sopenharmony_ci * userfaultfd to own a single vma simultaneously or we 13688c2ecf20Sopenharmony_ci * wouldn't know which one to deliver the userfaults to. 
13698c2ecf20Sopenharmony_ci */ 13708c2ecf20Sopenharmony_ci ret = -EBUSY; 13718c2ecf20Sopenharmony_ci if (cur->vm_userfaultfd_ctx.ctx && 13728c2ecf20Sopenharmony_ci cur->vm_userfaultfd_ctx.ctx != ctx) 13738c2ecf20Sopenharmony_ci goto out_unlock; 13748c2ecf20Sopenharmony_ci 13758c2ecf20Sopenharmony_ci /* 13768c2ecf20Sopenharmony_ci * Note vmas containing huge pages 13778c2ecf20Sopenharmony_ci */ 13788c2ecf20Sopenharmony_ci if (is_vm_hugetlb_page(cur)) 13798c2ecf20Sopenharmony_ci basic_ioctls = true; 13808c2ecf20Sopenharmony_ci 13818c2ecf20Sopenharmony_ci found = true; 13828c2ecf20Sopenharmony_ci } 13838c2ecf20Sopenharmony_ci BUG_ON(!found); 13848c2ecf20Sopenharmony_ci 13858c2ecf20Sopenharmony_ci if (vma->vm_start < start) 13868c2ecf20Sopenharmony_ci prev = vma; 13878c2ecf20Sopenharmony_ci 13888c2ecf20Sopenharmony_ci ret = 0; 13898c2ecf20Sopenharmony_ci do { 13908c2ecf20Sopenharmony_ci cond_resched(); 13918c2ecf20Sopenharmony_ci 13928c2ecf20Sopenharmony_ci BUG_ON(!vma_can_userfault(vma, vm_flags)); 13938c2ecf20Sopenharmony_ci BUG_ON(vma->vm_userfaultfd_ctx.ctx && 13948c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx.ctx != ctx); 13958c2ecf20Sopenharmony_ci WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); 13968c2ecf20Sopenharmony_ci 13978c2ecf20Sopenharmony_ci /* 13988c2ecf20Sopenharmony_ci * Nothing to do: this vma is already registered into this 13998c2ecf20Sopenharmony_ci * userfaultfd and with the right tracking mode too. 
14008c2ecf20Sopenharmony_ci */ 14018c2ecf20Sopenharmony_ci if (vma->vm_userfaultfd_ctx.ctx == ctx && 14028c2ecf20Sopenharmony_ci (vma->vm_flags & vm_flags) == vm_flags) 14038c2ecf20Sopenharmony_ci goto skip; 14048c2ecf20Sopenharmony_ci 14058c2ecf20Sopenharmony_ci if (vma->vm_start > start) 14068c2ecf20Sopenharmony_ci start = vma->vm_start; 14078c2ecf20Sopenharmony_ci vma_end = min(end, vma->vm_end); 14088c2ecf20Sopenharmony_ci 14098c2ecf20Sopenharmony_ci new_flags = (vma->vm_flags & 14108c2ecf20Sopenharmony_ci ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; 14118c2ecf20Sopenharmony_ci prev = vma_merge(mm, prev, start, vma_end, new_flags, 14128c2ecf20Sopenharmony_ci vma->anon_vma, vma->vm_file, vma->vm_pgoff, 14138c2ecf20Sopenharmony_ci vma_policy(vma), 14148c2ecf20Sopenharmony_ci ((struct vm_userfaultfd_ctx){ ctx }), 14158c2ecf20Sopenharmony_ci anon_vma_name(vma)); 14168c2ecf20Sopenharmony_ci if (prev) { 14178c2ecf20Sopenharmony_ci vma = prev; 14188c2ecf20Sopenharmony_ci goto next; 14198c2ecf20Sopenharmony_ci } 14208c2ecf20Sopenharmony_ci if (vma->vm_start < start) { 14218c2ecf20Sopenharmony_ci ret = split_vma(mm, vma, start, 1); 14228c2ecf20Sopenharmony_ci if (ret) 14238c2ecf20Sopenharmony_ci break; 14248c2ecf20Sopenharmony_ci } 14258c2ecf20Sopenharmony_ci if (vma->vm_end > end) { 14268c2ecf20Sopenharmony_ci ret = split_vma(mm, vma, end, 0); 14278c2ecf20Sopenharmony_ci if (ret) 14288c2ecf20Sopenharmony_ci break; 14298c2ecf20Sopenharmony_ci } 14308c2ecf20Sopenharmony_ci next: 14318c2ecf20Sopenharmony_ci /* 14328c2ecf20Sopenharmony_ci * In the vma_merge() successful mprotect-like case 8: 14338c2ecf20Sopenharmony_ci * the next vma was merged into the current one and 14348c2ecf20Sopenharmony_ci * the current one has not been updated yet. 
14358c2ecf20Sopenharmony_ci */ 14368c2ecf20Sopenharmony_ci vma->vm_flags = new_flags; 14378c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx.ctx = ctx; 14388c2ecf20Sopenharmony_ci 14398c2ecf20Sopenharmony_ci skip: 14408c2ecf20Sopenharmony_ci prev = vma; 14418c2ecf20Sopenharmony_ci start = vma->vm_end; 14428c2ecf20Sopenharmony_ci vma = vma->vm_next; 14438c2ecf20Sopenharmony_ci } while (vma && vma->vm_start < end); 14448c2ecf20Sopenharmony_ciout_unlock: 14458c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 14468c2ecf20Sopenharmony_ci mmput(mm); 14478c2ecf20Sopenharmony_ci if (!ret) { 14488c2ecf20Sopenharmony_ci __u64 ioctls_out; 14498c2ecf20Sopenharmony_ci 14508c2ecf20Sopenharmony_ci ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : 14518c2ecf20Sopenharmony_ci UFFD_API_RANGE_IOCTLS; 14528c2ecf20Sopenharmony_ci 14538c2ecf20Sopenharmony_ci /* 14548c2ecf20Sopenharmony_ci * Declare the WP ioctl only if the WP mode is 14558c2ecf20Sopenharmony_ci * specified and all checks passed with the range 14568c2ecf20Sopenharmony_ci */ 14578c2ecf20Sopenharmony_ci if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) 14588c2ecf20Sopenharmony_ci ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ci /* 14618c2ecf20Sopenharmony_ci * Now that we scanned all vmas we can already tell 14628c2ecf20Sopenharmony_ci * userland which ioctls methods are guaranteed to 14638c2ecf20Sopenharmony_ci * succeed on this range. 
14648c2ecf20Sopenharmony_ci */ 14658c2ecf20Sopenharmony_ci if (put_user(ioctls_out, &user_uffdio_register->ioctls)) 14668c2ecf20Sopenharmony_ci ret = -EFAULT; 14678c2ecf20Sopenharmony_ci } 14688c2ecf20Sopenharmony_ciout: 14698c2ecf20Sopenharmony_ci return ret; 14708c2ecf20Sopenharmony_ci} 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_cistatic int userfaultfd_unregister(struct userfaultfd_ctx *ctx, 14738c2ecf20Sopenharmony_ci unsigned long arg) 14748c2ecf20Sopenharmony_ci{ 14758c2ecf20Sopenharmony_ci struct mm_struct *mm = ctx->mm; 14768c2ecf20Sopenharmony_ci struct vm_area_struct *vma, *prev, *cur; 14778c2ecf20Sopenharmony_ci int ret; 14788c2ecf20Sopenharmony_ci struct uffdio_range uffdio_unregister; 14798c2ecf20Sopenharmony_ci unsigned long new_flags; 14808c2ecf20Sopenharmony_ci bool found; 14818c2ecf20Sopenharmony_ci unsigned long start, end, vma_end; 14828c2ecf20Sopenharmony_ci const void __user *buf = (void __user *)arg; 14838c2ecf20Sopenharmony_ci 14848c2ecf20Sopenharmony_ci ret = -EFAULT; 14858c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) 14868c2ecf20Sopenharmony_ci goto out; 14878c2ecf20Sopenharmony_ci 14888c2ecf20Sopenharmony_ci ret = validate_range(mm, uffdio_unregister.start, 14898c2ecf20Sopenharmony_ci uffdio_unregister.len); 14908c2ecf20Sopenharmony_ci if (ret) 14918c2ecf20Sopenharmony_ci goto out; 14928c2ecf20Sopenharmony_ci 14938c2ecf20Sopenharmony_ci start = uffdio_unregister.start; 14948c2ecf20Sopenharmony_ci end = start + uffdio_unregister.len; 14958c2ecf20Sopenharmony_ci 14968c2ecf20Sopenharmony_ci ret = -ENOMEM; 14978c2ecf20Sopenharmony_ci if (!mmget_not_zero(mm)) 14988c2ecf20Sopenharmony_ci goto out; 14998c2ecf20Sopenharmony_ci 15008c2ecf20Sopenharmony_ci mmap_write_lock(mm); 15018c2ecf20Sopenharmony_ci vma = find_vma_prev(mm, start, &prev); 15028c2ecf20Sopenharmony_ci if (!vma) 15038c2ecf20Sopenharmony_ci goto out_unlock; 15048c2ecf20Sopenharmony_ci 15058c2ecf20Sopenharmony_ci /* check 
that there's at least one vma in the range */ 15068c2ecf20Sopenharmony_ci ret = -EINVAL; 15078c2ecf20Sopenharmony_ci if (vma->vm_start >= end) 15088c2ecf20Sopenharmony_ci goto out_unlock; 15098c2ecf20Sopenharmony_ci 15108c2ecf20Sopenharmony_ci /* 15118c2ecf20Sopenharmony_ci * If the first vma contains huge pages, make sure start address 15128c2ecf20Sopenharmony_ci * is aligned to huge page size. 15138c2ecf20Sopenharmony_ci */ 15148c2ecf20Sopenharmony_ci if (is_vm_hugetlb_page(vma)) { 15158c2ecf20Sopenharmony_ci unsigned long vma_hpagesize = vma_kernel_pagesize(vma); 15168c2ecf20Sopenharmony_ci 15178c2ecf20Sopenharmony_ci if (start & (vma_hpagesize - 1)) 15188c2ecf20Sopenharmony_ci goto out_unlock; 15198c2ecf20Sopenharmony_ci } 15208c2ecf20Sopenharmony_ci 15218c2ecf20Sopenharmony_ci /* 15228c2ecf20Sopenharmony_ci * Search for not compatible vmas. 15238c2ecf20Sopenharmony_ci */ 15248c2ecf20Sopenharmony_ci found = false; 15258c2ecf20Sopenharmony_ci ret = -EINVAL; 15268c2ecf20Sopenharmony_ci for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { 15278c2ecf20Sopenharmony_ci cond_resched(); 15288c2ecf20Sopenharmony_ci 15298c2ecf20Sopenharmony_ci BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ 15308c2ecf20Sopenharmony_ci !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); 15318c2ecf20Sopenharmony_ci 15328c2ecf20Sopenharmony_ci /* 15338c2ecf20Sopenharmony_ci * Check not compatible vmas, not strictly required 15348c2ecf20Sopenharmony_ci * here as not compatible vmas cannot have an 15358c2ecf20Sopenharmony_ci * userfaultfd_ctx registered on them, but this 15368c2ecf20Sopenharmony_ci * provides for more strict behavior to notice 15378c2ecf20Sopenharmony_ci * unregistration errors. 
15388c2ecf20Sopenharmony_ci */ 15398c2ecf20Sopenharmony_ci if (!vma_can_userfault(cur, cur->vm_flags)) 15408c2ecf20Sopenharmony_ci goto out_unlock; 15418c2ecf20Sopenharmony_ci 15428c2ecf20Sopenharmony_ci found = true; 15438c2ecf20Sopenharmony_ci } 15448c2ecf20Sopenharmony_ci BUG_ON(!found); 15458c2ecf20Sopenharmony_ci 15468c2ecf20Sopenharmony_ci if (vma->vm_start < start) 15478c2ecf20Sopenharmony_ci prev = vma; 15488c2ecf20Sopenharmony_ci 15498c2ecf20Sopenharmony_ci ret = 0; 15508c2ecf20Sopenharmony_ci do { 15518c2ecf20Sopenharmony_ci cond_resched(); 15528c2ecf20Sopenharmony_ci 15538c2ecf20Sopenharmony_ci BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); 15548c2ecf20Sopenharmony_ci 15558c2ecf20Sopenharmony_ci /* 15568c2ecf20Sopenharmony_ci * Nothing to do: this vma has no userfaultfd context 15578c2ecf20Sopenharmony_ci * attached, so there is nothing to unregister on it. 15588c2ecf20Sopenharmony_ci */ 15598c2ecf20Sopenharmony_ci if (!vma->vm_userfaultfd_ctx.ctx) 15608c2ecf20Sopenharmony_ci goto skip; 15618c2ecf20Sopenharmony_ci 15628c2ecf20Sopenharmony_ci WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); 15638c2ecf20Sopenharmony_ci 15648c2ecf20Sopenharmony_ci if (vma->vm_start > start) 15658c2ecf20Sopenharmony_ci start = vma->vm_start; 15668c2ecf20Sopenharmony_ci vma_end = min(end, vma->vm_end); 15678c2ecf20Sopenharmony_ci 15688c2ecf20Sopenharmony_ci if (userfaultfd_missing(vma)) { 15698c2ecf20Sopenharmony_ci /* 15708c2ecf20Sopenharmony_ci * Wake any concurrent pending userfault while 15718c2ecf20Sopenharmony_ci * we unregister, so that it does not hang 15728c2ecf20Sopenharmony_ci * permanently and userland does not have to call 15738c2ecf20Sopenharmony_ci * UFFDIO_WAKE explicitly. 
15748c2ecf20Sopenharmony_ci */ 15758c2ecf20Sopenharmony_ci struct userfaultfd_wake_range range; 15768c2ecf20Sopenharmony_ci range.start = start; 15778c2ecf20Sopenharmony_ci range.len = vma_end - start; 15788c2ecf20Sopenharmony_ci wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); 15798c2ecf20Sopenharmony_ci } 15808c2ecf20Sopenharmony_ci 15818c2ecf20Sopenharmony_ci new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); 15828c2ecf20Sopenharmony_ci prev = vma_merge(mm, prev, start, vma_end, new_flags, 15838c2ecf20Sopenharmony_ci vma->anon_vma, vma->vm_file, vma->vm_pgoff, 15848c2ecf20Sopenharmony_ci vma_policy(vma), 15858c2ecf20Sopenharmony_ci NULL_VM_UFFD_CTX, anon_vma_name(vma)); 15868c2ecf20Sopenharmony_ci if (prev) { 15878c2ecf20Sopenharmony_ci vma = prev; 15888c2ecf20Sopenharmony_ci goto next; 15898c2ecf20Sopenharmony_ci } 15908c2ecf20Sopenharmony_ci if (vma->vm_start < start) { 15918c2ecf20Sopenharmony_ci ret = split_vma(mm, vma, start, 1); 15928c2ecf20Sopenharmony_ci if (ret) 15938c2ecf20Sopenharmony_ci break; 15948c2ecf20Sopenharmony_ci } 15958c2ecf20Sopenharmony_ci if (vma->vm_end > end) { 15968c2ecf20Sopenharmony_ci ret = split_vma(mm, vma, end, 0); 15978c2ecf20Sopenharmony_ci if (ret) 15988c2ecf20Sopenharmony_ci break; 15998c2ecf20Sopenharmony_ci } 16008c2ecf20Sopenharmony_ci next: 16018c2ecf20Sopenharmony_ci /* 16028c2ecf20Sopenharmony_ci * In the vma_merge() successful mprotect-like case 8: 16038c2ecf20Sopenharmony_ci * the next vma was merged into the current one and 16048c2ecf20Sopenharmony_ci * the current one has not been updated yet. 
16058c2ecf20Sopenharmony_ci */ 16068c2ecf20Sopenharmony_ci vma->vm_flags = new_flags; 16078c2ecf20Sopenharmony_ci vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 16088c2ecf20Sopenharmony_ci 16098c2ecf20Sopenharmony_ci skip: 16108c2ecf20Sopenharmony_ci prev = vma; 16118c2ecf20Sopenharmony_ci start = vma->vm_end; 16128c2ecf20Sopenharmony_ci vma = vma->vm_next; 16138c2ecf20Sopenharmony_ci } while (vma && vma->vm_start < end); 16148c2ecf20Sopenharmony_ciout_unlock: 16158c2ecf20Sopenharmony_ci mmap_write_unlock(mm); 16168c2ecf20Sopenharmony_ci mmput(mm); 16178c2ecf20Sopenharmony_ciout: 16188c2ecf20Sopenharmony_ci return ret; 16198c2ecf20Sopenharmony_ci} 16208c2ecf20Sopenharmony_ci 16218c2ecf20Sopenharmony_ci/* 16228c2ecf20Sopenharmony_ci * userfaultfd_wake may be used in combination with the 16238c2ecf20Sopenharmony_ci * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. 16248c2ecf20Sopenharmony_ci */ 16258c2ecf20Sopenharmony_cistatic int userfaultfd_wake(struct userfaultfd_ctx *ctx, 16268c2ecf20Sopenharmony_ci unsigned long arg) 16278c2ecf20Sopenharmony_ci{ 16288c2ecf20Sopenharmony_ci int ret; 16298c2ecf20Sopenharmony_ci struct uffdio_range uffdio_wake; 16308c2ecf20Sopenharmony_ci struct userfaultfd_wake_range range; 16318c2ecf20Sopenharmony_ci const void __user *buf = (void __user *)arg; 16328c2ecf20Sopenharmony_ci 16338c2ecf20Sopenharmony_ci ret = -EFAULT; 16348c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) 16358c2ecf20Sopenharmony_ci goto out; 16368c2ecf20Sopenharmony_ci 16378c2ecf20Sopenharmony_ci ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); 16388c2ecf20Sopenharmony_ci if (ret) 16398c2ecf20Sopenharmony_ci goto out; 16408c2ecf20Sopenharmony_ci 16418c2ecf20Sopenharmony_ci range.start = uffdio_wake.start; 16428c2ecf20Sopenharmony_ci range.len = uffdio_wake.len; 16438c2ecf20Sopenharmony_ci 16448c2ecf20Sopenharmony_ci /* 16458c2ecf20Sopenharmony_ci * len == 0 means wake all and we don't want to wake all here, 
16468c2ecf20Sopenharmony_ci * so check it again to be sure. 16478c2ecf20Sopenharmony_ci */ 16488c2ecf20Sopenharmony_ci VM_BUG_ON(!range.len); 16498c2ecf20Sopenharmony_ci 16508c2ecf20Sopenharmony_ci wake_userfault(ctx, &range); 16518c2ecf20Sopenharmony_ci ret = 0; 16528c2ecf20Sopenharmony_ci 16538c2ecf20Sopenharmony_ciout: 16548c2ecf20Sopenharmony_ci return ret; 16558c2ecf20Sopenharmony_ci} 16568c2ecf20Sopenharmony_ci 16578c2ecf20Sopenharmony_cistatic int userfaultfd_copy(struct userfaultfd_ctx *ctx, 16588c2ecf20Sopenharmony_ci unsigned long arg) 16598c2ecf20Sopenharmony_ci{ 16608c2ecf20Sopenharmony_ci __s64 ret; 16618c2ecf20Sopenharmony_ci struct uffdio_copy uffdio_copy; 16628c2ecf20Sopenharmony_ci struct uffdio_copy __user *user_uffdio_copy; 16638c2ecf20Sopenharmony_ci struct userfaultfd_wake_range range; 16648c2ecf20Sopenharmony_ci 16658c2ecf20Sopenharmony_ci user_uffdio_copy = (struct uffdio_copy __user *) arg; 16668c2ecf20Sopenharmony_ci 16678c2ecf20Sopenharmony_ci ret = -EAGAIN; 16688c2ecf20Sopenharmony_ci if (READ_ONCE(ctx->mmap_changing)) 16698c2ecf20Sopenharmony_ci goto out; 16708c2ecf20Sopenharmony_ci 16718c2ecf20Sopenharmony_ci ret = -EFAULT; 16728c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_copy, user_uffdio_copy, 16738c2ecf20Sopenharmony_ci /* don't copy "copy" last field */ 16748c2ecf20Sopenharmony_ci sizeof(uffdio_copy)-sizeof(__s64))) 16758c2ecf20Sopenharmony_ci goto out; 16768c2ecf20Sopenharmony_ci 16778c2ecf20Sopenharmony_ci ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); 16788c2ecf20Sopenharmony_ci if (ret) 16798c2ecf20Sopenharmony_ci goto out; 16808c2ecf20Sopenharmony_ci /* 16818c2ecf20Sopenharmony_ci * double check for wraparound just in case. copy_from_user() 16828c2ecf20Sopenharmony_ci * will later check uffdio_copy.src + uffdio_copy.len to fit 16838c2ecf20Sopenharmony_ci * in the userland range. 
16848c2ecf20Sopenharmony_ci */ 16858c2ecf20Sopenharmony_ci ret = -EINVAL; 16868c2ecf20Sopenharmony_ci if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) 16878c2ecf20Sopenharmony_ci goto out; 16888c2ecf20Sopenharmony_ci if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) 16898c2ecf20Sopenharmony_ci goto out; 16908c2ecf20Sopenharmony_ci if (mmget_not_zero(ctx->mm)) { 16918c2ecf20Sopenharmony_ci ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, 16928c2ecf20Sopenharmony_ci uffdio_copy.len, &ctx->mmap_changing, 16938c2ecf20Sopenharmony_ci uffdio_copy.mode); 16948c2ecf20Sopenharmony_ci mmput(ctx->mm); 16958c2ecf20Sopenharmony_ci } else { 16968c2ecf20Sopenharmony_ci return -ESRCH; 16978c2ecf20Sopenharmony_ci } 16988c2ecf20Sopenharmony_ci if (unlikely(put_user(ret, &user_uffdio_copy->copy))) 16998c2ecf20Sopenharmony_ci return -EFAULT; 17008c2ecf20Sopenharmony_ci if (ret < 0) 17018c2ecf20Sopenharmony_ci goto out; 17028c2ecf20Sopenharmony_ci BUG_ON(!ret); 17038c2ecf20Sopenharmony_ci /* len == 0 would wake all */ 17048c2ecf20Sopenharmony_ci range.len = ret; 17058c2ecf20Sopenharmony_ci if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { 17068c2ecf20Sopenharmony_ci range.start = uffdio_copy.dst; 17078c2ecf20Sopenharmony_ci wake_userfault(ctx, &range); 17088c2ecf20Sopenharmony_ci } 17098c2ecf20Sopenharmony_ci ret = range.len == uffdio_copy.len ? 
0 : -EAGAIN; 17108c2ecf20Sopenharmony_ciout: 17118c2ecf20Sopenharmony_ci return ret; 17128c2ecf20Sopenharmony_ci} 17138c2ecf20Sopenharmony_ci 17148c2ecf20Sopenharmony_cistatic int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, 17158c2ecf20Sopenharmony_ci unsigned long arg) 17168c2ecf20Sopenharmony_ci{ 17178c2ecf20Sopenharmony_ci __s64 ret; 17188c2ecf20Sopenharmony_ci struct uffdio_zeropage uffdio_zeropage; 17198c2ecf20Sopenharmony_ci struct uffdio_zeropage __user *user_uffdio_zeropage; 17208c2ecf20Sopenharmony_ci struct userfaultfd_wake_range range; 17218c2ecf20Sopenharmony_ci 17228c2ecf20Sopenharmony_ci user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; 17238c2ecf20Sopenharmony_ci 17248c2ecf20Sopenharmony_ci ret = -EAGAIN; 17258c2ecf20Sopenharmony_ci if (READ_ONCE(ctx->mmap_changing)) 17268c2ecf20Sopenharmony_ci goto out; 17278c2ecf20Sopenharmony_ci 17288c2ecf20Sopenharmony_ci ret = -EFAULT; 17298c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, 17308c2ecf20Sopenharmony_ci /* don't copy "zeropage" last field */ 17318c2ecf20Sopenharmony_ci sizeof(uffdio_zeropage)-sizeof(__s64))) 17328c2ecf20Sopenharmony_ci goto out; 17338c2ecf20Sopenharmony_ci 17348c2ecf20Sopenharmony_ci ret = validate_range(ctx->mm, uffdio_zeropage.range.start, 17358c2ecf20Sopenharmony_ci uffdio_zeropage.range.len); 17368c2ecf20Sopenharmony_ci if (ret) 17378c2ecf20Sopenharmony_ci goto out; 17388c2ecf20Sopenharmony_ci ret = -EINVAL; 17398c2ecf20Sopenharmony_ci if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) 17408c2ecf20Sopenharmony_ci goto out; 17418c2ecf20Sopenharmony_ci 17428c2ecf20Sopenharmony_ci if (mmget_not_zero(ctx->mm)) { 17438c2ecf20Sopenharmony_ci ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, 17448c2ecf20Sopenharmony_ci uffdio_zeropage.range.len, 17458c2ecf20Sopenharmony_ci &ctx->mmap_changing); 17468c2ecf20Sopenharmony_ci mmput(ctx->mm); 17478c2ecf20Sopenharmony_ci } else { 17488c2ecf20Sopenharmony_ci return -ESRCH; 
17498c2ecf20Sopenharmony_ci } 17508c2ecf20Sopenharmony_ci if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) 17518c2ecf20Sopenharmony_ci return -EFAULT; 17528c2ecf20Sopenharmony_ci if (ret < 0) 17538c2ecf20Sopenharmony_ci goto out; 17548c2ecf20Sopenharmony_ci /* len == 0 would wake all */ 17558c2ecf20Sopenharmony_ci BUG_ON(!ret); 17568c2ecf20Sopenharmony_ci range.len = ret; 17578c2ecf20Sopenharmony_ci if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { 17588c2ecf20Sopenharmony_ci range.start = uffdio_zeropage.range.start; 17598c2ecf20Sopenharmony_ci wake_userfault(ctx, &range); 17608c2ecf20Sopenharmony_ci } 17618c2ecf20Sopenharmony_ci ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; 17628c2ecf20Sopenharmony_ciout: 17638c2ecf20Sopenharmony_ci return ret; 17648c2ecf20Sopenharmony_ci} 17658c2ecf20Sopenharmony_ci 17668c2ecf20Sopenharmony_cistatic int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, 17678c2ecf20Sopenharmony_ci unsigned long arg) 17688c2ecf20Sopenharmony_ci{ 17698c2ecf20Sopenharmony_ci int ret; 17708c2ecf20Sopenharmony_ci struct uffdio_writeprotect uffdio_wp; 17718c2ecf20Sopenharmony_ci struct uffdio_writeprotect __user *user_uffdio_wp; 17728c2ecf20Sopenharmony_ci struct userfaultfd_wake_range range; 17738c2ecf20Sopenharmony_ci bool mode_wp, mode_dontwake; 17748c2ecf20Sopenharmony_ci 17758c2ecf20Sopenharmony_ci if (READ_ONCE(ctx->mmap_changing)) 17768c2ecf20Sopenharmony_ci return -EAGAIN; 17778c2ecf20Sopenharmony_ci 17788c2ecf20Sopenharmony_ci user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; 17798c2ecf20Sopenharmony_ci 17808c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_wp, user_uffdio_wp, 17818c2ecf20Sopenharmony_ci sizeof(struct uffdio_writeprotect))) 17828c2ecf20Sopenharmony_ci return -EFAULT; 17838c2ecf20Sopenharmony_ci 17848c2ecf20Sopenharmony_ci ret = validate_range(ctx->mm, uffdio_wp.range.start, 17858c2ecf20Sopenharmony_ci uffdio_wp.range.len); 17868c2ecf20Sopenharmony_ci if (ret) 
17878c2ecf20Sopenharmony_ci return ret; 17888c2ecf20Sopenharmony_ci 17898c2ecf20Sopenharmony_ci if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | 17908c2ecf20Sopenharmony_ci UFFDIO_WRITEPROTECT_MODE_WP)) 17918c2ecf20Sopenharmony_ci return -EINVAL; 17928c2ecf20Sopenharmony_ci 17938c2ecf20Sopenharmony_ci mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; 17948c2ecf20Sopenharmony_ci mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; 17958c2ecf20Sopenharmony_ci 17968c2ecf20Sopenharmony_ci if (mode_wp && mode_dontwake) 17978c2ecf20Sopenharmony_ci return -EINVAL; 17988c2ecf20Sopenharmony_ci 17998c2ecf20Sopenharmony_ci if (mmget_not_zero(ctx->mm)) { 18008c2ecf20Sopenharmony_ci ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, 18018c2ecf20Sopenharmony_ci uffdio_wp.range.len, mode_wp, 18028c2ecf20Sopenharmony_ci &ctx->mmap_changing); 18038c2ecf20Sopenharmony_ci mmput(ctx->mm); 18048c2ecf20Sopenharmony_ci } else { 18058c2ecf20Sopenharmony_ci return -ESRCH; 18068c2ecf20Sopenharmony_ci } 18078c2ecf20Sopenharmony_ci 18088c2ecf20Sopenharmony_ci if (ret) 18098c2ecf20Sopenharmony_ci return ret; 18108c2ecf20Sopenharmony_ci 18118c2ecf20Sopenharmony_ci if (!mode_wp && !mode_dontwake) { 18128c2ecf20Sopenharmony_ci range.start = uffdio_wp.range.start; 18138c2ecf20Sopenharmony_ci range.len = uffdio_wp.range.len; 18148c2ecf20Sopenharmony_ci wake_userfault(ctx, &range); 18158c2ecf20Sopenharmony_ci } 18168c2ecf20Sopenharmony_ci return ret; 18178c2ecf20Sopenharmony_ci} 18188c2ecf20Sopenharmony_ci 18198c2ecf20Sopenharmony_cistatic inline unsigned int uffd_ctx_features(__u64 user_features) 18208c2ecf20Sopenharmony_ci{ 18218c2ecf20Sopenharmony_ci /* 18228c2ecf20Sopenharmony_ci * For the current set of features the bits just coincide. Set 18238c2ecf20Sopenharmony_ci * UFFD_FEATURE_INITIALIZED to mark the features as enabled. 
18248c2ecf20Sopenharmony_ci */ 18258c2ecf20Sopenharmony_ci return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; 18268c2ecf20Sopenharmony_ci} 18278c2ecf20Sopenharmony_ci 18288c2ecf20Sopenharmony_ci/* 18298c2ecf20Sopenharmony_ci * userland asks for a certain API version and we return which bits 18308c2ecf20Sopenharmony_ci * and ioctl commands are implemented in this kernel for such API 18318c2ecf20Sopenharmony_ci * version or -EINVAL if unknown. 18328c2ecf20Sopenharmony_ci */ 18338c2ecf20Sopenharmony_cistatic int userfaultfd_api(struct userfaultfd_ctx *ctx, 18348c2ecf20Sopenharmony_ci unsigned long arg) 18358c2ecf20Sopenharmony_ci{ 18368c2ecf20Sopenharmony_ci struct uffdio_api uffdio_api; 18378c2ecf20Sopenharmony_ci void __user *buf = (void __user *)arg; 18388c2ecf20Sopenharmony_ci unsigned int ctx_features; 18398c2ecf20Sopenharmony_ci int ret; 18408c2ecf20Sopenharmony_ci __u64 features; 18418c2ecf20Sopenharmony_ci 18428c2ecf20Sopenharmony_ci ret = -EFAULT; 18438c2ecf20Sopenharmony_ci if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) 18448c2ecf20Sopenharmony_ci goto out; 18458c2ecf20Sopenharmony_ci features = uffdio_api.features; 18468c2ecf20Sopenharmony_ci ret = -EINVAL; 18478c2ecf20Sopenharmony_ci if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) 18488c2ecf20Sopenharmony_ci goto err_out; 18498c2ecf20Sopenharmony_ci ret = -EPERM; 18508c2ecf20Sopenharmony_ci if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) 18518c2ecf20Sopenharmony_ci goto err_out; 18528c2ecf20Sopenharmony_ci /* report all available features and ioctls to userland */ 18538c2ecf20Sopenharmony_ci uffdio_api.features = UFFD_API_FEATURES; 18548c2ecf20Sopenharmony_ci uffdio_api.ioctls = UFFD_API_IOCTLS; 18558c2ecf20Sopenharmony_ci ret = -EFAULT; 18568c2ecf20Sopenharmony_ci if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) 18578c2ecf20Sopenharmony_ci goto out; 18588c2ecf20Sopenharmony_ci 18598c2ecf20Sopenharmony_ci /* only enable the requested 
features for this uffd context */ 18608c2ecf20Sopenharmony_ci ctx_features = uffd_ctx_features(features); 18618c2ecf20Sopenharmony_ci ret = -EINVAL; 18628c2ecf20Sopenharmony_ci if (cmpxchg(&ctx->features, 0, ctx_features) != 0) 18638c2ecf20Sopenharmony_ci goto err_out; 18648c2ecf20Sopenharmony_ci 18658c2ecf20Sopenharmony_ci ret = 0; 18668c2ecf20Sopenharmony_ciout: 18678c2ecf20Sopenharmony_ci return ret; 18688c2ecf20Sopenharmony_cierr_out: 18698c2ecf20Sopenharmony_ci memset(&uffdio_api, 0, sizeof(uffdio_api)); 18708c2ecf20Sopenharmony_ci if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) 18718c2ecf20Sopenharmony_ci ret = -EFAULT; 18728c2ecf20Sopenharmony_ci goto out; 18738c2ecf20Sopenharmony_ci} 18748c2ecf20Sopenharmony_ci 18758c2ecf20Sopenharmony_cistatic long userfaultfd_ioctl(struct file *file, unsigned cmd, 18768c2ecf20Sopenharmony_ci unsigned long arg) 18778c2ecf20Sopenharmony_ci{ 18788c2ecf20Sopenharmony_ci int ret = -EINVAL; 18798c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = file->private_data; 18808c2ecf20Sopenharmony_ci 18818c2ecf20Sopenharmony_ci if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) 18828c2ecf20Sopenharmony_ci return -EINVAL; 18838c2ecf20Sopenharmony_ci 18848c2ecf20Sopenharmony_ci switch(cmd) { 18858c2ecf20Sopenharmony_ci case UFFDIO_API: 18868c2ecf20Sopenharmony_ci ret = userfaultfd_api(ctx, arg); 18878c2ecf20Sopenharmony_ci break; 18888c2ecf20Sopenharmony_ci case UFFDIO_REGISTER: 18898c2ecf20Sopenharmony_ci ret = userfaultfd_register(ctx, arg); 18908c2ecf20Sopenharmony_ci break; 18918c2ecf20Sopenharmony_ci case UFFDIO_UNREGISTER: 18928c2ecf20Sopenharmony_ci ret = userfaultfd_unregister(ctx, arg); 18938c2ecf20Sopenharmony_ci break; 18948c2ecf20Sopenharmony_ci case UFFDIO_WAKE: 18958c2ecf20Sopenharmony_ci ret = userfaultfd_wake(ctx, arg); 18968c2ecf20Sopenharmony_ci break; 18978c2ecf20Sopenharmony_ci case UFFDIO_COPY: 18988c2ecf20Sopenharmony_ci ret = userfaultfd_copy(ctx, arg); 18998c2ecf20Sopenharmony_ci break; 
19008c2ecf20Sopenharmony_ci case UFFDIO_ZEROPAGE: 19018c2ecf20Sopenharmony_ci ret = userfaultfd_zeropage(ctx, arg); 19028c2ecf20Sopenharmony_ci break; 19038c2ecf20Sopenharmony_ci case UFFDIO_WRITEPROTECT: 19048c2ecf20Sopenharmony_ci ret = userfaultfd_writeprotect(ctx, arg); 19058c2ecf20Sopenharmony_ci break; 19068c2ecf20Sopenharmony_ci } 19078c2ecf20Sopenharmony_ci return ret; 19088c2ecf20Sopenharmony_ci} 19098c2ecf20Sopenharmony_ci 19108c2ecf20Sopenharmony_ci#ifdef CONFIG_PROC_FS 19118c2ecf20Sopenharmony_cistatic void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) 19128c2ecf20Sopenharmony_ci{ 19138c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = f->private_data; 19148c2ecf20Sopenharmony_ci wait_queue_entry_t *wq; 19158c2ecf20Sopenharmony_ci unsigned long pending = 0, total = 0; 19168c2ecf20Sopenharmony_ci 19178c2ecf20Sopenharmony_ci spin_lock_irq(&ctx->fault_pending_wqh.lock); 19188c2ecf20Sopenharmony_ci list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { 19198c2ecf20Sopenharmony_ci pending++; 19208c2ecf20Sopenharmony_ci total++; 19218c2ecf20Sopenharmony_ci } 19228c2ecf20Sopenharmony_ci list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { 19238c2ecf20Sopenharmony_ci total++; 19248c2ecf20Sopenharmony_ci } 19258c2ecf20Sopenharmony_ci spin_unlock_irq(&ctx->fault_pending_wqh.lock); 19268c2ecf20Sopenharmony_ci 19278c2ecf20Sopenharmony_ci /* 19288c2ecf20Sopenharmony_ci * If more protocols will be added, there will be all shown 19298c2ecf20Sopenharmony_ci * separated by a space. Like this: 19308c2ecf20Sopenharmony_ci * protocols: aa:... bb:... 
19318c2ecf20Sopenharmony_ci */ 19328c2ecf20Sopenharmony_ci seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", 19338c2ecf20Sopenharmony_ci pending, total, UFFD_API, ctx->features, 19348c2ecf20Sopenharmony_ci UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); 19358c2ecf20Sopenharmony_ci} 19368c2ecf20Sopenharmony_ci#endif 19378c2ecf20Sopenharmony_ci 19388c2ecf20Sopenharmony_cistatic const struct file_operations userfaultfd_fops = { 19398c2ecf20Sopenharmony_ci#ifdef CONFIG_PROC_FS 19408c2ecf20Sopenharmony_ci .show_fdinfo = userfaultfd_show_fdinfo, 19418c2ecf20Sopenharmony_ci#endif 19428c2ecf20Sopenharmony_ci .release = userfaultfd_release, 19438c2ecf20Sopenharmony_ci .poll = userfaultfd_poll, 19448c2ecf20Sopenharmony_ci .read = userfaultfd_read, 19458c2ecf20Sopenharmony_ci .unlocked_ioctl = userfaultfd_ioctl, 19468c2ecf20Sopenharmony_ci .compat_ioctl = compat_ptr_ioctl, 19478c2ecf20Sopenharmony_ci .llseek = noop_llseek, 19488c2ecf20Sopenharmony_ci}; 19498c2ecf20Sopenharmony_ci 19508c2ecf20Sopenharmony_cistatic void init_once_userfaultfd_ctx(void *mem) 19518c2ecf20Sopenharmony_ci{ 19528c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; 19538c2ecf20Sopenharmony_ci 19548c2ecf20Sopenharmony_ci init_waitqueue_head(&ctx->fault_pending_wqh); 19558c2ecf20Sopenharmony_ci init_waitqueue_head(&ctx->fault_wqh); 19568c2ecf20Sopenharmony_ci init_waitqueue_head(&ctx->event_wqh); 19578c2ecf20Sopenharmony_ci init_waitqueue_head(&ctx->fd_wqh); 19588c2ecf20Sopenharmony_ci seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); 19598c2ecf20Sopenharmony_ci} 19608c2ecf20Sopenharmony_ci 19618c2ecf20Sopenharmony_ciSYSCALL_DEFINE1(userfaultfd, int, flags) 19628c2ecf20Sopenharmony_ci{ 19638c2ecf20Sopenharmony_ci struct userfaultfd_ctx *ctx; 19648c2ecf20Sopenharmony_ci int fd; 19658c2ecf20Sopenharmony_ci 19668c2ecf20Sopenharmony_ci if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) 19678c2ecf20Sopenharmony_ci return -EPERM; 
19688c2ecf20Sopenharmony_ci 19698c2ecf20Sopenharmony_ci BUG_ON(!current->mm); 19708c2ecf20Sopenharmony_ci 19718c2ecf20Sopenharmony_ci /* Check the UFFD_* constants for consistency. */ 19728c2ecf20Sopenharmony_ci BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); 19738c2ecf20Sopenharmony_ci BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); 19748c2ecf20Sopenharmony_ci 19758c2ecf20Sopenharmony_ci if (flags & ~UFFD_SHARED_FCNTL_FLAGS) 19768c2ecf20Sopenharmony_ci return -EINVAL; 19778c2ecf20Sopenharmony_ci 19788c2ecf20Sopenharmony_ci ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); 19798c2ecf20Sopenharmony_ci if (!ctx) 19808c2ecf20Sopenharmony_ci return -ENOMEM; 19818c2ecf20Sopenharmony_ci 19828c2ecf20Sopenharmony_ci refcount_set(&ctx->refcount, 1); 19838c2ecf20Sopenharmony_ci ctx->flags = flags; 19848c2ecf20Sopenharmony_ci ctx->features = 0; 19858c2ecf20Sopenharmony_ci ctx->released = false; 19868c2ecf20Sopenharmony_ci ctx->mmap_changing = false; 19878c2ecf20Sopenharmony_ci ctx->mm = current->mm; 19888c2ecf20Sopenharmony_ci /* prevent the mm struct to be freed */ 19898c2ecf20Sopenharmony_ci mmgrab(ctx->mm); 19908c2ecf20Sopenharmony_ci 19918c2ecf20Sopenharmony_ci fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, 19928c2ecf20Sopenharmony_ci O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS)); 19938c2ecf20Sopenharmony_ci if (fd < 0) { 19948c2ecf20Sopenharmony_ci mmdrop(ctx->mm); 19958c2ecf20Sopenharmony_ci kmem_cache_free(userfaultfd_ctx_cachep, ctx); 19968c2ecf20Sopenharmony_ci } 19978c2ecf20Sopenharmony_ci return fd; 19988c2ecf20Sopenharmony_ci} 19998c2ecf20Sopenharmony_ci 20008c2ecf20Sopenharmony_cistatic int __init userfaultfd_init(void) 20018c2ecf20Sopenharmony_ci{ 20028c2ecf20Sopenharmony_ci userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", 20038c2ecf20Sopenharmony_ci sizeof(struct userfaultfd_ctx), 20048c2ecf20Sopenharmony_ci 0, 20058c2ecf20Sopenharmony_ci SLAB_HWCACHE_ALIGN|SLAB_PANIC, 20068c2ecf20Sopenharmony_ci init_once_userfaultfd_ctx); 
20078c2ecf20Sopenharmony_ci return 0; 20088c2ecf20Sopenharmony_ci} 20098c2ecf20Sopenharmony_ci__initcall(userfaultfd_init); 2010