// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * fs/eventpoll.c (Efficient event retrieval implementation)
 * Copyright (C) 2001,...,2009 Davide Libenzi
 *
 * Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * Then we also need a global mutex to serialize eventpoll_release_file()
 * and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is also acquired when inserting an epoll fd onto another epoll
 * fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epmutex" (together with "ep->lock") to make it work,
 * but having "ep->mtx" makes the interface more scalable.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" guarantees
 * better scalability.
 */
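/*
 * Illustrative sketch only (not a real call site): the acquire order
 * described above, for a hypothetical path that would need all three
 * levels at once. Real paths almost never need to take epmutex.
 *
 *	mutex_lock(&epmutex);		(1) global mutex, rare paths only
 *	mutex_lock(&ep->mtx);		(2) per-eventpoll mutex, may sleep
 *	write_lock_irq(&ep->lock);	(3) innermost rwlock, IRQ safe
 *	  ... touch ep->rdllist / ep->ovflist ...
 *	write_unlock_irq(&ep->lock);
 *	mutex_unlock(&ep->mtx);
 *	mutex_unlock(&epmutex);
 */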
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/*
 * Structure used to track possible nested calls, for too deep recursions
 * and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as a collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * singly linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a singly linked list that chains all the "struct epitem" that
	 * had their events triggered while we were transferring ready events
	 * to userspace without holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	u64 gen;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
	int res;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &long_zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */
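/*
 * Usage sketch (illustrative, shell as root): the sysctl above appears as
 * /proc/sys/fs/epoll/max_user_watches and can be read or raised like any
 * other sysctl; the default is computed at boot and varies between systems.
 *
 *	# cat /proc/sys/fs/epoll/max_user_watches
 *	# echo 1000000 > /proc/sys/fs/epoll/max_user_watches
 */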
static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
		(p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct epitem *epi)
{
	return !list_empty(&epi->rdllink);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Returns: a value different from zero if ready events are available,
 * or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty_careful(&ep->rdllist) ||
		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
	struct eventpoll *ep = p;

	return ep_events_available(ep) || busy_loop_timeout(start_time);
}

/*
 * Busy poll if busy polling is globally enabled, a supporting socket was
 * found and there are no events; the busy loop returns on need_resched()
 * or when ep_events_available() becomes true.
 *
 * We must do our busy polling with IRQs enabled.
 */
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
	unsigned int napi_id = READ_ONCE(ep->napi_id);

	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
	if (ep->napi_id)
		ep->napi_id = 0;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
	struct eventpoll *ep;
	unsigned int napi_id;
	struct socket *sock;
	struct sock *sk;
	int err;

	if (!net_busy_loop_on())
		return;

	sock = sock_from_file(epi->ffd.file, &err);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);
	ep = epi->ep;

	/*
	 * Non-NAPI IDs can be rejected, and there is nothing to do if we
	 * already have this ID.
	 */
	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
		return;

	/* record NAPI ID for use in next busy poll */
	ep->napi_id = napi_id;
}

#else

static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
}

static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (identified by its cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: Returns the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
			/*
			 * Oops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
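/*
 * Usage sketch (illustrative only; the callback name is hypothetical): a
 * caller bounds its own recursion by funnelling every nested invocation
 * through ep_call_nested() with a per-task context and a per-object cookie,
 * e.g. when walking nested epoll sets via poll_loop_ncalls:
 *
 *	err = ep_call_nested(&poll_loop_ncalls, my_walk_proc, priv,
 *			     cookie, current);
 *	if (err == -1)
 *		... too deep, or the same cookie was re-entered ...
 */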
/*
 * As described in commit 0ccf831cb ("lockdep: annotate epoll"), the use of
 * wait queues used by epoll is done in a very controlled manner. Wake ups
 * can nest inside each other, but are never done with the same locking. For
 * example:
 *
 * dfd = socket(...);
 * efd1 = epoll_create();
 * efd2 = epoll_create();
 * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked about the
 * recursion constraints. Those are: no more than EP_MAX_POLLWAKE_NESTS
 * levels, to avoid stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
			     unsigned pollflags)
{
	struct eventpoll *ep_src;
	unsigned long flags;
	u8 nests = 0;

	/*
	 * To set the subclass or nesting level for spin_lock_irqsave_nested()
	 * it might be natural to create a per-cpu nest count. However, since
	 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
	 * schedule() in the -rt kernel, the per-cpu variables are no longer
	 * protected. Thus, we are introducing a per-eventpoll nest field.
	 * If we are not being called from ep_poll_callback(), epi is NULL and
	 * we are at the first level of nesting, 0. Otherwise, we are being
	 * called from ep_poll_callback() and if a previous wakeup source is
	 * not an epoll file itself, we are at depth 1 since the wakeup source
	 * is depth 0. If the wakeup source is a previous epoll file in the
	 * wakeup chain then we use its nests value and record ours as
	 * nests + 1. The previous epoll file nests value is stable since it is
	 * already holding its own poll_wait.lock.
	 */
	if (epi) {
		if ((is_file_epoll(epi->ffd.file))) {
			ep_src = epi->ffd.file->private_data;
			nests = ep_src->nests;
		} else {
			nests = 1;
		}
	}
	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
	ep->nests = nests + 1;
	wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
	ep->nests = 0;
	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
			     unsigned pollflags)
{
	wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/*
	 * If it is cleared by POLLFREE, it should be rcu-safe.
	 * If we read NULL we need a barrier paired with
	 * smp_store_release() in ep_poll_callback(), otherwise
	 * we rely on whead->lock.
	 */
	whead = smp_load_acquire(&pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
 * ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while (!list_empty(lsthead)) {
		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

		list_del(&pwq->llink);
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}

/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (ws)
		__pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
	return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
 *                      for the scan code to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping without locks are not lost. We cannot
	 * have the poll callback queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in order
			 * to keep it in FIFO order.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
	}

	write_unlock_irq(&ep->lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	return res;
}

static void epi_rcu_free(struct rcu_head *head)
{
	struct epitem *epi = container_of(head, struct epitem, rcu);
	kmem_cache_free(epi_cache, epi);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	struct file *file = epi->ffd.file;

	lockdep_assert_irqs_enabled();

	/*
	 * Removes poll wait queue hooks.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
	 * ep->mtx.
	 * The RCU read side, reverse_path_check_proc(), does not make
	 * use of the rbn field.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}

static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting on this file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(ep, NULL, 0);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->mtx" here because the epoll file
	 * is on its way to being removed and no one has references to it
	 * anymore. The only hit might come from eventpoll_release_file() but
	 * holding "epmutex" is sufficient here.
	 */
	mutex_lock(&epmutex);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Walks through the whole tree by freeing each "struct epitem". At this
	 * point we are sure no poll callbacks will be lingering around, and also by
	 * holding "epmutex" we can be sure that no file cleanup code will hit
	 * us during this operation. So we can avoid the lock on "ep->lock".
	 * We do not need to lock ep->mtx, either, we only do it to prevent
	 * a lockdep warning.
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_free(ep);

	return 0;
}

static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
				    void *priv);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt);

/*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
			     int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events;
	if (!is_file_epoll(epi->ffd.file))
		return vfs_poll(epi->ffd.file, pt) & epi->event.events;

	ep = epi->ffd.file->private_data;
	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
	locked = pt && (pt->_qproc == ep_ptable_queue_proc);

	return ep_scan_ready_list(epi->ffd.file->private_data,
				  ep_read_events_proc, &depth, depth,
				  locked) & epi->event.events;
}

static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
				    void *priv)
{
	struct epitem *epi, *tmp;
	poll_table pt;
	int depth = *(int *)priv;

	init_poll_funcptr(&pt, NULL);
	depth++;

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (ep_item_poll(epi, &pt, depth)) {
			return EPOLLIN | EPOLLRDNORM;
		} else {
			/*
			 * The item has been dropped into the ready list by the
			 * poll callback, but it's not actually ready, as far as
			 * the caller-requested events go. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}

static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;
	int depth = 0;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	return ep_scan_ready_list(ep, ep_read_events_proc,
				  &depth, depth, false);
}

#ifdef CONFIG_PROC_FS
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
		struct inode *inode = file_inode(epi->ffd.file);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
			   " pos:%lli ino:%lx sdev:%x\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data,
			   (long long)epi->ffd.file->f_pos,
			   inode->i_ino, inode->i_sb->s_dev);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
#endif
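/*
 * Example of a resulting /proc/<pid>/fdinfo/<epfd> line produced by
 * ep_show_fdinfo() above (field values are illustrative only):
 *
 *	tfd:        5 events:       19 data:              74  pos:0 ino:1c83 sdev:d
 */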
/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need this facility to correctly clean up files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi, *next;

	/*
	 * We don't want to get "file->f_lock" because it is not
	 * necessary. It is not necessary because we're in the "struct file"
	 * cleanup path, and this means that no one is using this file anymore.
	 * So, for example, epoll_ctl() cannot hit here since if we reach this
	 * point, the file counter already went to zero and fget() would fail.
	 * The only hit might come from ep_free() but holding the mutex
	 * will correctly serialize the operation. We do need to acquire
	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
	 * from anywhere but ep_free().
	 *
	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
	 */
	mutex_lock(&epmutex);
	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
		ep = epi->ep;
		mutex_lock_nested(&ep->mtx, 0);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}
	mutex_unlock(&epmutex);
}

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}

#ifdef CONFIG_KCMP
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
	struct rb_node *rbp;
	struct epitem *epi;

	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (epi->ffd.fd == tfd) {
			if (toff == 0)
				return epi;
			else
				toff--;
		}
		cond_resched();
	}

	return NULL;
}

struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
				     unsigned long toff)
{
	struct file *file_raw;
	struct eventpoll *ep;
	struct epitem *epi;

	if (!is_file_epoll(file))
		return ERR_PTR(-EINVAL);

	ep = file->private_data;

	mutex_lock(&ep->mtx);
	epi = ep_find_tfd(ep, tfd, toff);
	if (epi)
		file_raw = epi->ffd.file;
	else
		file_raw = ERR_PTR(-ENOENT);
	mutex_unlock(&ep->mtx);

	return file_raw;
}
#endif /* CONFIG_KCMP */
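/*
 * Note (informational): get_epoll_tfile_raw_ptr() is not called from this
 * file; it exists for the kcmp(2) KCMP_EPOLL_TFD comparison path, which is
 * why it is guarded by CONFIG_KCMP.
 */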
11208c2ecf20Sopenharmony_ci * 11218c2ecf20Sopenharmony_ci * Beware: it is necessary to prevent any other modifications of the 11228c2ecf20Sopenharmony_ci * existing list until all changes are completed, in other words 11238c2ecf20Sopenharmony_ci * concurrent list_add_tail_lockless() calls should be protected 11248c2ecf20Sopenharmony_ci * with a read lock, where write lock acts as a barrier which 11258c2ecf20Sopenharmony_ci * makes sure all list_add_tail_lockless() calls are fully 11268c2ecf20Sopenharmony_ci * completed. 11278c2ecf20Sopenharmony_ci * 11288c2ecf20Sopenharmony_ci * Also an element can be locklessly added to the list only in one 11298c2ecf20Sopenharmony_ci * direction, i.e. either to the tail or to the head, otherwise 11308c2ecf20Sopenharmony_ci * concurrent access will corrupt the list. 11318c2ecf20Sopenharmony_ci * 11328c2ecf20Sopenharmony_ci * Returns %false if the element has already been added to the list, %true 11338c2ecf20Sopenharmony_ci * otherwise. 11348c2ecf20Sopenharmony_ci */ 11358c2ecf20Sopenharmony_cistatic inline bool list_add_tail_lockless(struct list_head *new, 11368c2ecf20Sopenharmony_ci struct list_head *head) 11378c2ecf20Sopenharmony_ci{ 11388c2ecf20Sopenharmony_ci struct list_head *prev; 11398c2ecf20Sopenharmony_ci 11408c2ecf20Sopenharmony_ci /* 11418c2ecf20Sopenharmony_ci * This is a simple 'new->next = head' operation, but cmpxchg() 11428c2ecf20Sopenharmony_ci * is used in order to detect that the same element has just been 11438c2ecf20Sopenharmony_ci * added to the list from another CPU: the winner observes 11448c2ecf20Sopenharmony_ci * new->next == new. 11458c2ecf20Sopenharmony_ci */ 11468c2ecf20Sopenharmony_ci if (cmpxchg(&new->next, new, head) != new) 11478c2ecf20Sopenharmony_ci return false; 11488c2ecf20Sopenharmony_ci 11498c2ecf20Sopenharmony_ci /* 11508c2ecf20Sopenharmony_ci * Initially ->next of a new element must be updated with the head 11518c2ecf20Sopenharmony_ci * (we are inserting to the tail) and only then pointers are atomically 11528c2ecf20Sopenharmony_ci * exchanged. XCHG guarantees memory ordering, thus ->next should be 11538c2ecf20Sopenharmony_ci * updated before pointers are actually swapped and pointers are 11548c2ecf20Sopenharmony_ci * swapped before prev->next is updated. 11558c2ecf20Sopenharmony_ci */ 11568c2ecf20Sopenharmony_ci 11578c2ecf20Sopenharmony_ci prev = xchg(&head->prev, new); 11588c2ecf20Sopenharmony_ci 11598c2ecf20Sopenharmony_ci /* 11608c2ecf20Sopenharmony_ci * It is safe to modify prev->next and new->prev, because a new element 11618c2ecf20Sopenharmony_ci * is added only to the tail and new->next is updated before XCHG. 11628c2ecf20Sopenharmony_ci */ 11638c2ecf20Sopenharmony_ci 11648c2ecf20Sopenharmony_ci prev->next = new; 11658c2ecf20Sopenharmony_ci new->prev = prev; 11668c2ecf20Sopenharmony_ci 11678c2ecf20Sopenharmony_ci return true; 11688c2ecf20Sopenharmony_ci} 11698c2ecf20Sopenharmony_ci 11708c2ecf20Sopenharmony_ci/** 11718c2ecf20Sopenharmony_ci * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, 11728c2ecf20Sopenharmony_ci * i.e. multiple CPUs are allowed to call this function concurrently. 11738c2ecf20Sopenharmony_ci * 11748c2ecf20Sopenharmony_ci * Returns %false if the epi element has already been chained, %true otherwise.
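 *
 * As a rough interleaving sketch (illustrative only), two CPUs chaining
 * epi A and epi B onto an ovflist that is active but still empty (NULL)
 * would do:
 *
 *	CPU0: A->next = xchg(&ep->ovflist, A);	// A->next = NULL, ovflist = A
 *	CPU1: B->next = xchg(&ep->ovflist, B);	// B->next = A,    ovflist = B
 *
 * leaving the chain B -> A -> NULL, which is later moved back onto the
 * ready list under "mtx" by ep_scan_ready_list().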
11758c2ecf20Sopenharmony_ci */ 11768c2ecf20Sopenharmony_cistatic inline bool chain_epi_lockless(struct epitem *epi) 11778c2ecf20Sopenharmony_ci{ 11788c2ecf20Sopenharmony_ci struct eventpoll *ep = epi->ep; 11798c2ecf20Sopenharmony_ci 11808c2ecf20Sopenharmony_ci /* Fast preliminary check */ 11818c2ecf20Sopenharmony_ci if (epi->next != EP_UNACTIVE_PTR) 11828c2ecf20Sopenharmony_ci return false; 11838c2ecf20Sopenharmony_ci 11848c2ecf20Sopenharmony_ci /* Check that the same epi has not been just chained from another CPU */ 11858c2ecf20Sopenharmony_ci if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) 11868c2ecf20Sopenharmony_ci return false; 11878c2ecf20Sopenharmony_ci 11888c2ecf20Sopenharmony_ci /* Atomically exchange tail */ 11898c2ecf20Sopenharmony_ci epi->next = xchg(&ep->ovflist, epi); 11908c2ecf20Sopenharmony_ci 11918c2ecf20Sopenharmony_ci return true; 11928c2ecf20Sopenharmony_ci} 11938c2ecf20Sopenharmony_ci 11948c2ecf20Sopenharmony_ci/* 11958c2ecf20Sopenharmony_ci * This is the callback that is passed to the wait queue wakeup 11968c2ecf20Sopenharmony_ci * mechanism. It is called by the stored file descriptors when they 11978c2ecf20Sopenharmony_ci * have events to report. 11988c2ecf20Sopenharmony_ci * 11998c2ecf20Sopenharmony_ci * This callback takes a read lock in order not to contend with concurrent 12008c2ecf20Sopenharmony_ci * events from other file descriptors, thus all modifications to ->rdllist 12018c2ecf20Sopenharmony_ci * or ->ovflist are lockless. The read lock is paired with the write lock from 12028c2ecf20Sopenharmony_ci * ep_scan_ready_list(), which stops all list modifications and guarantees 12038c2ecf20Sopenharmony_ci * that the lists' state is seen correctly. 12048c2ecf20Sopenharmony_ci * 12058c2ecf20Sopenharmony_ci * Another thing worth mentioning is that ep_poll_callback() can be called 12068c2ecf20Sopenharmony_ci * concurrently for the same @epi from different CPUs if the poll table was 12078c2ecf20Sopenharmony_ci * initialized with several wait queue entries. Wakeups from different CPUs on a 12088c2ecf20Sopenharmony_ci * single wait queue are serialized by wq.lock, but the case where multiple wait 12098c2ecf20Sopenharmony_ci * queues are used has to be detected explicitly. This is done using a 12108c2ecf20Sopenharmony_ci * cmpxchg() operation. 12118c2ecf20Sopenharmony_ci */ 12128c2ecf20Sopenharmony_cistatic int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 12138c2ecf20Sopenharmony_ci{ 12148c2ecf20Sopenharmony_ci int pwake = 0; 12158c2ecf20Sopenharmony_ci struct epitem *epi = ep_item_from_wait(wait); 12168c2ecf20Sopenharmony_ci struct eventpoll *ep = epi->ep; 12178c2ecf20Sopenharmony_ci __poll_t pollflags = key_to_poll(key); 12188c2ecf20Sopenharmony_ci unsigned long flags; 12198c2ecf20Sopenharmony_ci int ewake = 0; 12208c2ecf20Sopenharmony_ci 12218c2ecf20Sopenharmony_ci read_lock_irqsave(&ep->lock, flags); 12228c2ecf20Sopenharmony_ci 12238c2ecf20Sopenharmony_ci ep_set_busy_poll_napi_id(epi); 12248c2ecf20Sopenharmony_ci 12258c2ecf20Sopenharmony_ci /* 12268c2ecf20Sopenharmony_ci * If the event mask does not contain any poll(2) event, we consider the 12278c2ecf20Sopenharmony_ci * descriptor to be disabled. This condition is likely the effect of the 12288c2ecf20Sopenharmony_ci * EPOLLONESHOT bit that disables the descriptor when an event is received, 12298c2ecf20Sopenharmony_ci * until the next EPOLL_CTL_MOD is issued.
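 *
 * For reference, the user-space pattern behind this check is the usual
 * EPOLLONESHOT cycle (sketch only, variable names are hypothetical):
 *
 *	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT,
 *				  .data.fd = fd };
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
 *	epoll_wait(epfd, evs, maxevents, -1);		// fd reported once
 *	// ... handle the event ...
 *	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);	// re-arm the descriptor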
12308c2ecf20Sopenharmony_ci */ 12318c2ecf20Sopenharmony_ci if (!(epi->event.events & ~EP_PRIVATE_BITS)) 12328c2ecf20Sopenharmony_ci goto out_unlock; 12338c2ecf20Sopenharmony_ci 12348c2ecf20Sopenharmony_ci /* 12358c2ecf20Sopenharmony_ci * Check the events coming with the callback. At this stage, not 12368c2ecf20Sopenharmony_ci * every device reports the events in the "key" parameter of the 12378c2ecf20Sopenharmony_ci * callback. We need to be able to handle both cases here, hence the 12388c2ecf20Sopenharmony_ci * test for "key" != NULL before the event match test. 12398c2ecf20Sopenharmony_ci */ 12408c2ecf20Sopenharmony_ci if (pollflags && !(pollflags & epi->event.events)) 12418c2ecf20Sopenharmony_ci goto out_unlock; 12428c2ecf20Sopenharmony_ci 12438c2ecf20Sopenharmony_ci /* 12448c2ecf20Sopenharmony_ci * If we are transferring events to userspace, we can hold no locks 12458c2ecf20Sopenharmony_ci * (because we're accessing user memory, and because of linux f_op->poll() 12468c2ecf20Sopenharmony_ci * semantics). All the events that happen during that period of time are 12478c2ecf20Sopenharmony_ci * chained in ep->ovflist and requeued later on. 12488c2ecf20Sopenharmony_ci */ 12498c2ecf20Sopenharmony_ci if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { 12508c2ecf20Sopenharmony_ci if (chain_epi_lockless(epi)) 12518c2ecf20Sopenharmony_ci ep_pm_stay_awake_rcu(epi); 12528c2ecf20Sopenharmony_ci } else if (!ep_is_linked(epi)) { 12538c2ecf20Sopenharmony_ci /* In the usual case, add event to ready list. */ 12548c2ecf20Sopenharmony_ci if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) 12558c2ecf20Sopenharmony_ci ep_pm_stay_awake_rcu(epi); 12568c2ecf20Sopenharmony_ci } 12578c2ecf20Sopenharmony_ci 12588c2ecf20Sopenharmony_ci /* 12598c2ecf20Sopenharmony_ci * Wake up ( if active ) both the eventpoll wait list and the ->poll() 12608c2ecf20Sopenharmony_ci * wait list. 
12618c2ecf20Sopenharmony_ci */ 12628c2ecf20Sopenharmony_ci if (waitqueue_active(&ep->wq)) { 12638c2ecf20Sopenharmony_ci if ((epi->event.events & EPOLLEXCLUSIVE) && 12648c2ecf20Sopenharmony_ci !(pollflags & POLLFREE)) { 12658c2ecf20Sopenharmony_ci switch (pollflags & EPOLLINOUT_BITS) { 12668c2ecf20Sopenharmony_ci case EPOLLIN: 12678c2ecf20Sopenharmony_ci if (epi->event.events & EPOLLIN) 12688c2ecf20Sopenharmony_ci ewake = 1; 12698c2ecf20Sopenharmony_ci break; 12708c2ecf20Sopenharmony_ci case EPOLLOUT: 12718c2ecf20Sopenharmony_ci if (epi->event.events & EPOLLOUT) 12728c2ecf20Sopenharmony_ci ewake = 1; 12738c2ecf20Sopenharmony_ci break; 12748c2ecf20Sopenharmony_ci case 0: 12758c2ecf20Sopenharmony_ci ewake = 1; 12768c2ecf20Sopenharmony_ci break; 12778c2ecf20Sopenharmony_ci } 12788c2ecf20Sopenharmony_ci } 12798c2ecf20Sopenharmony_ci wake_up(&ep->wq); 12808c2ecf20Sopenharmony_ci } 12818c2ecf20Sopenharmony_ci if (waitqueue_active(&ep->poll_wait)) 12828c2ecf20Sopenharmony_ci pwake++; 12838c2ecf20Sopenharmony_ci 12848c2ecf20Sopenharmony_ciout_unlock: 12858c2ecf20Sopenharmony_ci read_unlock_irqrestore(&ep->lock, flags); 12868c2ecf20Sopenharmony_ci 12878c2ecf20Sopenharmony_ci /* We have to call this outside the lock */ 12888c2ecf20Sopenharmony_ci if (pwake) 12898c2ecf20Sopenharmony_ci ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); 12908c2ecf20Sopenharmony_ci 12918c2ecf20Sopenharmony_ci if (!(epi->event.events & EPOLLEXCLUSIVE)) 12928c2ecf20Sopenharmony_ci ewake = 1; 12938c2ecf20Sopenharmony_ci 12948c2ecf20Sopenharmony_ci if (pollflags & POLLFREE) { 12958c2ecf20Sopenharmony_ci /* 12968c2ecf20Sopenharmony_ci * If we race with ep_remove_wait_queue() it can miss 12978c2ecf20Sopenharmony_ci * ->whead = NULL and do another remove_wait_queue() after 12988c2ecf20Sopenharmony_ci * us, so we can't use __remove_wait_queue(). 12998c2ecf20Sopenharmony_ci */ 13008c2ecf20Sopenharmony_ci list_del_init(&wait->entry); 13018c2ecf20Sopenharmony_ci /* 13028c2ecf20Sopenharmony_ci * ->whead != NULL protects us from the race with ep_free() 13038c2ecf20Sopenharmony_ci * or ep_remove(), ep_remove_wait_queue() takes whead->lock 13048c2ecf20Sopenharmony_ci * held by the caller. Once we nullify it, nothing protects 13058c2ecf20Sopenharmony_ci * ep/epi or even wait. 13068c2ecf20Sopenharmony_ci */ 13078c2ecf20Sopenharmony_ci smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); 13088c2ecf20Sopenharmony_ci } 13098c2ecf20Sopenharmony_ci 13108c2ecf20Sopenharmony_ci return ewake; 13118c2ecf20Sopenharmony_ci} 13128c2ecf20Sopenharmony_ci 13138c2ecf20Sopenharmony_ci/* 13148c2ecf20Sopenharmony_ci * This is the callback that is used to add our wait queue to the 13158c2ecf20Sopenharmony_ci * target file wakeup lists. 
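 *
 * For illustration only (user-space sketch, not kernel code): registering
 * a descriptor with
 *
 *	struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE,
 *				  .data.fd = fd };
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
 *
 * is what makes this function pick add_wait_queue_exclusive() below, so an
 * event on the target wakes at least one, but not necessarily every, epoll
 * instance sharing that wait queue head.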
13168c2ecf20Sopenharmony_ci */ 13178c2ecf20Sopenharmony_cistatic void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, 13188c2ecf20Sopenharmony_ci poll_table *pt) 13198c2ecf20Sopenharmony_ci{ 13208c2ecf20Sopenharmony_ci struct epitem *epi = ep_item_from_epqueue(pt); 13218c2ecf20Sopenharmony_ci struct eppoll_entry *pwq; 13228c2ecf20Sopenharmony_ci 13238c2ecf20Sopenharmony_ci if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) { 13248c2ecf20Sopenharmony_ci init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); 13258c2ecf20Sopenharmony_ci pwq->whead = whead; 13268c2ecf20Sopenharmony_ci pwq->base = epi; 13278c2ecf20Sopenharmony_ci if (epi->event.events & EPOLLEXCLUSIVE) 13288c2ecf20Sopenharmony_ci add_wait_queue_exclusive(whead, &pwq->wait); 13298c2ecf20Sopenharmony_ci else 13308c2ecf20Sopenharmony_ci add_wait_queue(whead, &pwq->wait); 13318c2ecf20Sopenharmony_ci list_add_tail(&pwq->llink, &epi->pwqlist); 13328c2ecf20Sopenharmony_ci epi->nwait++; 13338c2ecf20Sopenharmony_ci } else { 13348c2ecf20Sopenharmony_ci /* We have to signal that an error occurred */ 13358c2ecf20Sopenharmony_ci epi->nwait = -1; 13368c2ecf20Sopenharmony_ci } 13378c2ecf20Sopenharmony_ci} 13388c2ecf20Sopenharmony_ci 13398c2ecf20Sopenharmony_cistatic void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) 13408c2ecf20Sopenharmony_ci{ 13418c2ecf20Sopenharmony_ci int kcmp; 13428c2ecf20Sopenharmony_ci struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; 13438c2ecf20Sopenharmony_ci struct epitem *epic; 13448c2ecf20Sopenharmony_ci bool leftmost = true; 13458c2ecf20Sopenharmony_ci 13468c2ecf20Sopenharmony_ci while (*p) { 13478c2ecf20Sopenharmony_ci parent = *p; 13488c2ecf20Sopenharmony_ci epic = rb_entry(parent, struct epitem, rbn); 13498c2ecf20Sopenharmony_ci kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); 13508c2ecf20Sopenharmony_ci if (kcmp > 0) { 13518c2ecf20Sopenharmony_ci p = &parent->rb_right; 13528c2ecf20Sopenharmony_ci leftmost = false; 13538c2ecf20Sopenharmony_ci } else 13548c2ecf20Sopenharmony_ci p = &parent->rb_left; 13558c2ecf20Sopenharmony_ci } 13568c2ecf20Sopenharmony_ci rb_link_node(&epi->rbn, parent, p); 13578c2ecf20Sopenharmony_ci rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); 13588c2ecf20Sopenharmony_ci} 13598c2ecf20Sopenharmony_ci 13608c2ecf20Sopenharmony_ci 13618c2ecf20Sopenharmony_ci 13628c2ecf20Sopenharmony_ci#define PATH_ARR_SIZE 5 13638c2ecf20Sopenharmony_ci/* 13648c2ecf20Sopenharmony_ci * These are the number of paths of length 1 to 5 that we allow to emanate 13658c2ecf20Sopenharmony_ci * from a single file of interest. For example, we allow 1000 paths of length 13668c2ecf20Sopenharmony_ci * 1 to emanate from each file of interest. This essentially represents the 13678c2ecf20Sopenharmony_ci * potential wakeup paths, which need to be limited in order to avoid massive 13688c2ecf20Sopenharmony_ci * uncontrolled wakeup storms. The common use case should be a single ep which 13698c2ecf20Sopenharmony_ci * is connected to n file sources. In this case each file source has 1 path 13708c2ecf20Sopenharmony_ci * of length 1. Thus, the numbers below should be more than sufficient. These 13718c2ecf20Sopenharmony_ci * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify 13728c2ecf20Sopenharmony_ci * and delete can't add additional paths. Protected by the epmutex.
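 *
 * A worked example of what counts as a "path" (illustrative only, names
 * are hypothetical):
 *
 *	epoll_ctl(ep1, EPOLL_CTL_ADD, sock, &ev);	// sock -> ep1         (length 1)
 *	epoll_ctl(ep2, EPOLL_CTL_ADD, ep1, &ev);	// sock -> ep1 -> ep2  (length 2)
 *
 * After the second call a single wakeup on "sock" may have to traverse two
 * eventpoll objects; the limits below bound how many such paths any one
 * file may accumulate.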
13738c2ecf20Sopenharmony_ci */ 13748c2ecf20Sopenharmony_cistatic const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; 13758c2ecf20Sopenharmony_cistatic int path_count[PATH_ARR_SIZE]; 13768c2ecf20Sopenharmony_ci 13778c2ecf20Sopenharmony_cistatic int path_count_inc(int nests) 13788c2ecf20Sopenharmony_ci{ 13798c2ecf20Sopenharmony_ci /* Allow an arbitrary number of depth 1 paths */ 13808c2ecf20Sopenharmony_ci if (nests == 0) 13818c2ecf20Sopenharmony_ci return 0; 13828c2ecf20Sopenharmony_ci 13838c2ecf20Sopenharmony_ci if (++path_count[nests] > path_limits[nests]) 13848c2ecf20Sopenharmony_ci return -1; 13858c2ecf20Sopenharmony_ci return 0; 13868c2ecf20Sopenharmony_ci} 13878c2ecf20Sopenharmony_ci 13888c2ecf20Sopenharmony_cistatic void path_count_init(void) 13898c2ecf20Sopenharmony_ci{ 13908c2ecf20Sopenharmony_ci int i; 13918c2ecf20Sopenharmony_ci 13928c2ecf20Sopenharmony_ci for (i = 0; i < PATH_ARR_SIZE; i++) 13938c2ecf20Sopenharmony_ci path_count[i] = 0; 13948c2ecf20Sopenharmony_ci} 13958c2ecf20Sopenharmony_ci 13968c2ecf20Sopenharmony_cistatic int reverse_path_check_proc(void *priv, void *cookie, int call_nests) 13978c2ecf20Sopenharmony_ci{ 13988c2ecf20Sopenharmony_ci int error = 0; 13998c2ecf20Sopenharmony_ci struct file *file = priv; 14008c2ecf20Sopenharmony_ci struct file *child_file; 14018c2ecf20Sopenharmony_ci struct epitem *epi; 14028c2ecf20Sopenharmony_ci 14038c2ecf20Sopenharmony_ci /* CTL_DEL can remove links here, but that can't increase our count */ 14048c2ecf20Sopenharmony_ci rcu_read_lock(); 14058c2ecf20Sopenharmony_ci list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) { 14068c2ecf20Sopenharmony_ci child_file = epi->ep->file; 14078c2ecf20Sopenharmony_ci if (is_file_epoll(child_file)) { 14088c2ecf20Sopenharmony_ci if (list_empty(&child_file->f_ep_links)) { 14098c2ecf20Sopenharmony_ci if (path_count_inc(call_nests)) { 14108c2ecf20Sopenharmony_ci error = -1; 14118c2ecf20Sopenharmony_ci break; 14128c2ecf20Sopenharmony_ci } 14138c2ecf20Sopenharmony_ci } else { 14148c2ecf20Sopenharmony_ci error = ep_call_nested(&poll_loop_ncalls, 14158c2ecf20Sopenharmony_ci reverse_path_check_proc, 14168c2ecf20Sopenharmony_ci child_file, child_file, 14178c2ecf20Sopenharmony_ci current); 14188c2ecf20Sopenharmony_ci } 14198c2ecf20Sopenharmony_ci if (error != 0) 14208c2ecf20Sopenharmony_ci break; 14218c2ecf20Sopenharmony_ci } else { 14228c2ecf20Sopenharmony_ci printk(KERN_ERR "reverse_path_check_proc: " 14238c2ecf20Sopenharmony_ci "file is not an ep!\n"); 14248c2ecf20Sopenharmony_ci } 14258c2ecf20Sopenharmony_ci } 14268c2ecf20Sopenharmony_ci rcu_read_unlock(); 14278c2ecf20Sopenharmony_ci return error; 14288c2ecf20Sopenharmony_ci} 14298c2ecf20Sopenharmony_ci 14308c2ecf20Sopenharmony_ci/** 14318c2ecf20Sopenharmony_ci * reverse_path_check - The tfile_check_list is list of file *, which have 14328c2ecf20Sopenharmony_ci * links that are proposed to be newly added. We need to 14338c2ecf20Sopenharmony_ci * make sure that those added links don't add too many 14348c2ecf20Sopenharmony_ci * paths such that we will spend all our time waking up 14358c2ecf20Sopenharmony_ci * eventpoll objects. 14368c2ecf20Sopenharmony_ci * 14378c2ecf20Sopenharmony_ci * Returns: Returns zero if the proposed links don't create too many paths, 14388c2ecf20Sopenharmony_ci * -1 otherwise. 
14398c2ecf20Sopenharmony_ci */ 14408c2ecf20Sopenharmony_cistatic int reverse_path_check(void) 14418c2ecf20Sopenharmony_ci{ 14428c2ecf20Sopenharmony_ci int error = 0; 14438c2ecf20Sopenharmony_ci struct file *current_file; 14448c2ecf20Sopenharmony_ci 14458c2ecf20Sopenharmony_ci /* let's call this for all tfiles */ 14468c2ecf20Sopenharmony_ci list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { 14478c2ecf20Sopenharmony_ci path_count_init(); 14488c2ecf20Sopenharmony_ci error = ep_call_nested(&poll_loop_ncalls, 14498c2ecf20Sopenharmony_ci reverse_path_check_proc, current_file, 14508c2ecf20Sopenharmony_ci current_file, current); 14518c2ecf20Sopenharmony_ci if (error) 14528c2ecf20Sopenharmony_ci break; 14538c2ecf20Sopenharmony_ci } 14548c2ecf20Sopenharmony_ci return error; 14558c2ecf20Sopenharmony_ci} 14568c2ecf20Sopenharmony_ci 14578c2ecf20Sopenharmony_cistatic int ep_create_wakeup_source(struct epitem *epi) 14588c2ecf20Sopenharmony_ci{ 14598c2ecf20Sopenharmony_ci struct name_snapshot n; 14608c2ecf20Sopenharmony_ci struct wakeup_source *ws; 14618c2ecf20Sopenharmony_ci 14628c2ecf20Sopenharmony_ci if (!epi->ep->ws) { 14638c2ecf20Sopenharmony_ci epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); 14648c2ecf20Sopenharmony_ci if (!epi->ep->ws) 14658c2ecf20Sopenharmony_ci return -ENOMEM; 14668c2ecf20Sopenharmony_ci } 14678c2ecf20Sopenharmony_ci 14688c2ecf20Sopenharmony_ci take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); 14698c2ecf20Sopenharmony_ci ws = wakeup_source_register(NULL, n.name.name); 14708c2ecf20Sopenharmony_ci release_dentry_name_snapshot(&n); 14718c2ecf20Sopenharmony_ci 14728c2ecf20Sopenharmony_ci if (!ws) 14738c2ecf20Sopenharmony_ci return -ENOMEM; 14748c2ecf20Sopenharmony_ci rcu_assign_pointer(epi->ws, ws); 14758c2ecf20Sopenharmony_ci 14768c2ecf20Sopenharmony_ci return 0; 14778c2ecf20Sopenharmony_ci} 14788c2ecf20Sopenharmony_ci 14798c2ecf20Sopenharmony_ci/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ 14808c2ecf20Sopenharmony_cistatic noinline void ep_destroy_wakeup_source(struct epitem *epi) 14818c2ecf20Sopenharmony_ci{ 14828c2ecf20Sopenharmony_ci struct wakeup_source *ws = ep_wakeup_source(epi); 14838c2ecf20Sopenharmony_ci 14848c2ecf20Sopenharmony_ci RCU_INIT_POINTER(epi->ws, NULL); 14858c2ecf20Sopenharmony_ci 14868c2ecf20Sopenharmony_ci /* 14878c2ecf20Sopenharmony_ci * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is 14888c2ecf20Sopenharmony_ci * used internally by wakeup_source_remove, too (called by 14898c2ecf20Sopenharmony_ci * wakeup_source_unregister), so we cannot use call_rcu 14908c2ecf20Sopenharmony_ci */ 14918c2ecf20Sopenharmony_ci synchronize_rcu(); 14928c2ecf20Sopenharmony_ci wakeup_source_unregister(ws); 14938c2ecf20Sopenharmony_ci} 14948c2ecf20Sopenharmony_ci 14958c2ecf20Sopenharmony_ci/* 14968c2ecf20Sopenharmony_ci * Must be called with "mtx" held. 
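 *
 * (Informal call-path sketch, for orientation only: the mutex is taken by
 * the caller, typically
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev)
 *	  -> do_epoll_ctl()		// acquires ep->mtx
 *	       -> ep_find() == NULL	// (file, fd) not registered yet
 *	       -> ep_insert()
 *
 * so "mtx" is already held when we get here.)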
14978c2ecf20Sopenharmony_ci */ 14988c2ecf20Sopenharmony_cistatic int ep_insert(struct eventpoll *ep, const struct epoll_event *event, 14998c2ecf20Sopenharmony_ci struct file *tfile, int fd, int full_check) 15008c2ecf20Sopenharmony_ci{ 15018c2ecf20Sopenharmony_ci int error, pwake = 0; 15028c2ecf20Sopenharmony_ci __poll_t revents; 15038c2ecf20Sopenharmony_ci long user_watches; 15048c2ecf20Sopenharmony_ci struct epitem *epi; 15058c2ecf20Sopenharmony_ci struct ep_pqueue epq; 15068c2ecf20Sopenharmony_ci 15078c2ecf20Sopenharmony_ci lockdep_assert_irqs_enabled(); 15088c2ecf20Sopenharmony_ci 15098c2ecf20Sopenharmony_ci user_watches = atomic_long_read(&ep->user->epoll_watches); 15108c2ecf20Sopenharmony_ci if (unlikely(user_watches >= max_user_watches)) 15118c2ecf20Sopenharmony_ci return -ENOSPC; 15128c2ecf20Sopenharmony_ci if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) 15138c2ecf20Sopenharmony_ci return -ENOMEM; 15148c2ecf20Sopenharmony_ci 15158c2ecf20Sopenharmony_ci /* Item initialization follow here ... */ 15168c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&epi->rdllink); 15178c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&epi->fllink); 15188c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&epi->pwqlist); 15198c2ecf20Sopenharmony_ci epi->ep = ep; 15208c2ecf20Sopenharmony_ci ep_set_ffd(&epi->ffd, tfile, fd); 15218c2ecf20Sopenharmony_ci epi->event = *event; 15228c2ecf20Sopenharmony_ci epi->nwait = 0; 15238c2ecf20Sopenharmony_ci epi->next = EP_UNACTIVE_PTR; 15248c2ecf20Sopenharmony_ci if (epi->event.events & EPOLLWAKEUP) { 15258c2ecf20Sopenharmony_ci error = ep_create_wakeup_source(epi); 15268c2ecf20Sopenharmony_ci if (error) 15278c2ecf20Sopenharmony_ci goto error_create_wakeup_source; 15288c2ecf20Sopenharmony_ci } else { 15298c2ecf20Sopenharmony_ci RCU_INIT_POINTER(epi->ws, NULL); 15308c2ecf20Sopenharmony_ci } 15318c2ecf20Sopenharmony_ci 15328c2ecf20Sopenharmony_ci /* Add the current item to the list of active epoll hook for this file */ 15338c2ecf20Sopenharmony_ci spin_lock(&tfile->f_lock); 15348c2ecf20Sopenharmony_ci list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); 15358c2ecf20Sopenharmony_ci spin_unlock(&tfile->f_lock); 15368c2ecf20Sopenharmony_ci 15378c2ecf20Sopenharmony_ci /* 15388c2ecf20Sopenharmony_ci * Add the current item to the RB tree. All RB tree operations are 15398c2ecf20Sopenharmony_ci * protected by "mtx", and ep_insert() is called with "mtx" held. 15408c2ecf20Sopenharmony_ci */ 15418c2ecf20Sopenharmony_ci ep_rbtree_insert(ep, epi); 15428c2ecf20Sopenharmony_ci 15438c2ecf20Sopenharmony_ci /* now check if we've created too many backpaths */ 15448c2ecf20Sopenharmony_ci error = -EINVAL; 15458c2ecf20Sopenharmony_ci if (full_check && reverse_path_check()) 15468c2ecf20Sopenharmony_ci goto error_remove_epi; 15478c2ecf20Sopenharmony_ci 15488c2ecf20Sopenharmony_ci /* Initialize the poll table using the queue callback */ 15498c2ecf20Sopenharmony_ci epq.epi = epi; 15508c2ecf20Sopenharmony_ci init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); 15518c2ecf20Sopenharmony_ci 15528c2ecf20Sopenharmony_ci /* 15538c2ecf20Sopenharmony_ci * Attach the item to the poll hooks and get current event bits. 15548c2ecf20Sopenharmony_ci * We can safely use the file* here because its usage count has 15558c2ecf20Sopenharmony_ci * been increased by the caller of this function. Note that after 15568c2ecf20Sopenharmony_ci * this operation completes, the poll callback can start hitting 15578c2ecf20Sopenharmony_ci * the new item. 
15588c2ecf20Sopenharmony_ci */ 15598c2ecf20Sopenharmony_ci revents = ep_item_poll(epi, &epq.pt, 1); 15608c2ecf20Sopenharmony_ci 15618c2ecf20Sopenharmony_ci /* 15628c2ecf20Sopenharmony_ci * We have to check if something went wrong during the poll wait queue 15638c2ecf20Sopenharmony_ci * install process. Namely an allocation for a wait queue failed due 15648c2ecf20Sopenharmony_ci * high memory pressure. 15658c2ecf20Sopenharmony_ci */ 15668c2ecf20Sopenharmony_ci error = -ENOMEM; 15678c2ecf20Sopenharmony_ci if (epi->nwait < 0) 15688c2ecf20Sopenharmony_ci goto error_unregister; 15698c2ecf20Sopenharmony_ci 15708c2ecf20Sopenharmony_ci /* We have to drop the new item inside our item list to keep track of it */ 15718c2ecf20Sopenharmony_ci write_lock_irq(&ep->lock); 15728c2ecf20Sopenharmony_ci 15738c2ecf20Sopenharmony_ci /* record NAPI ID of new item if present */ 15748c2ecf20Sopenharmony_ci ep_set_busy_poll_napi_id(epi); 15758c2ecf20Sopenharmony_ci 15768c2ecf20Sopenharmony_ci /* If the file is already "ready" we drop it inside the ready list */ 15778c2ecf20Sopenharmony_ci if (revents && !ep_is_linked(epi)) { 15788c2ecf20Sopenharmony_ci list_add_tail(&epi->rdllink, &ep->rdllist); 15798c2ecf20Sopenharmony_ci ep_pm_stay_awake(epi); 15808c2ecf20Sopenharmony_ci 15818c2ecf20Sopenharmony_ci /* Notify waiting tasks that events are available */ 15828c2ecf20Sopenharmony_ci if (waitqueue_active(&ep->wq)) 15838c2ecf20Sopenharmony_ci wake_up(&ep->wq); 15848c2ecf20Sopenharmony_ci if (waitqueue_active(&ep->poll_wait)) 15858c2ecf20Sopenharmony_ci pwake++; 15868c2ecf20Sopenharmony_ci } 15878c2ecf20Sopenharmony_ci 15888c2ecf20Sopenharmony_ci write_unlock_irq(&ep->lock); 15898c2ecf20Sopenharmony_ci 15908c2ecf20Sopenharmony_ci atomic_long_inc(&ep->user->epoll_watches); 15918c2ecf20Sopenharmony_ci 15928c2ecf20Sopenharmony_ci /* We have to call this outside the lock */ 15938c2ecf20Sopenharmony_ci if (pwake) 15948c2ecf20Sopenharmony_ci ep_poll_safewake(ep, NULL, 0); 15958c2ecf20Sopenharmony_ci 15968c2ecf20Sopenharmony_ci return 0; 15978c2ecf20Sopenharmony_ci 15988c2ecf20Sopenharmony_cierror_unregister: 15998c2ecf20Sopenharmony_ci ep_unregister_pollwait(ep, epi); 16008c2ecf20Sopenharmony_cierror_remove_epi: 16018c2ecf20Sopenharmony_ci spin_lock(&tfile->f_lock); 16028c2ecf20Sopenharmony_ci list_del_rcu(&epi->fllink); 16038c2ecf20Sopenharmony_ci spin_unlock(&tfile->f_lock); 16048c2ecf20Sopenharmony_ci 16058c2ecf20Sopenharmony_ci rb_erase_cached(&epi->rbn, &ep->rbr); 16068c2ecf20Sopenharmony_ci 16078c2ecf20Sopenharmony_ci /* 16088c2ecf20Sopenharmony_ci * We need to do this because an event could have been arrived on some 16098c2ecf20Sopenharmony_ci * allocated wait queue. Note that we don't care about the ep->ovflist 16108c2ecf20Sopenharmony_ci * list, since that is used/cleaned only inside a section bound by "mtx". 16118c2ecf20Sopenharmony_ci * And ep_insert() is called with "mtx" held. 
16128c2ecf20Sopenharmony_ci */ 16138c2ecf20Sopenharmony_ci write_lock_irq(&ep->lock); 16148c2ecf20Sopenharmony_ci if (ep_is_linked(epi)) 16158c2ecf20Sopenharmony_ci list_del_init(&epi->rdllink); 16168c2ecf20Sopenharmony_ci write_unlock_irq(&ep->lock); 16178c2ecf20Sopenharmony_ci 16188c2ecf20Sopenharmony_ci wakeup_source_unregister(ep_wakeup_source(epi)); 16198c2ecf20Sopenharmony_ci 16208c2ecf20Sopenharmony_cierror_create_wakeup_source: 16218c2ecf20Sopenharmony_ci kmem_cache_free(epi_cache, epi); 16228c2ecf20Sopenharmony_ci 16238c2ecf20Sopenharmony_ci return error; 16248c2ecf20Sopenharmony_ci} 16258c2ecf20Sopenharmony_ci 16268c2ecf20Sopenharmony_ci/* 16278c2ecf20Sopenharmony_ci * Modify the interest event mask by dropping an event if the new mask 16288c2ecf20Sopenharmony_ci * has a match in the current file status. Must be called with "mtx" held. 16298c2ecf20Sopenharmony_ci */ 16308c2ecf20Sopenharmony_cistatic int ep_modify(struct eventpoll *ep, struct epitem *epi, 16318c2ecf20Sopenharmony_ci const struct epoll_event *event) 16328c2ecf20Sopenharmony_ci{ 16338c2ecf20Sopenharmony_ci int pwake = 0; 16348c2ecf20Sopenharmony_ci poll_table pt; 16358c2ecf20Sopenharmony_ci 16368c2ecf20Sopenharmony_ci lockdep_assert_irqs_enabled(); 16378c2ecf20Sopenharmony_ci 16388c2ecf20Sopenharmony_ci init_poll_funcptr(&pt, NULL); 16398c2ecf20Sopenharmony_ci 16408c2ecf20Sopenharmony_ci /* 16418c2ecf20Sopenharmony_ci * Set the new event interest mask before calling f_op->poll(); 16428c2ecf20Sopenharmony_ci * otherwise we might miss an event that happens between the 16438c2ecf20Sopenharmony_ci * f_op->poll() call and the new event set registering. 16448c2ecf20Sopenharmony_ci */ 16458c2ecf20Sopenharmony_ci epi->event.events = event->events; /* need barrier below */ 16468c2ecf20Sopenharmony_ci epi->event.data = event->data; /* protected by mtx */ 16478c2ecf20Sopenharmony_ci if (epi->event.events & EPOLLWAKEUP) { 16488c2ecf20Sopenharmony_ci if (!ep_has_wakeup_source(epi)) 16498c2ecf20Sopenharmony_ci ep_create_wakeup_source(epi); 16508c2ecf20Sopenharmony_ci } else if (ep_has_wakeup_source(epi)) { 16518c2ecf20Sopenharmony_ci ep_destroy_wakeup_source(epi); 16528c2ecf20Sopenharmony_ci } 16538c2ecf20Sopenharmony_ci 16548c2ecf20Sopenharmony_ci /* 16558c2ecf20Sopenharmony_ci * The following barrier has two effects: 16568c2ecf20Sopenharmony_ci * 16578c2ecf20Sopenharmony_ci * 1) Flush epi changes above to other CPUs. This ensures 16588c2ecf20Sopenharmony_ci * we do not miss events from ep_poll_callback if an 16598c2ecf20Sopenharmony_ci * event occurs immediately after we call f_op->poll(). 16608c2ecf20Sopenharmony_ci * We need this because we did not take ep->lock while 16618c2ecf20Sopenharmony_ci * changing epi above (but ep_poll_callback does take 16628c2ecf20Sopenharmony_ci * ep->lock). 16638c2ecf20Sopenharmony_ci * 16648c2ecf20Sopenharmony_ci * 2) We also need to ensure we do not miss _past_ events 16658c2ecf20Sopenharmony_ci * when calling f_op->poll(). This barrier also 16668c2ecf20Sopenharmony_ci * pairs with the barrier in wq_has_sleeper (see 16678c2ecf20Sopenharmony_ci * comments for wq_has_sleeper). 16688c2ecf20Sopenharmony_ci * 16698c2ecf20Sopenharmony_ci * This barrier will now guarantee ep_poll_callback or f_op->poll 16708c2ecf20Sopenharmony_ci * (or both) will notice the readiness of an item. 16718c2ecf20Sopenharmony_ci */ 16728c2ecf20Sopenharmony_ci smp_mb(); 16738c2ecf20Sopenharmony_ci 16748c2ecf20Sopenharmony_ci /* 16758c2ecf20Sopenharmony_ci * Get current event bits. 
We can safely use the file* here because 16768c2ecf20Sopenharmony_ci * its usage count has been increased by the caller of this function. 16778c2ecf20Sopenharmony_ci * If the item is "hot" and it is not registered inside the ready 16788c2ecf20Sopenharmony_ci * list, push it inside. 16798c2ecf20Sopenharmony_ci */ 16808c2ecf20Sopenharmony_ci if (ep_item_poll(epi, &pt, 1)) { 16818c2ecf20Sopenharmony_ci write_lock_irq(&ep->lock); 16828c2ecf20Sopenharmony_ci if (!ep_is_linked(epi)) { 16838c2ecf20Sopenharmony_ci list_add_tail(&epi->rdllink, &ep->rdllist); 16848c2ecf20Sopenharmony_ci ep_pm_stay_awake(epi); 16858c2ecf20Sopenharmony_ci 16868c2ecf20Sopenharmony_ci /* Notify waiting tasks that events are available */ 16878c2ecf20Sopenharmony_ci if (waitqueue_active(&ep->wq)) 16888c2ecf20Sopenharmony_ci wake_up(&ep->wq); 16898c2ecf20Sopenharmony_ci if (waitqueue_active(&ep->poll_wait)) 16908c2ecf20Sopenharmony_ci pwake++; 16918c2ecf20Sopenharmony_ci } 16928c2ecf20Sopenharmony_ci write_unlock_irq(&ep->lock); 16938c2ecf20Sopenharmony_ci } 16948c2ecf20Sopenharmony_ci 16958c2ecf20Sopenharmony_ci /* We have to call this outside the lock */ 16968c2ecf20Sopenharmony_ci if (pwake) 16978c2ecf20Sopenharmony_ci ep_poll_safewake(ep, NULL, 0); 16988c2ecf20Sopenharmony_ci 16998c2ecf20Sopenharmony_ci return 0; 17008c2ecf20Sopenharmony_ci} 17018c2ecf20Sopenharmony_ci 17028c2ecf20Sopenharmony_cistatic __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head, 17038c2ecf20Sopenharmony_ci void *priv) 17048c2ecf20Sopenharmony_ci{ 17058c2ecf20Sopenharmony_ci struct ep_send_events_data *esed = priv; 17068c2ecf20Sopenharmony_ci __poll_t revents; 17078c2ecf20Sopenharmony_ci struct epitem *epi, *tmp; 17088c2ecf20Sopenharmony_ci struct epoll_event __user *uevent = esed->events; 17098c2ecf20Sopenharmony_ci struct wakeup_source *ws; 17108c2ecf20Sopenharmony_ci poll_table pt; 17118c2ecf20Sopenharmony_ci 17128c2ecf20Sopenharmony_ci init_poll_funcptr(&pt, NULL); 17138c2ecf20Sopenharmony_ci esed->res = 0; 17148c2ecf20Sopenharmony_ci 17158c2ecf20Sopenharmony_ci /* 17168c2ecf20Sopenharmony_ci * We can loop without lock because we are passed a task private list. 17178c2ecf20Sopenharmony_ci * Items cannot vanish during the loop because ep_scan_ready_list() is 17188c2ecf20Sopenharmony_ci * holding "mtx" during this call. 17198c2ecf20Sopenharmony_ci */ 17208c2ecf20Sopenharmony_ci lockdep_assert_held(&ep->mtx); 17218c2ecf20Sopenharmony_ci 17228c2ecf20Sopenharmony_ci list_for_each_entry_safe(epi, tmp, head, rdllink) { 17238c2ecf20Sopenharmony_ci if (esed->res >= esed->maxevents) 17248c2ecf20Sopenharmony_ci break; 17258c2ecf20Sopenharmony_ci 17268c2ecf20Sopenharmony_ci /* 17278c2ecf20Sopenharmony_ci * Activate ep->ws before deactivating epi->ws to prevent 17288c2ecf20Sopenharmony_ci * triggering auto-suspend here (in case we reactive epi->ws 17298c2ecf20Sopenharmony_ci * below). 17308c2ecf20Sopenharmony_ci * 17318c2ecf20Sopenharmony_ci * This could be rearranged to delay the deactivation of epi->ws 17328c2ecf20Sopenharmony_ci * instead, but then epi->ws would temporarily be out of sync 17338c2ecf20Sopenharmony_ci * with ep_is_linked(). 
17348c2ecf20Sopenharmony_ci */ 17358c2ecf20Sopenharmony_ci ws = ep_wakeup_source(epi); 17368c2ecf20Sopenharmony_ci if (ws) { 17378c2ecf20Sopenharmony_ci if (ws->active) 17388c2ecf20Sopenharmony_ci __pm_stay_awake(ep->ws); 17398c2ecf20Sopenharmony_ci __pm_relax(ws); 17408c2ecf20Sopenharmony_ci } 17418c2ecf20Sopenharmony_ci 17428c2ecf20Sopenharmony_ci list_del_init(&epi->rdllink); 17438c2ecf20Sopenharmony_ci 17448c2ecf20Sopenharmony_ci /* 17458c2ecf20Sopenharmony_ci * If the event mask intersect the caller-requested one, 17468c2ecf20Sopenharmony_ci * deliver the event to userspace. Again, ep_scan_ready_list() 17478c2ecf20Sopenharmony_ci * is holding ep->mtx, so no operations coming from userspace 17488c2ecf20Sopenharmony_ci * can change the item. 17498c2ecf20Sopenharmony_ci */ 17508c2ecf20Sopenharmony_ci revents = ep_item_poll(epi, &pt, 1); 17518c2ecf20Sopenharmony_ci if (!revents) 17528c2ecf20Sopenharmony_ci continue; 17538c2ecf20Sopenharmony_ci 17548c2ecf20Sopenharmony_ci if (__put_user(revents, &uevent->events) || 17558c2ecf20Sopenharmony_ci __put_user(epi->event.data, &uevent->data)) { 17568c2ecf20Sopenharmony_ci list_add(&epi->rdllink, head); 17578c2ecf20Sopenharmony_ci ep_pm_stay_awake(epi); 17588c2ecf20Sopenharmony_ci if (!esed->res) 17598c2ecf20Sopenharmony_ci esed->res = -EFAULT; 17608c2ecf20Sopenharmony_ci return 0; 17618c2ecf20Sopenharmony_ci } 17628c2ecf20Sopenharmony_ci esed->res++; 17638c2ecf20Sopenharmony_ci uevent++; 17648c2ecf20Sopenharmony_ci if (epi->event.events & EPOLLONESHOT) 17658c2ecf20Sopenharmony_ci epi->event.events &= EP_PRIVATE_BITS; 17668c2ecf20Sopenharmony_ci else if (!(epi->event.events & EPOLLET)) { 17678c2ecf20Sopenharmony_ci /* 17688c2ecf20Sopenharmony_ci * If this file has been added with Level 17698c2ecf20Sopenharmony_ci * Trigger mode, we need to insert back inside 17708c2ecf20Sopenharmony_ci * the ready list, so that the next call to 17718c2ecf20Sopenharmony_ci * epoll_wait() will check again the events 17728c2ecf20Sopenharmony_ci * availability. At this point, no one can insert 17738c2ecf20Sopenharmony_ci * into ep->rdllist besides us. The epoll_ctl() 17748c2ecf20Sopenharmony_ci * callers are locked out by 17758c2ecf20Sopenharmony_ci * ep_scan_ready_list() holding "mtx" and the 17768c2ecf20Sopenharmony_ci * poll callback will queue them in ep->ovflist. 
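 *
 * A user-space sketch of the behaviour this branch implements
 * (illustrative only):
 *
 *	ev.events = EPOLLIN;		// level triggered (default):
 *					// reported again while data is pending
 *	ev.events = EPOLLIN | EPOLLET;	// edge triggered: reported once per
 *					// new event; the caller drains the fd,
 *					// e.g. read() until EAGAIN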
17778c2ecf20Sopenharmony_ci */ 17788c2ecf20Sopenharmony_ci list_add_tail(&epi->rdllink, &ep->rdllist); 17798c2ecf20Sopenharmony_ci ep_pm_stay_awake(epi); 17808c2ecf20Sopenharmony_ci } 17818c2ecf20Sopenharmony_ci } 17828c2ecf20Sopenharmony_ci 17838c2ecf20Sopenharmony_ci return 0; 17848c2ecf20Sopenharmony_ci} 17858c2ecf20Sopenharmony_ci 17868c2ecf20Sopenharmony_cistatic int ep_send_events(struct eventpoll *ep, 17878c2ecf20Sopenharmony_ci struct epoll_event __user *events, int maxevents) 17888c2ecf20Sopenharmony_ci{ 17898c2ecf20Sopenharmony_ci struct ep_send_events_data esed; 17908c2ecf20Sopenharmony_ci 17918c2ecf20Sopenharmony_ci esed.maxevents = maxevents; 17928c2ecf20Sopenharmony_ci esed.events = events; 17938c2ecf20Sopenharmony_ci 17948c2ecf20Sopenharmony_ci ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); 17958c2ecf20Sopenharmony_ci return esed.res; 17968c2ecf20Sopenharmony_ci} 17978c2ecf20Sopenharmony_ci 17988c2ecf20Sopenharmony_cistatic inline struct timespec64 ep_set_mstimeout(long ms) 17998c2ecf20Sopenharmony_ci{ 18008c2ecf20Sopenharmony_ci struct timespec64 now, ts = { 18018c2ecf20Sopenharmony_ci .tv_sec = ms / MSEC_PER_SEC, 18028c2ecf20Sopenharmony_ci .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), 18038c2ecf20Sopenharmony_ci }; 18048c2ecf20Sopenharmony_ci 18058c2ecf20Sopenharmony_ci ktime_get_ts64(&now); 18068c2ecf20Sopenharmony_ci return timespec64_add_safe(now, ts); 18078c2ecf20Sopenharmony_ci} 18088c2ecf20Sopenharmony_ci 18098c2ecf20Sopenharmony_ci/* 18108c2ecf20Sopenharmony_ci * autoremove_wake_function, but remove even on failure to wake up, because we 18118c2ecf20Sopenharmony_ci * know that default_wake_function/ttwu will only fail if the thread is already 18128c2ecf20Sopenharmony_ci * woken, and in that case the ep_poll loop will remove the entry anyways, not 18138c2ecf20Sopenharmony_ci * try to reuse it. 18148c2ecf20Sopenharmony_ci */ 18158c2ecf20Sopenharmony_cistatic int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, 18168c2ecf20Sopenharmony_ci unsigned int mode, int sync, void *key) 18178c2ecf20Sopenharmony_ci{ 18188c2ecf20Sopenharmony_ci int ret = default_wake_function(wq_entry, mode, sync, key); 18198c2ecf20Sopenharmony_ci 18208c2ecf20Sopenharmony_ci /* 18218c2ecf20Sopenharmony_ci * Pairs with list_empty_careful in ep_poll, and ensures future loop 18228c2ecf20Sopenharmony_ci * iterations see the cause of this wakeup. 18238c2ecf20Sopenharmony_ci */ 18248c2ecf20Sopenharmony_ci list_del_init_careful(&wq_entry->entry); 18258c2ecf20Sopenharmony_ci return ret; 18268c2ecf20Sopenharmony_ci} 18278c2ecf20Sopenharmony_ci 18288c2ecf20Sopenharmony_ci/** 18298c2ecf20Sopenharmony_ci * ep_poll - Retrieves ready events, and delivers them to the caller supplied 18308c2ecf20Sopenharmony_ci * event buffer. 18318c2ecf20Sopenharmony_ci * 18328c2ecf20Sopenharmony_ci * @ep: Pointer to the eventpoll context. 18338c2ecf20Sopenharmony_ci * @events: Pointer to the userspace buffer where the ready events should be 18348c2ecf20Sopenharmony_ci * stored. 18358c2ecf20Sopenharmony_ci * @maxevents: Size (in terms of number of events) of the caller event buffer. 18368c2ecf20Sopenharmony_ci * @timeout: Maximum timeout for the ready events fetch operation, in 18378c2ecf20Sopenharmony_ci * milliseconds. If the @timeout is zero, the function will not block, 18388c2ecf20Sopenharmony_ci * while if the @timeout is less than zero, the function will block 18398c2ecf20Sopenharmony_ci * until at least one event has been retrieved (or an error 18408c2ecf20Sopenharmony_ci * occurred). 
18418c2ecf20Sopenharmony_ci * 18428c2ecf20Sopenharmony_ci * Returns: Returns the number of ready events which have been fetched, or an 18438c2ecf20Sopenharmony_ci * error code, in case of error. 18448c2ecf20Sopenharmony_ci */ 18458c2ecf20Sopenharmony_cistatic int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 18468c2ecf20Sopenharmony_ci int maxevents, long timeout) 18478c2ecf20Sopenharmony_ci{ 18488c2ecf20Sopenharmony_ci int res = 0, eavail, timed_out = 0; 18498c2ecf20Sopenharmony_ci u64 slack = 0; 18508c2ecf20Sopenharmony_ci wait_queue_entry_t wait; 18518c2ecf20Sopenharmony_ci ktime_t expires, *to = NULL; 18528c2ecf20Sopenharmony_ci 18538c2ecf20Sopenharmony_ci lockdep_assert_irqs_enabled(); 18548c2ecf20Sopenharmony_ci 18558c2ecf20Sopenharmony_ci if (timeout > 0) { 18568c2ecf20Sopenharmony_ci struct timespec64 end_time = ep_set_mstimeout(timeout); 18578c2ecf20Sopenharmony_ci 18588c2ecf20Sopenharmony_ci slack = select_estimate_accuracy(&end_time); 18598c2ecf20Sopenharmony_ci to = &expires; 18608c2ecf20Sopenharmony_ci *to = timespec64_to_ktime(end_time); 18618c2ecf20Sopenharmony_ci } else if (timeout == 0) { 18628c2ecf20Sopenharmony_ci /* 18638c2ecf20Sopenharmony_ci * Avoid the unnecessary trip to the wait queue loop, if the 18648c2ecf20Sopenharmony_ci * caller specified a non blocking operation. We still need 18658c2ecf20Sopenharmony_ci * lock because we could race and not see an epi being added 18668c2ecf20Sopenharmony_ci * to the ready list while in irq callback. Thus incorrectly 18678c2ecf20Sopenharmony_ci * returning 0 back to userspace. 18688c2ecf20Sopenharmony_ci */ 18698c2ecf20Sopenharmony_ci timed_out = 1; 18708c2ecf20Sopenharmony_ci 18718c2ecf20Sopenharmony_ci write_lock_irq(&ep->lock); 18728c2ecf20Sopenharmony_ci eavail = ep_events_available(ep); 18738c2ecf20Sopenharmony_ci write_unlock_irq(&ep->lock); 18748c2ecf20Sopenharmony_ci 18758c2ecf20Sopenharmony_ci goto send_events; 18768c2ecf20Sopenharmony_ci } 18778c2ecf20Sopenharmony_ci 18788c2ecf20Sopenharmony_cifetch_events: 18798c2ecf20Sopenharmony_ci 18808c2ecf20Sopenharmony_ci if (!ep_events_available(ep)) 18818c2ecf20Sopenharmony_ci ep_busy_loop(ep, timed_out); 18828c2ecf20Sopenharmony_ci 18838c2ecf20Sopenharmony_ci eavail = ep_events_available(ep); 18848c2ecf20Sopenharmony_ci if (eavail) 18858c2ecf20Sopenharmony_ci goto send_events; 18868c2ecf20Sopenharmony_ci 18878c2ecf20Sopenharmony_ci /* 18888c2ecf20Sopenharmony_ci * Busy poll timed out. Drop NAPI ID for now, we can add 18898c2ecf20Sopenharmony_ci * it back in when we have moved a socket with a valid NAPI 18908c2ecf20Sopenharmony_ci * ID onto the ready list. 18918c2ecf20Sopenharmony_ci */ 18928c2ecf20Sopenharmony_ci ep_reset_busy_poll_napi_id(ep); 18938c2ecf20Sopenharmony_ci 18948c2ecf20Sopenharmony_ci do { 18958c2ecf20Sopenharmony_ci /* 18968c2ecf20Sopenharmony_ci * Internally init_wait() uses autoremove_wake_function(), 18978c2ecf20Sopenharmony_ci * thus wait entry is removed from the wait queue on each 18988c2ecf20Sopenharmony_ci * wakeup. Why it is important? In case of several waiters 18998c2ecf20Sopenharmony_ci * each new wakeup will hit the next waiter, giving it the 19008c2ecf20Sopenharmony_ci * chance to harvest new event. Otherwise wakeup can be 19018c2ecf20Sopenharmony_ci * lost. This is also good performance-wise, because on 19028c2ecf20Sopenharmony_ci * normal wakeup path no need to call __remove_wait_queue() 19038c2ecf20Sopenharmony_ci * explicitly, thus ep->lock is not taken, which halts the 19048c2ecf20Sopenharmony_ci * event delivery. 
19058c2ecf20Sopenharmony_ci * 19068c2ecf20Sopenharmony_ci * In fact, we now use an even more aggressive function that 19078c2ecf20Sopenharmony_ci * unconditionally removes, because we don't reuse the wait 19088c2ecf20Sopenharmony_ci * entry between loop iterations. This lets us also avoid the 19098c2ecf20Sopenharmony_ci * performance issue if a process is killed, causing all of its 19108c2ecf20Sopenharmony_ci * threads to wake up without being removed normally. 19118c2ecf20Sopenharmony_ci */ 19128c2ecf20Sopenharmony_ci init_wait(&wait); 19138c2ecf20Sopenharmony_ci wait.func = ep_autoremove_wake_function; 19148c2ecf20Sopenharmony_ci 19158c2ecf20Sopenharmony_ci write_lock_irq(&ep->lock); 19168c2ecf20Sopenharmony_ci /* 19178c2ecf20Sopenharmony_ci * Barrierless variant, waitqueue_active() is called under 19188c2ecf20Sopenharmony_ci * the same lock on wakeup ep_poll_callback() side, so it 19198c2ecf20Sopenharmony_ci * is safe to avoid an explicit barrier. 19208c2ecf20Sopenharmony_ci */ 19218c2ecf20Sopenharmony_ci __set_current_state(TASK_INTERRUPTIBLE); 19228c2ecf20Sopenharmony_ci 19238c2ecf20Sopenharmony_ci /* 19248c2ecf20Sopenharmony_ci * Do the final check under the lock. ep_scan_ready_list() 19258c2ecf20Sopenharmony_ci * plays with two lists (->rdllist and ->ovflist) and there 19268c2ecf20Sopenharmony_ci * is always a race when both lists are empty for short 19278c2ecf20Sopenharmony_ci * period of time although events are pending, so lock is 19288c2ecf20Sopenharmony_ci * important. 19298c2ecf20Sopenharmony_ci */ 19308c2ecf20Sopenharmony_ci eavail = ep_events_available(ep); 19318c2ecf20Sopenharmony_ci if (!eavail) { 19328c2ecf20Sopenharmony_ci if (signal_pending(current)) 19338c2ecf20Sopenharmony_ci res = -EINTR; 19348c2ecf20Sopenharmony_ci else 19358c2ecf20Sopenharmony_ci __add_wait_queue_exclusive(&ep->wq, &wait); 19368c2ecf20Sopenharmony_ci } 19378c2ecf20Sopenharmony_ci write_unlock_irq(&ep->lock); 19388c2ecf20Sopenharmony_ci 19398c2ecf20Sopenharmony_ci if (!eavail && !res) 19408c2ecf20Sopenharmony_ci timed_out = !schedule_hrtimeout_range(to, slack, 19418c2ecf20Sopenharmony_ci HRTIMER_MODE_ABS); 19428c2ecf20Sopenharmony_ci 19438c2ecf20Sopenharmony_ci /* 19448c2ecf20Sopenharmony_ci * We were woken up, thus go and try to harvest some events. 19458c2ecf20Sopenharmony_ci * If timed out and still on the wait queue, recheck eavail 19468c2ecf20Sopenharmony_ci * carefully under lock, below. 19478c2ecf20Sopenharmony_ci */ 19488c2ecf20Sopenharmony_ci eavail = 1; 19498c2ecf20Sopenharmony_ci } while (0); 19508c2ecf20Sopenharmony_ci 19518c2ecf20Sopenharmony_ci __set_current_state(TASK_RUNNING); 19528c2ecf20Sopenharmony_ci 19538c2ecf20Sopenharmony_ci if (!list_empty_careful(&wait.entry)) { 19548c2ecf20Sopenharmony_ci write_lock_irq(&ep->lock); 19558c2ecf20Sopenharmony_ci /* 19568c2ecf20Sopenharmony_ci * If the thread timed out and is not on the wait queue, it 19578c2ecf20Sopenharmony_ci * means that the thread was woken up after its timeout expired 19588c2ecf20Sopenharmony_ci * before it could reacquire the lock. Thus, when wait.entry is 19598c2ecf20Sopenharmony_ci * empty, it needs to harvest events. 
19608c2ecf20Sopenharmony_ci */ 19618c2ecf20Sopenharmony_ci if (timed_out) 19628c2ecf20Sopenharmony_ci eavail = list_empty(&wait.entry); 19638c2ecf20Sopenharmony_ci __remove_wait_queue(&ep->wq, &wait); 19648c2ecf20Sopenharmony_ci write_unlock_irq(&ep->lock); 19658c2ecf20Sopenharmony_ci } 19668c2ecf20Sopenharmony_ci 19678c2ecf20Sopenharmony_cisend_events: 19688c2ecf20Sopenharmony_ci if (fatal_signal_pending(current)) { 19698c2ecf20Sopenharmony_ci /* 19708c2ecf20Sopenharmony_ci * Always short-circuit for fatal signals to allow 19718c2ecf20Sopenharmony_ci * threads to make a timely exit without the chance of 19728c2ecf20Sopenharmony_ci * finding more events available and fetching 19738c2ecf20Sopenharmony_ci * repeatedly. 19748c2ecf20Sopenharmony_ci */ 19758c2ecf20Sopenharmony_ci res = -EINTR; 19768c2ecf20Sopenharmony_ci } 19778c2ecf20Sopenharmony_ci /* 19788c2ecf20Sopenharmony_ci * Try to transfer events to user space. In case we get 0 events and 19798c2ecf20Sopenharmony_ci * there's still timeout left over, we go trying again in search of 19808c2ecf20Sopenharmony_ci * more luck. 19818c2ecf20Sopenharmony_ci */ 19828c2ecf20Sopenharmony_ci if (!res && eavail && 19838c2ecf20Sopenharmony_ci !(res = ep_send_events(ep, events, maxevents)) && !timed_out) 19848c2ecf20Sopenharmony_ci goto fetch_events; 19858c2ecf20Sopenharmony_ci 19868c2ecf20Sopenharmony_ci return res; 19878c2ecf20Sopenharmony_ci} 19888c2ecf20Sopenharmony_ci 19898c2ecf20Sopenharmony_ci/** 19908c2ecf20Sopenharmony_ci * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() 19918c2ecf20Sopenharmony_ci * API, to verify that adding an epoll file inside another 19928c2ecf20Sopenharmony_ci * epoll structure does not violate the constraints, in 19938c2ecf20Sopenharmony_ci * terms of closed loops or too-deep chains (which can 19948c2ecf20Sopenharmony_ci * result in excessive stack usage). 19958c2ecf20Sopenharmony_ci * 19968c2ecf20Sopenharmony_ci * @priv: Pointer to the epoll file currently being checked. 19978c2ecf20Sopenharmony_ci * @cookie: Original cookie for this call. This is the top-of-the-chain epoll 19988c2ecf20Sopenharmony_ci * data structure pointer. 19998c2ecf20Sopenharmony_ci * @call_nests: Current depth of the @ep_call_nested() call stack. 20008c2ecf20Sopenharmony_ci * 20018c2ecf20Sopenharmony_ci * Returns: Returns zero if adding the epoll @file inside the current epoll 20028c2ecf20Sopenharmony_ci * structure @ep does not violate the constraints, or -1 otherwise.
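 *
 * The user-visible failure this machinery produces looks like the following
 * sketch (illustrative only):
 *
 *	int a = epoll_create1(0), b = epoll_create1(0);
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = b };
 *
 *	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);	// ok: b is nested inside a
 *	ev.data.fd = a;
 *	epoll_ctl(b, EPOLL_CTL_ADD, a, &ev);	// rejected with -1 / ELOOP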
20038c2ecf20Sopenharmony_ci */ 20048c2ecf20Sopenharmony_cistatic int ep_loop_check_proc(void *priv, void *cookie, int call_nests) 20058c2ecf20Sopenharmony_ci{ 20068c2ecf20Sopenharmony_ci int error = 0; 20078c2ecf20Sopenharmony_ci struct file *file = priv; 20088c2ecf20Sopenharmony_ci struct eventpoll *ep = file->private_data; 20098c2ecf20Sopenharmony_ci struct eventpoll *ep_tovisit; 20108c2ecf20Sopenharmony_ci struct rb_node *rbp; 20118c2ecf20Sopenharmony_ci struct epitem *epi; 20128c2ecf20Sopenharmony_ci 20138c2ecf20Sopenharmony_ci mutex_lock_nested(&ep->mtx, call_nests + 1); 20148c2ecf20Sopenharmony_ci ep->gen = loop_check_gen; 20158c2ecf20Sopenharmony_ci for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { 20168c2ecf20Sopenharmony_ci epi = rb_entry(rbp, struct epitem, rbn); 20178c2ecf20Sopenharmony_ci if (unlikely(is_file_epoll(epi->ffd.file))) { 20188c2ecf20Sopenharmony_ci ep_tovisit = epi->ffd.file->private_data; 20198c2ecf20Sopenharmony_ci if (ep_tovisit->gen == loop_check_gen) 20208c2ecf20Sopenharmony_ci continue; 20218c2ecf20Sopenharmony_ci error = ep_call_nested(&poll_loop_ncalls, 20228c2ecf20Sopenharmony_ci ep_loop_check_proc, epi->ffd.file, 20238c2ecf20Sopenharmony_ci ep_tovisit, current); 20248c2ecf20Sopenharmony_ci if (error != 0) 20258c2ecf20Sopenharmony_ci break; 20268c2ecf20Sopenharmony_ci } else { 20278c2ecf20Sopenharmony_ci /* 20288c2ecf20Sopenharmony_ci * If we've reached a file that is not associated with 20298c2ecf20Sopenharmony_ci * an ep, then we need to check if the newly added 20308c2ecf20Sopenharmony_ci * links are going to add too many wakeup paths. We do 20318c2ecf20Sopenharmony_ci * this by adding it to the tfile_check_list, if it's 20328c2ecf20Sopenharmony_ci * not already there, and calling reverse_path_check() 20338c2ecf20Sopenharmony_ci * during ep_insert(). 20348c2ecf20Sopenharmony_ci */ 20358c2ecf20Sopenharmony_ci if (list_empty(&epi->ffd.file->f_tfile_llink)) { 20368c2ecf20Sopenharmony_ci if (get_file_rcu(epi->ffd.file)) 20378c2ecf20Sopenharmony_ci list_add(&epi->ffd.file->f_tfile_llink, 20388c2ecf20Sopenharmony_ci &tfile_check_list); 20398c2ecf20Sopenharmony_ci } 20408c2ecf20Sopenharmony_ci } 20418c2ecf20Sopenharmony_ci } 20428c2ecf20Sopenharmony_ci mutex_unlock(&ep->mtx); 20438c2ecf20Sopenharmony_ci 20448c2ecf20Sopenharmony_ci return error; 20458c2ecf20Sopenharmony_ci} 20468c2ecf20Sopenharmony_ci 20478c2ecf20Sopenharmony_ci/** 20488c2ecf20Sopenharmony_ci * ep_loop_check - Performs a check to verify that adding an epoll file (@file) 20498c2ecf20Sopenharmony_ci * another epoll file (represented by @ep) does not create 20508c2ecf20Sopenharmony_ci * closed loops or too deep chains. 20518c2ecf20Sopenharmony_ci * 20528c2ecf20Sopenharmony_ci * @ep: Pointer to the epoll private data structure. 20538c2ecf20Sopenharmony_ci * @file: Pointer to the epoll file to be checked. 20548c2ecf20Sopenharmony_ci * 20558c2ecf20Sopenharmony_ci * Returns: Returns zero if adding the epoll @file inside current epoll 20568c2ecf20Sopenharmony_ci * structure @ep does not violate the constraints, or -1 otherwise. 
20578c2ecf20Sopenharmony_ci */ 20588c2ecf20Sopenharmony_cistatic int ep_loop_check(struct eventpoll *ep, struct file *file) 20598c2ecf20Sopenharmony_ci{ 20608c2ecf20Sopenharmony_ci return ep_call_nested(&poll_loop_ncalls, 20618c2ecf20Sopenharmony_ci ep_loop_check_proc, file, ep, current); 20628c2ecf20Sopenharmony_ci} 20638c2ecf20Sopenharmony_ci 20648c2ecf20Sopenharmony_cistatic void clear_tfile_check_list(void) 20658c2ecf20Sopenharmony_ci{ 20668c2ecf20Sopenharmony_ci struct file *file; 20678c2ecf20Sopenharmony_ci 20688c2ecf20Sopenharmony_ci /* first clear the tfile_check_list */ 20698c2ecf20Sopenharmony_ci while (!list_empty(&tfile_check_list)) { 20708c2ecf20Sopenharmony_ci file = list_first_entry(&tfile_check_list, struct file, 20718c2ecf20Sopenharmony_ci f_tfile_llink); 20728c2ecf20Sopenharmony_ci list_del_init(&file->f_tfile_llink); 20738c2ecf20Sopenharmony_ci fput(file); 20748c2ecf20Sopenharmony_ci } 20758c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&tfile_check_list); 20768c2ecf20Sopenharmony_ci} 20778c2ecf20Sopenharmony_ci 20788c2ecf20Sopenharmony_ci/* 20798c2ecf20Sopenharmony_ci * Open an eventpoll file descriptor. 20808c2ecf20Sopenharmony_ci */ 20818c2ecf20Sopenharmony_cistatic int do_epoll_create(int flags) 20828c2ecf20Sopenharmony_ci{ 20838c2ecf20Sopenharmony_ci int error, fd; 20848c2ecf20Sopenharmony_ci struct eventpoll *ep = NULL; 20858c2ecf20Sopenharmony_ci struct file *file; 20868c2ecf20Sopenharmony_ci 20878c2ecf20Sopenharmony_ci /* Check the EPOLL_* constant for consistency. */ 20888c2ecf20Sopenharmony_ci BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 20898c2ecf20Sopenharmony_ci 20908c2ecf20Sopenharmony_ci if (flags & ~EPOLL_CLOEXEC) 20918c2ecf20Sopenharmony_ci return -EINVAL; 20928c2ecf20Sopenharmony_ci /* 20938c2ecf20Sopenharmony_ci * Create the internal data structure ("struct eventpoll"). 20948c2ecf20Sopenharmony_ci */ 20958c2ecf20Sopenharmony_ci error = ep_alloc(&ep); 20968c2ecf20Sopenharmony_ci if (error < 0) 20978c2ecf20Sopenharmony_ci return error; 20988c2ecf20Sopenharmony_ci /* 20998c2ecf20Sopenharmony_ci * Creates all the items needed to setup an eventpoll file. That is, 21008c2ecf20Sopenharmony_ci * a file structure and a free file descriptor. 
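 *
 * From user space this is simply (sketch, names are illustrative):
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
 *	if (epfd < 0)
 *		perror("epoll_create1");
 *	// ... epoll_ctl()/epoll_wait() on epfd ...
 *	close(epfd);	// releases the file; ep_free() runs once the last
 *			// reference is gone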
21018c2ecf20Sopenharmony_ci */ 21028c2ecf20Sopenharmony_ci fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); 21038c2ecf20Sopenharmony_ci if (fd < 0) { 21048c2ecf20Sopenharmony_ci error = fd; 21058c2ecf20Sopenharmony_ci goto out_free_ep; 21068c2ecf20Sopenharmony_ci } 21078c2ecf20Sopenharmony_ci file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, 21088c2ecf20Sopenharmony_ci O_RDWR | (flags & O_CLOEXEC)); 21098c2ecf20Sopenharmony_ci if (IS_ERR(file)) { 21108c2ecf20Sopenharmony_ci error = PTR_ERR(file); 21118c2ecf20Sopenharmony_ci goto out_free_fd; 21128c2ecf20Sopenharmony_ci } 21138c2ecf20Sopenharmony_ci ep->file = file; 21148c2ecf20Sopenharmony_ci fd_install(fd, file); 21158c2ecf20Sopenharmony_ci return fd; 21168c2ecf20Sopenharmony_ci 21178c2ecf20Sopenharmony_ciout_free_fd: 21188c2ecf20Sopenharmony_ci put_unused_fd(fd); 21198c2ecf20Sopenharmony_ciout_free_ep: 21208c2ecf20Sopenharmony_ci ep_free(ep); 21218c2ecf20Sopenharmony_ci return error; 21228c2ecf20Sopenharmony_ci} 21238c2ecf20Sopenharmony_ci 21248c2ecf20Sopenharmony_ciSYSCALL_DEFINE1(epoll_create1, int, flags) 21258c2ecf20Sopenharmony_ci{ 21268c2ecf20Sopenharmony_ci return do_epoll_create(flags); 21278c2ecf20Sopenharmony_ci} 21288c2ecf20Sopenharmony_ci 21298c2ecf20Sopenharmony_ciSYSCALL_DEFINE1(epoll_create, int, size) 21308c2ecf20Sopenharmony_ci{ 21318c2ecf20Sopenharmony_ci if (size <= 0) 21328c2ecf20Sopenharmony_ci return -EINVAL; 21338c2ecf20Sopenharmony_ci 21348c2ecf20Sopenharmony_ci return do_epoll_create(0); 21358c2ecf20Sopenharmony_ci} 21368c2ecf20Sopenharmony_ci 21378c2ecf20Sopenharmony_cistatic inline int epoll_mutex_lock(struct mutex *mutex, int depth, 21388c2ecf20Sopenharmony_ci bool nonblock) 21398c2ecf20Sopenharmony_ci{ 21408c2ecf20Sopenharmony_ci if (!nonblock) { 21418c2ecf20Sopenharmony_ci mutex_lock_nested(mutex, depth); 21428c2ecf20Sopenharmony_ci return 0; 21438c2ecf20Sopenharmony_ci } 21448c2ecf20Sopenharmony_ci if (mutex_trylock(mutex)) 21458c2ecf20Sopenharmony_ci return 0; 21468c2ecf20Sopenharmony_ci return -EAGAIN; 21478c2ecf20Sopenharmony_ci} 21488c2ecf20Sopenharmony_ci 21498c2ecf20Sopenharmony_ciint do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, 21508c2ecf20Sopenharmony_ci bool nonblock) 21518c2ecf20Sopenharmony_ci{ 21528c2ecf20Sopenharmony_ci int error; 21538c2ecf20Sopenharmony_ci int full_check = 0; 21548c2ecf20Sopenharmony_ci struct fd f, tf; 21558c2ecf20Sopenharmony_ci struct eventpoll *ep; 21568c2ecf20Sopenharmony_ci struct epitem *epi; 21578c2ecf20Sopenharmony_ci struct eventpoll *tep = NULL; 21588c2ecf20Sopenharmony_ci 21598c2ecf20Sopenharmony_ci error = -EBADF; 21608c2ecf20Sopenharmony_ci f = fdget(epfd); 21618c2ecf20Sopenharmony_ci if (!f.file) 21628c2ecf20Sopenharmony_ci goto error_return; 21638c2ecf20Sopenharmony_ci 21648c2ecf20Sopenharmony_ci /* Get the "struct file *" for the target file */ 21658c2ecf20Sopenharmony_ci tf = fdget(fd); 21668c2ecf20Sopenharmony_ci if (!tf.file) 21678c2ecf20Sopenharmony_ci goto error_fput; 21688c2ecf20Sopenharmony_ci 21698c2ecf20Sopenharmony_ci /* The target file descriptor must support poll */ 21708c2ecf20Sopenharmony_ci error = -EPERM; 21718c2ecf20Sopenharmony_ci if (!file_can_poll(tf.file)) 21728c2ecf20Sopenharmony_ci goto error_tgt_fput; 21738c2ecf20Sopenharmony_ci 21748c2ecf20Sopenharmony_ci /* Check if EPOLLWAKEUP is allowed */ 21758c2ecf20Sopenharmony_ci if (ep_op_has_event(op)) 21768c2ecf20Sopenharmony_ci ep_take_care_of_epollwakeup(epds); 21778c2ecf20Sopenharmony_ci 21788c2ecf20Sopenharmony_ci /* 21798c2ecf20Sopenharmony_ci * We 

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (!list_empty(&f.file->f_ep_links) ||
		    ep->gen == loop_check_gen ||
		    is_file_epoll(tf.file)) {
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epmutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0)
					goto error_tgt_fput;
			} else {
				get_file(tf.file);
				list_add(&tf.file->f_tfile_llink,
					 &tfile_check_list);
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
				if (error) {
					mutex_unlock(&ep->mtx);
					goto error_tgt_fput;
				}
			}
		}
	}
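
	/*
	 * Summary of the userspace-visible error semantics of this function
	 * (drawn from the checks above and the switch below): EPERM if the
	 * target file does not support poll, ELOOP if an EPOLL_CTL_ADD would
	 * create a cycle of epoll file descriptors, EEXIST for an ADD of a
	 * file/fd pair already in the interest set, ENOENT for a MOD/DEL of
	 * one that is not, and EINVAL for the remaining misuses (e.g.
	 * EPOLLEXCLUSIVE on EPOLL_CTL_MOD).
	 */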

	/*
	 * Try to look up the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure to be able to use the item looked up by
	 * ep_find() till we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epmutex);
	}

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;

	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;

	return do_epoll_ctl(epfd, op, fd, &epds, false);
}
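
/*
 * Illustrative userspace sketch of the control interface implemented above
 * ("sock" is assumed to be an already-open, pollable file descriptor):
 *
 *	int epfd = epoll_create1(EPOLL_CLOEXEC);
 *	struct epoll_event ev = {
 *		.events = EPOLLIN,
 *		.data.fd = sock,
 *	};
 *	if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0)
 *		perror("epoll setup");
 *
 * A second EPOLL_CTL_ADD of the same fd fails with EEXIST, and a MOD/DEL
 * of an fd that was never added fails with ENOENT, matching do_epoll_ctl()
 * above.
 */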

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fdput(f);
	return error;
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}
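
/*
 * Illustrative userspace wait loop, continuing the sketch above
 * (MAX_EVENTS is a small caller-chosen bound and handle_ready_fd() is a
 * hypothetical handler):
 *
 *	struct epoll_event evs[MAX_EVENTS];
 *	int n = epoll_wait(epfd, evs, MAX_EVENTS, -1);
 *	for (int i = 0; i < n; i++)
 *		handle_ready_fd(evs[i].data.fd);
 *
 * A timeout of -1 blocks indefinitely, 0 returns immediately, and a
 * positive value is an upper bound in milliseconds.
 */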

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	int error;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	error = set_user_sigmask(sigmask, sigsetsize);
	if (error)
		return error;

	error = do_epoll_wait(epfd, events, maxevents, timeout);
	restore_saved_sigmask_unless(error == -EINTR);

	return error;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
		       struct epoll_event __user *, events,
		       int, maxevents, int, timeout,
		       const compat_sigset_t __user *, sigmask,
		       compat_size_t, sigsetsize)
{
	long err;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	err = set_compat_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;

	err = do_epoll_wait(epfd, events, maxevents, timeout);
	restore_saved_sigmask_unless(err == -EINTR);

	return err;
}
#endif
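
/*
 * Illustrative userspace use of epoll_pwait(2), continuing the sketches
 * above: block SIGINT atomically for the duration of the wait only, which
 * a separate sigprocmask() + epoll_wait() pair could not do without a race:
 *
 *	sigset_t mask;
 *	sigemptyset(&mask);
 *	sigaddset(&mask, SIGINT);
 *	int n = epoll_pwait(epfd, evs, MAX_EVENTS, -1, &mask);
 *
 * The caller's previous signal mask is restored afterwards; see the
 * restore_saved_sigmask_unless() calls above for how an interrupted wait
 * defers that restore until signal delivery.
 */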

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lowmem to be allocated for epoll watches (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loop checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
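
/*
 * Illustrative sizing example for the max_user_watches computation in
 * eventpoll_init() (the numbers are assumptions, not measurements): with
 * 1 GiB of lowmem and 4 KiB pages, si.totalram - si.totalhigh is 262144
 * pages; dividing by 25 keeps 4% of them (~10485 pages, ~41 MiB after the
 * PAGE_SHIFT conversion to bytes); dividing by EP_ITEM_COST (the per-watch
 * bookkeeping cost, a few hundred bytes) then yields a per-user limit on
 * the order of a couple hundred thousand watches.
 */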