162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * fs/eventpoll.c (Efficient event retrieval implementation) 462306a36Sopenharmony_ci * Copyright (C) 2001,...,2009 Davide Libenzi 562306a36Sopenharmony_ci * 662306a36Sopenharmony_ci * Davide Libenzi <davidel@xmailserver.org> 762306a36Sopenharmony_ci */ 862306a36Sopenharmony_ci 962306a36Sopenharmony_ci#include <linux/init.h> 1062306a36Sopenharmony_ci#include <linux/kernel.h> 1162306a36Sopenharmony_ci#include <linux/sched/signal.h> 1262306a36Sopenharmony_ci#include <linux/fs.h> 1362306a36Sopenharmony_ci#include <linux/file.h> 1462306a36Sopenharmony_ci#include <linux/signal.h> 1562306a36Sopenharmony_ci#include <linux/errno.h> 1662306a36Sopenharmony_ci#include <linux/mm.h> 1762306a36Sopenharmony_ci#include <linux/slab.h> 1862306a36Sopenharmony_ci#include <linux/poll.h> 1962306a36Sopenharmony_ci#include <linux/string.h> 2062306a36Sopenharmony_ci#include <linux/list.h> 2162306a36Sopenharmony_ci#include <linux/hash.h> 2262306a36Sopenharmony_ci#include <linux/spinlock.h> 2362306a36Sopenharmony_ci#include <linux/syscalls.h> 2462306a36Sopenharmony_ci#include <linux/rbtree.h> 2562306a36Sopenharmony_ci#include <linux/wait.h> 2662306a36Sopenharmony_ci#include <linux/eventpoll.h> 2762306a36Sopenharmony_ci#include <linux/mount.h> 2862306a36Sopenharmony_ci#include <linux/bitops.h> 2962306a36Sopenharmony_ci#include <linux/mutex.h> 3062306a36Sopenharmony_ci#include <linux/anon_inodes.h> 3162306a36Sopenharmony_ci#include <linux/device.h> 3262306a36Sopenharmony_ci#include <linux/uaccess.h> 3362306a36Sopenharmony_ci#include <asm/io.h> 3462306a36Sopenharmony_ci#include <asm/mman.h> 3562306a36Sopenharmony_ci#include <linux/atomic.h> 3662306a36Sopenharmony_ci#include <linux/proc_fs.h> 3762306a36Sopenharmony_ci#include <linux/seq_file.h> 3862306a36Sopenharmony_ci#include <linux/compat.h> 3962306a36Sopenharmony_ci#include <linux/rculist.h> 
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three level of locking required by epoll :
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epnested_mutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

/* Cap on the "maxevents" argument: keeps maxevents * event size within INT_MAX */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/* Sentinel pointer: marks ovflist as inactive / an epitem as not chained */
#define EP_UNACTIVE_PTR ((void *) -1L)

/* Per-watch memory cost used when charging against max_user_watches */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

/*
 * Identity of a watched target: the <file, fd> pair. Used as the key
 * for the eventpoll RB tree (see ep_set_ffd()/ep_cmp_ffd()).
 */
struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct eppoll_entry *next;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/*
	 * Protected by file->f_lock, true for to-be-released epitem already
	 * removed from the "struct file" items list; together with
	 * eventpoll->refcount orchestrates "struct eventpoll" disposal
	 */
	bool dying;

	/* List containing poll wait queues */
	struct eppoll_entry *pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct hlist_node fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describe the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	/* Back-pointer to the anon file representing this epoll instance */
	struct file *file;

	/* used to optimize loop detection check */
	u64 gen;
	struct hlist_head refs;

	/*
	 * usage count, used together with epitem->dying to
	 * orchestrate the disposal of this struct
	 */
	refcount_t refcount;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/* Used for cycles detection */
static DEFINE_MUTEX(epnested_mutex);

/*
 * Generation counter for loop-detection passes; presumably compared
 * against eventpoll::gen ("used to optimize loop detection check") —
 * the code that bumps it is outside this chunk.
 */
static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epnested_mutex.
 */
struct epitems_head {
	struct hlist_head epitems;
	struct epitems_head *next;
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;

/* Slab cache used to allocate "struct epitems_head" */
static struct kmem_cache *ephead_cache __read_mostly;

/* Release an epitems_head back to its slab cache; NULL is a no-op. */
static inline void free_ephead(struct epitems_head *head)
{
	if (head)
		kmem_cache_free(ephead_cache, head);
}

/*
 * Push this file's epitems_head onto the tfile_check_list (once),
 * so the path-limit check can visit it. Caller must hold epnested_mutex
 * (see the tfile_check_list comment above).
 */
static void list_file(struct file *file)
{
	struct epitems_head *head;

	head = container_of(file->f_ep, struct epitems_head, epitems);
	if (!head->next) {
		head->next = tfile_check_list;
		tfile_check_list = head;
	}
}

/*
 * Unlink a head from the tfile_check_list and free it if no epitems
 * remain attached; the emptiness re-check is done under the owning
 * file's f_lock to avoid racing with concurrent item insertion/removal.
 */
static void unlist_file(struct epitems_head *head)
{
	struct epitems_head *to_free = head;
	struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
	if (p) {
		struct epitem *epi = container_of(p, struct epitem, fllink);
		spin_lock(&epi->ffd.file->f_lock);
		if (!hlist_empty(&head->epitems))
			to_free = NULL;
		head->next = NULL;
		spin_unlock(&epi->ffd.file->f_lock);
	}
	free_ephead(to_free);
}

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Valid range [0, LONG_MAX] for the max_user_watches sysctl below */
static long long_zero;
static long long_max = LONG_MAX;

/* /proc/sys/fs/epoll/ entries */
static struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &long_zero,
		.extra2		= &long_max,
	},
	{ }
};

/* Register the epoll sysctl table under fs/epoll */
static void __init epoll_sysctls_init(void)
{
	register_sysctl("fs/epoll", epoll_table);
}
#else
#define epoll_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static const struct file_operations eventpoll_fops;

/* True iff this file is an epoll instance (identified by its f_op) */
static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}
35062306a36Sopenharmony_ci 35162306a36Sopenharmony_ci/* Compare RB tree keys */ 35262306a36Sopenharmony_cistatic inline int ep_cmp_ffd(struct epoll_filefd *p1, 35362306a36Sopenharmony_ci struct epoll_filefd *p2) 35462306a36Sopenharmony_ci{ 35562306a36Sopenharmony_ci return (p1->file > p2->file ? +1: 35662306a36Sopenharmony_ci (p1->file < p2->file ? -1 : p1->fd - p2->fd)); 35762306a36Sopenharmony_ci} 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci/* Tells us if the item is currently linked */ 36062306a36Sopenharmony_cistatic inline int ep_is_linked(struct epitem *epi) 36162306a36Sopenharmony_ci{ 36262306a36Sopenharmony_ci return !list_empty(&epi->rdllink); 36362306a36Sopenharmony_ci} 36462306a36Sopenharmony_ci 36562306a36Sopenharmony_cistatic inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) 36662306a36Sopenharmony_ci{ 36762306a36Sopenharmony_ci return container_of(p, struct eppoll_entry, wait); 36862306a36Sopenharmony_ci} 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ci/* Get the "struct epitem" from a wait queue pointer */ 37162306a36Sopenharmony_cistatic inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) 37262306a36Sopenharmony_ci{ 37362306a36Sopenharmony_ci return container_of(p, struct eppoll_entry, wait)->base; 37462306a36Sopenharmony_ci} 37562306a36Sopenharmony_ci 37662306a36Sopenharmony_ci/** 37762306a36Sopenharmony_ci * ep_events_available - Checks if ready events might be available. 37862306a36Sopenharmony_ci * 37962306a36Sopenharmony_ci * @ep: Pointer to the eventpoll context. 38062306a36Sopenharmony_ci * 38162306a36Sopenharmony_ci * Return: a value different than %zero if ready events are available, 38262306a36Sopenharmony_ci * or %zero otherwise. 
38362306a36Sopenharmony_ci */ 38462306a36Sopenharmony_cistatic inline int ep_events_available(struct eventpoll *ep) 38562306a36Sopenharmony_ci{ 38662306a36Sopenharmony_ci return !list_empty_careful(&ep->rdllist) || 38762306a36Sopenharmony_ci READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; 38862306a36Sopenharmony_ci} 38962306a36Sopenharmony_ci 39062306a36Sopenharmony_ci#ifdef CONFIG_NET_RX_BUSY_POLL 39162306a36Sopenharmony_cistatic bool ep_busy_loop_end(void *p, unsigned long start_time) 39262306a36Sopenharmony_ci{ 39362306a36Sopenharmony_ci struct eventpoll *ep = p; 39462306a36Sopenharmony_ci 39562306a36Sopenharmony_ci return ep_events_available(ep) || busy_loop_timeout(start_time); 39662306a36Sopenharmony_ci} 39762306a36Sopenharmony_ci 39862306a36Sopenharmony_ci/* 39962306a36Sopenharmony_ci * Busy poll if globally on and supporting sockets found && no events, 40062306a36Sopenharmony_ci * busy loop will return if need_resched or ep_events_available. 40162306a36Sopenharmony_ci * 40262306a36Sopenharmony_ci * we must do our busy polling with irqs enabled 40362306a36Sopenharmony_ci */ 40462306a36Sopenharmony_cistatic bool ep_busy_loop(struct eventpoll *ep, int nonblock) 40562306a36Sopenharmony_ci{ 40662306a36Sopenharmony_ci unsigned int napi_id = READ_ONCE(ep->napi_id); 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_ci if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { 40962306a36Sopenharmony_ci napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, 41062306a36Sopenharmony_ci BUSY_POLL_BUDGET); 41162306a36Sopenharmony_ci if (ep_events_available(ep)) 41262306a36Sopenharmony_ci return true; 41362306a36Sopenharmony_ci /* 41462306a36Sopenharmony_ci * Busy poll timed out. Drop NAPI ID for now, we can add 41562306a36Sopenharmony_ci * it back in when we have moved a socket with a valid NAPI 41662306a36Sopenharmony_ci * ID onto the ready list. 
41762306a36Sopenharmony_ci */ 41862306a36Sopenharmony_ci ep->napi_id = 0; 41962306a36Sopenharmony_ci return false; 42062306a36Sopenharmony_ci } 42162306a36Sopenharmony_ci return false; 42262306a36Sopenharmony_ci} 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci/* 42562306a36Sopenharmony_ci * Set epoll busy poll NAPI ID from sk. 42662306a36Sopenharmony_ci */ 42762306a36Sopenharmony_cistatic inline void ep_set_busy_poll_napi_id(struct epitem *epi) 42862306a36Sopenharmony_ci{ 42962306a36Sopenharmony_ci struct eventpoll *ep; 43062306a36Sopenharmony_ci unsigned int napi_id; 43162306a36Sopenharmony_ci struct socket *sock; 43262306a36Sopenharmony_ci struct sock *sk; 43362306a36Sopenharmony_ci 43462306a36Sopenharmony_ci if (!net_busy_loop_on()) 43562306a36Sopenharmony_ci return; 43662306a36Sopenharmony_ci 43762306a36Sopenharmony_ci sock = sock_from_file(epi->ffd.file); 43862306a36Sopenharmony_ci if (!sock) 43962306a36Sopenharmony_ci return; 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci sk = sock->sk; 44262306a36Sopenharmony_ci if (!sk) 44362306a36Sopenharmony_ci return; 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_ci napi_id = READ_ONCE(sk->sk_napi_id); 44662306a36Sopenharmony_ci ep = epi->ep; 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci /* Non-NAPI IDs can be rejected 44962306a36Sopenharmony_ci * or 45062306a36Sopenharmony_ci * Nothing to do if we already have this ID 45162306a36Sopenharmony_ci */ 45262306a36Sopenharmony_ci if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) 45362306a36Sopenharmony_ci return; 45462306a36Sopenharmony_ci 45562306a36Sopenharmony_ci /* record NAPI ID for use in next busy poll */ 45662306a36Sopenharmony_ci ep->napi_id = napi_id; 45762306a36Sopenharmony_ci} 45862306a36Sopenharmony_ci 45962306a36Sopenharmony_ci#else 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_cistatic inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) 46262306a36Sopenharmony_ci{ 46362306a36Sopenharmony_ci return false; 
46462306a36Sopenharmony_ci} 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_cistatic inline void ep_set_busy_poll_napi_id(struct epitem *epi) 46762306a36Sopenharmony_ci{ 46862306a36Sopenharmony_ci} 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci#endif /* CONFIG_NET_RX_BUSY_POLL */ 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_ci/* 47362306a36Sopenharmony_ci * As described in commit 0ccf831cb lockdep: annotate epoll 47462306a36Sopenharmony_ci * the use of wait queues used by epoll is done in a very controlled 47562306a36Sopenharmony_ci * manner. Wake ups can nest inside each other, but are never done 47662306a36Sopenharmony_ci * with the same locking. For example: 47762306a36Sopenharmony_ci * 47862306a36Sopenharmony_ci * dfd = socket(...); 47962306a36Sopenharmony_ci * efd1 = epoll_create(); 48062306a36Sopenharmony_ci * efd2 = epoll_create(); 48162306a36Sopenharmony_ci * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); 48262306a36Sopenharmony_ci * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); 48362306a36Sopenharmony_ci * 48462306a36Sopenharmony_ci * When a packet arrives to the device underneath "dfd", the net code will 48562306a36Sopenharmony_ci * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a 48662306a36Sopenharmony_ci * callback wakeup entry on that queue, and the wake_up() performed by the 48762306a36Sopenharmony_ci * "dfd" net code will end up in ep_poll_callback(). At this point epoll 48862306a36Sopenharmony_ci * (efd1) notices that it may have some event ready, so it needs to wake up 48962306a36Sopenharmony_ci * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() 49062306a36Sopenharmony_ci * that ends up in another wake_up(), after having checked about the 49162306a36Sopenharmony_ci * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid 49262306a36Sopenharmony_ci * stack blasting. 
49362306a36Sopenharmony_ci * 49462306a36Sopenharmony_ci * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle 49562306a36Sopenharmony_ci * this special case of epoll. 49662306a36Sopenharmony_ci */ 49762306a36Sopenharmony_ci#ifdef CONFIG_DEBUG_LOCK_ALLOC 49862306a36Sopenharmony_ci 49962306a36Sopenharmony_cistatic void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, 50062306a36Sopenharmony_ci unsigned pollflags) 50162306a36Sopenharmony_ci{ 50262306a36Sopenharmony_ci struct eventpoll *ep_src; 50362306a36Sopenharmony_ci unsigned long flags; 50462306a36Sopenharmony_ci u8 nests = 0; 50562306a36Sopenharmony_ci 50662306a36Sopenharmony_ci /* 50762306a36Sopenharmony_ci * To set the subclass or nesting level for spin_lock_irqsave_nested() 50862306a36Sopenharmony_ci * it might be natural to create a per-cpu nest count. However, since 50962306a36Sopenharmony_ci * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can 51062306a36Sopenharmony_ci * schedule() in the -rt kernel, the per-cpu variable are no longer 51162306a36Sopenharmony_ci * protected. Thus, we are introducing a per eventpoll nest field. 51262306a36Sopenharmony_ci * If we are not being call from ep_poll_callback(), epi is NULL and 51362306a36Sopenharmony_ci * we are at the first level of nesting, 0. Otherwise, we are being 51462306a36Sopenharmony_ci * called from ep_poll_callback() and if a previous wakeup source is 51562306a36Sopenharmony_ci * not an epoll file itself, we are at depth 1 since the wakeup source 51662306a36Sopenharmony_ci * is depth 0. If the wakeup source is a previous epoll file in the 51762306a36Sopenharmony_ci * wakeup chain then we use its nests value and record ours as 51862306a36Sopenharmony_ci * nests + 1. The previous epoll file nests value is stable since its 51962306a36Sopenharmony_ci * already holding its own poll_wait.lock. 
52062306a36Sopenharmony_ci */ 52162306a36Sopenharmony_ci if (epi) { 52262306a36Sopenharmony_ci if ((is_file_epoll(epi->ffd.file))) { 52362306a36Sopenharmony_ci ep_src = epi->ffd.file->private_data; 52462306a36Sopenharmony_ci nests = ep_src->nests; 52562306a36Sopenharmony_ci } else { 52662306a36Sopenharmony_ci nests = 1; 52762306a36Sopenharmony_ci } 52862306a36Sopenharmony_ci } 52962306a36Sopenharmony_ci spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); 53062306a36Sopenharmony_ci ep->nests = nests + 1; 53162306a36Sopenharmony_ci wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); 53262306a36Sopenharmony_ci ep->nests = 0; 53362306a36Sopenharmony_ci spin_unlock_irqrestore(&ep->poll_wait.lock, flags); 53462306a36Sopenharmony_ci} 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci#else 53762306a36Sopenharmony_ci 53862306a36Sopenharmony_cistatic void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, 53962306a36Sopenharmony_ci __poll_t pollflags) 54062306a36Sopenharmony_ci{ 54162306a36Sopenharmony_ci wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); 54262306a36Sopenharmony_ci} 54362306a36Sopenharmony_ci 54462306a36Sopenharmony_ci#endif 54562306a36Sopenharmony_ci 54662306a36Sopenharmony_cistatic void ep_remove_wait_queue(struct eppoll_entry *pwq) 54762306a36Sopenharmony_ci{ 54862306a36Sopenharmony_ci wait_queue_head_t *whead; 54962306a36Sopenharmony_ci 55062306a36Sopenharmony_ci rcu_read_lock(); 55162306a36Sopenharmony_ci /* 55262306a36Sopenharmony_ci * If it is cleared by POLLFREE, it should be rcu-safe. 55362306a36Sopenharmony_ci * If we read NULL we need a barrier paired with 55462306a36Sopenharmony_ci * smp_store_release() in ep_poll_callback(), otherwise 55562306a36Sopenharmony_ci * we rely on whead->lock. 
55662306a36Sopenharmony_ci */ 55762306a36Sopenharmony_ci whead = smp_load_acquire(&pwq->whead); 55862306a36Sopenharmony_ci if (whead) 55962306a36Sopenharmony_ci remove_wait_queue(whead, &pwq->wait); 56062306a36Sopenharmony_ci rcu_read_unlock(); 56162306a36Sopenharmony_ci} 56262306a36Sopenharmony_ci 56362306a36Sopenharmony_ci/* 56462306a36Sopenharmony_ci * This function unregisters poll callbacks from the associated file 56562306a36Sopenharmony_ci * descriptor. Must be called with "mtx" held. 56662306a36Sopenharmony_ci */ 56762306a36Sopenharmony_cistatic void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 56862306a36Sopenharmony_ci{ 56962306a36Sopenharmony_ci struct eppoll_entry **p = &epi->pwqlist; 57062306a36Sopenharmony_ci struct eppoll_entry *pwq; 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_ci while ((pwq = *p) != NULL) { 57362306a36Sopenharmony_ci *p = pwq->next; 57462306a36Sopenharmony_ci ep_remove_wait_queue(pwq); 57562306a36Sopenharmony_ci kmem_cache_free(pwq_cache, pwq); 57662306a36Sopenharmony_ci } 57762306a36Sopenharmony_ci} 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci/* call only when ep->mtx is held */ 58062306a36Sopenharmony_cistatic inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) 58162306a36Sopenharmony_ci{ 58262306a36Sopenharmony_ci return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); 58362306a36Sopenharmony_ci} 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci/* call only when ep->mtx is held */ 58662306a36Sopenharmony_cistatic inline void ep_pm_stay_awake(struct epitem *epi) 58762306a36Sopenharmony_ci{ 58862306a36Sopenharmony_ci struct wakeup_source *ws = ep_wakeup_source(epi); 58962306a36Sopenharmony_ci 59062306a36Sopenharmony_ci if (ws) 59162306a36Sopenharmony_ci __pm_stay_awake(ws); 59262306a36Sopenharmony_ci} 59362306a36Sopenharmony_ci 59462306a36Sopenharmony_cistatic inline bool ep_has_wakeup_source(struct epitem *epi) 59562306a36Sopenharmony_ci{ 
59662306a36Sopenharmony_ci return rcu_access_pointer(epi->ws) ? true : false; 59762306a36Sopenharmony_ci} 59862306a36Sopenharmony_ci 59962306a36Sopenharmony_ci/* call when ep->mtx cannot be held (ep_poll_callback) */ 60062306a36Sopenharmony_cistatic inline void ep_pm_stay_awake_rcu(struct epitem *epi) 60162306a36Sopenharmony_ci{ 60262306a36Sopenharmony_ci struct wakeup_source *ws; 60362306a36Sopenharmony_ci 60462306a36Sopenharmony_ci rcu_read_lock(); 60562306a36Sopenharmony_ci ws = rcu_dereference(epi->ws); 60662306a36Sopenharmony_ci if (ws) 60762306a36Sopenharmony_ci __pm_stay_awake(ws); 60862306a36Sopenharmony_ci rcu_read_unlock(); 60962306a36Sopenharmony_ci} 61062306a36Sopenharmony_ci 61162306a36Sopenharmony_ci 61262306a36Sopenharmony_ci/* 61362306a36Sopenharmony_ci * ep->mutex needs to be held because we could be hit by 61462306a36Sopenharmony_ci * eventpoll_release_file() and epoll_ctl(). 61562306a36Sopenharmony_ci */ 61662306a36Sopenharmony_cistatic void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) 61762306a36Sopenharmony_ci{ 61862306a36Sopenharmony_ci /* 61962306a36Sopenharmony_ci * Steal the ready list, and re-init the original one to the 62062306a36Sopenharmony_ci * empty list. Also, set ep->ovflist to NULL so that events 62162306a36Sopenharmony_ci * happening while looping w/out locks, are not lost. We cannot 62262306a36Sopenharmony_ci * have the poll callback to queue directly on ep->rdllist, 62362306a36Sopenharmony_ci * because we want the "sproc" callback to be able to do it 62462306a36Sopenharmony_ci * in a lockless way. 
 */
	lockdep_assert_irqs_enabled();
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, txlist);
	/* NULL (not EP_UNACTIVE_PTR) tells ep_poll_callback() to chain
	 * new events onto ovflist instead of rdllist. */
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);
}

/*
 * Finish a ready-list scan started by ep_start_scan(): drain ep->ovflist
 * back onto ep->rdllist, re-inject whatever is left on txlist, and wake
 * waiters if the ready list is non-empty.  Must be called with ep->mtx held.
 */
static void ep_done_scan(struct eventpoll *ep,
			 struct list_head *txlist)
{
	struct epitem *epi, *nepi;

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in order
			 * to keep in FIFO.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
	}

	write_unlock_irq(&ep->lock);
}

/* RCU callback: actually free an epitem once all readers are done. */
static void epi_rcu_free(struct rcu_head *head)
{
	struct epitem *epi = container_of(head, struct epitem, rcu);
	kmem_cache_free(epi_cache, epi);
}

/* Take an additional reference on the eventpoll struct. */
static void ep_get(struct eventpoll *ep)
{
	refcount_inc(&ep->refcount);
}

/*
 * Drop one reference.  Returns true if this was the last reference and
 * the event poll can be disposed (the RB tree must already be empty).
 */
static bool ep_refcount_dec_and_test(struct eventpoll *ep)
{
	if (!refcount_dec_and_test(&ep->refcount))
		return false;

	WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));
	return true;
}

/*
 * Final disposal of the eventpoll struct.  Only call after
 * ep_refcount_dec_and_test() returned true.
 */
static void ep_free(struct eventpoll *ep)
{
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 * If the dying flag is set, do the removal only if force is true.
 * This prevents ep_clear_and_put() from dropping all the ep references
 * while running concurrently with eventpoll_release_file().
 * Returns true if the eventpoll can be disposed.
 */
static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
	struct file *file = epi->ffd.file;
	struct epitems_head *to_free;
	struct hlist_head *head;

	lockdep_assert_irqs_enabled();

	/*
	 * Removes poll wait queue hooks.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	if (epi->dying && !force) {
		/* eventpoll_release_file() owns the removal of a dying epitem */
		spin_unlock(&file->f_lock);
		return false;
	}

	to_free = NULL;
	head = file->f_ep;
	/* Last epitem hooked on this file: detach f_ep, and for non-epoll
	 * files free the separately-allocated epitems_head (unless a
	 * concurrent attach is pending, signalled by v->next). */
	if (head->first == &epi->fllink && !epi->fllink.next) {
		file->f_ep = NULL;
		if (!is_file_epoll(file)) {
			struct epitems_head *v;
			v = container_of(head, struct epitems_head, epitems);
			if (!smp_load_acquire(&v->next))
				to_free = v;
		}
	}
	hlist_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);
	free_ephead(to_free);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
	 * use of the rbn field.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	percpu_counter_dec(&ep->user->epoll_watches);
	return ep_refcount_dec_and_test(ep);
}

/*
 * ep_remove variant for callers owning an additional reference to the ep:
 * the removal can then never drop the last reference, hence the WARN.
 */
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
	WARN_ON_ONCE(__ep_remove(ep, epi, false));
}

/*
 * Tear down an eventpoll on its final fput(): unhook every epitem, remove
 * them all, then drop the file's reference (freeing the ep if it was last).
 */
static void ep_clear_and_put(struct eventpoll *ep)
{
	struct rb_node *rbp, *next;
	struct epitem *epi;
	bool dispose;

	/* We need to release all tasks waiting for these file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(ep, NULL, 0);

	mutex_lock(&ep->mtx);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Walks through the whole tree and try to free each "struct epitem".
	 * Note that ep_remove_safe() will not remove the epitem in case of a
	 * racing eventpoll_release_file(); the latter will do the removal.
	 * At this point we are sure no poll callbacks will be lingering around.
	 * Since we still own a reference to the eventpoll struct, the loop can't
	 * dispose it.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) {
		next = rb_next(rbp);
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove_safe(ep, epi);
		cond_resched();
	}

	dispose = ep_refcount_dec_and_test(ep);
	mutex_unlock(&ep->mtx);

	if (dispose)
		ep_free(ep);
}

/* ->release() for the epoll file itself. */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_clear_and_put(ep);

	return 0;
}

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);

/*
 * Poll an epoll file: report EPOLLIN | EPOLLRDNORM iff at least one
 * watched item is actually ready.  @depth tracks epoll-on-epoll nesting
 * for the mutex_lock_nested() lockdep annotation.
 */
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
	struct eventpoll *ep = file->private_data;
	LIST_HEAD(txlist);
	struct epitem *epi, *tmp;
	poll_table pt;
	__poll_t res = 0;

	/* NULL queue proc: the nested ep_item_poll() calls below must only
	 * query readiness, never add wait queue entries. */
	init_poll_funcptr(&pt, NULL);

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	mutex_lock_nested(&ep->mtx, depth);
	ep_start_scan(ep, &txlist);
	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
		if (ep_item_poll(epi, &pt, depth + 1)) {
			res = EPOLLIN | EPOLLRDNORM;
			break;
		} else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as
			 * caller requested events goes. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}
	ep_done_scan(ep, &txlist);
	mutex_unlock(&ep->mtx);
	return res;
}

/*
 * Poll a single watched item, masked by the events it was registered for.
 *
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
			     int depth)
{
	struct file *file = epi->ffd.file;
	__poll_t res;

	pt->_key = epi->event.events;
	if (!is_file_epoll(file))
		res = vfs_poll(file, pt);
	else
		res = __ep_eventpoll_poll(file, pt, depth);
	return res & epi->event.events;
}

/* ->poll() entry point for an epoll file (outermost, so depth 0). */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	return __ep_eventpoll_poll(file, wait, 0);
}

#ifdef CONFIG_PROC_FS
/* /proc/<pid>/fdinfo output: one line per watched fd. */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
		struct inode *inode = file_inode(epi->ffd.file);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
			   " pos:%lli ino:%lx sdev:%x\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data,
			   (long long)epi->ffd.file->f_pos,
			   inode->i_ino, inode->i_sb->s_dev);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
#endif

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi;
	bool dispose;

	/*
	 * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
	 * touching the epitems list before eventpoll_release_file() can access
	 * the ep->mtx.
	 */
again:
	spin_lock(&file->f_lock);
	if (file->f_ep && file->f_ep->first) {
		epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
		epi->dying = true;
		spin_unlock(&file->f_lock);

		/*
		 * ep access is safe as we still own a reference to the ep
		 * struct
		 */
		ep = epi->ep;
		mutex_lock(&ep->mtx);
		dispose = __ep_remove(ep, epi, true);
		mutex_unlock(&ep->mtx);

		if (dispose)
			ep_free(ep);
		/* f_lock was dropped; restart from the (shrinking) list head */
		goto again;
	}
	spin_unlock(&file->f_lock);
}

/*
 * Allocate and initialise a new eventpoll struct with one reference
 * (the epoll file's own).  Returns 0 on success, -ENOMEM on failure.
 */
static int ep_alloc(struct eventpoll **pep)
{
	struct eventpoll *ep;

	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		return -ENOMEM;

	mutex_init(&ep->mtx);
	rwlock_init(&ep->lock);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT_CACHED;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = get_current_user();
	refcount_set(&ep->refcount, 1);

	*pep = ep;

	return 0;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.  Returns the matching epitem, or NULL if (file, fd) is
 * not being watched.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}

#ifdef CONFIG_KCMP
/*
 * Find the @toff-th epitem whose target fd number is @tfd (a target fd
 * may appear more than once when the same fd was dup'ed).  Linear walk;
 * must be called with ep->mtx held.
 */
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
	struct rb_node *rbp;
	struct epitem *epi;

	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);
		if (epi->ffd.fd == tfd) {
			if (toff == 0)
				return epi;
			else
				toff--;
		}
		cond_resched();
	}

	return NULL;
}

/*
 * kcmp(2) helper: return the raw struct file watched as (tfd, toff) by
 * the given epoll file, or ERR_PTR(-EINVAL)/-ENOENT.  The returned
 * pointer is used for identity comparison only — no reference is taken.
 */
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
				     unsigned long toff)
{
	struct file *file_raw;
	struct eventpoll *ep;
	struct epitem *epi;

	if (!is_file_epoll(file))
		return ERR_PTR(-EINVAL);

	ep = file->private_data;

	mutex_lock(&ep->mtx);
	epi = ep_find_tfd(ep, tfd, toff);
	if (epi)
		file_raw = epi->ffd.file;
	else
		file_raw = ERR_PTR(-ENOENT);
	mutex_unlock(&ep->mtx);

	return file_raw;
}
#endif /* CONFIG_KCMP */

/*
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * Beware: it is necessary to prevent any other modifications of the
 * existing list until all changes are completed, in other words
 * concurrent list_add_tail_lockless() calls should be protected
 * with a read lock, where write lock acts as a barrier which
 * makes sure all list_add_tail_lockless() calls are fully
 * completed.
 *
 * Also an element can be locklessly added to the list only in one
 * direction i.e. either to the tail or to the head, otherwise
 * concurrent access will corrupt the list.
 *
 * Return: %false if element has been already added to the list, %true
 * otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
					  struct list_head *head)
{
	struct list_head *prev;

	/*
	 * This is simple 'new->next = head' operation, but cmpxchg()
	 * is used in order to detect that same element has been just
	 * added to the list from another CPU: the winner observes
	 * new->next == new.
110062306a36Sopenharmony_ci */ 110162306a36Sopenharmony_ci if (!try_cmpxchg(&new->next, &new, head)) 110262306a36Sopenharmony_ci return false; 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ci /* 110562306a36Sopenharmony_ci * Initially ->next of a new element must be updated with the head 110662306a36Sopenharmony_ci * (we are inserting to the tail) and only then pointers are atomically 110762306a36Sopenharmony_ci * exchanged. XCHG guarantees memory ordering, thus ->next should be 110862306a36Sopenharmony_ci * updated before pointers are actually swapped and pointers are 110962306a36Sopenharmony_ci * swapped before prev->next is updated. 111062306a36Sopenharmony_ci */ 111162306a36Sopenharmony_ci 111262306a36Sopenharmony_ci prev = xchg(&head->prev, new); 111362306a36Sopenharmony_ci 111462306a36Sopenharmony_ci /* 111562306a36Sopenharmony_ci * It is safe to modify prev->next and new->prev, because a new element 111662306a36Sopenharmony_ci * is added only to the tail and new->next is updated before XCHG. 111762306a36Sopenharmony_ci */ 111862306a36Sopenharmony_ci 111962306a36Sopenharmony_ci prev->next = new; 112062306a36Sopenharmony_ci new->prev = prev; 112162306a36Sopenharmony_ci 112262306a36Sopenharmony_ci return true; 112362306a36Sopenharmony_ci} 112462306a36Sopenharmony_ci 112562306a36Sopenharmony_ci/* 112662306a36Sopenharmony_ci * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, 112762306a36Sopenharmony_ci * i.e. multiple CPUs are allowed to call this function concurrently. 112862306a36Sopenharmony_ci * 112962306a36Sopenharmony_ci * Return: %false if epi element has been already chained, %true otherwise. 
113062306a36Sopenharmony_ci */ 113162306a36Sopenharmony_cistatic inline bool chain_epi_lockless(struct epitem *epi) 113262306a36Sopenharmony_ci{ 113362306a36Sopenharmony_ci struct eventpoll *ep = epi->ep; 113462306a36Sopenharmony_ci 113562306a36Sopenharmony_ci /* Fast preliminary check */ 113662306a36Sopenharmony_ci if (epi->next != EP_UNACTIVE_PTR) 113762306a36Sopenharmony_ci return false; 113862306a36Sopenharmony_ci 113962306a36Sopenharmony_ci /* Check that the same epi has not been just chained from another CPU */ 114062306a36Sopenharmony_ci if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) 114162306a36Sopenharmony_ci return false; 114262306a36Sopenharmony_ci 114362306a36Sopenharmony_ci /* Atomically exchange tail */ 114462306a36Sopenharmony_ci epi->next = xchg(&ep->ovflist, epi); 114562306a36Sopenharmony_ci 114662306a36Sopenharmony_ci return true; 114762306a36Sopenharmony_ci} 114862306a36Sopenharmony_ci 114962306a36Sopenharmony_ci/* 115062306a36Sopenharmony_ci * This is the callback that is passed to the wait queue wakeup 115162306a36Sopenharmony_ci * mechanism. It is called by the stored file descriptors when they 115262306a36Sopenharmony_ci * have events to report. 115362306a36Sopenharmony_ci * 115462306a36Sopenharmony_ci * This callback takes a read lock in order not to contend with concurrent 115562306a36Sopenharmony_ci * events from another file descriptor, thus all modifications to ->rdllist 115662306a36Sopenharmony_ci * or ->ovflist are lockless. Read lock is paired with the write lock from 115762306a36Sopenharmony_ci * ep_scan_ready_list(), which stops all list modifications and guarantees 115862306a36Sopenharmony_ci * that lists state is seen correctly. 
115962306a36Sopenharmony_ci * 116062306a36Sopenharmony_ci * Another thing worth to mention is that ep_poll_callback() can be called 116162306a36Sopenharmony_ci * concurrently for the same @epi from different CPUs if poll table was inited 116262306a36Sopenharmony_ci * with several wait queues entries. Plural wakeup from different CPUs of a 116362306a36Sopenharmony_ci * single wait queue is serialized by wq.lock, but the case when multiple wait 116462306a36Sopenharmony_ci * queues are used should be detected accordingly. This is detected using 116562306a36Sopenharmony_ci * cmpxchg() operation. 116662306a36Sopenharmony_ci */ 116762306a36Sopenharmony_cistatic int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) 116862306a36Sopenharmony_ci{ 116962306a36Sopenharmony_ci int pwake = 0; 117062306a36Sopenharmony_ci struct epitem *epi = ep_item_from_wait(wait); 117162306a36Sopenharmony_ci struct eventpoll *ep = epi->ep; 117262306a36Sopenharmony_ci __poll_t pollflags = key_to_poll(key); 117362306a36Sopenharmony_ci unsigned long flags; 117462306a36Sopenharmony_ci int ewake = 0; 117562306a36Sopenharmony_ci 117662306a36Sopenharmony_ci read_lock_irqsave(&ep->lock, flags); 117762306a36Sopenharmony_ci 117862306a36Sopenharmony_ci ep_set_busy_poll_napi_id(epi); 117962306a36Sopenharmony_ci 118062306a36Sopenharmony_ci /* 118162306a36Sopenharmony_ci * If the event mask does not contain any poll(2) event, we consider the 118262306a36Sopenharmony_ci * descriptor to be disabled. This condition is likely the effect of the 118362306a36Sopenharmony_ci * EPOLLONESHOT bit that disables the descriptor when an event is received, 118462306a36Sopenharmony_ci * until the next EPOLL_CTL_MOD will be issued. 
118562306a36Sopenharmony_ci */ 118662306a36Sopenharmony_ci if (!(epi->event.events & ~EP_PRIVATE_BITS)) 118762306a36Sopenharmony_ci goto out_unlock; 118862306a36Sopenharmony_ci 118962306a36Sopenharmony_ci /* 119062306a36Sopenharmony_ci * Check the events coming with the callback. At this stage, not 119162306a36Sopenharmony_ci * every device reports the events in the "key" parameter of the 119262306a36Sopenharmony_ci * callback. We need to be able to handle both cases here, hence the 119362306a36Sopenharmony_ci * test for "key" != NULL before the event match test. 119462306a36Sopenharmony_ci */ 119562306a36Sopenharmony_ci if (pollflags && !(pollflags & epi->event.events)) 119662306a36Sopenharmony_ci goto out_unlock; 119762306a36Sopenharmony_ci 119862306a36Sopenharmony_ci /* 119962306a36Sopenharmony_ci * If we are transferring events to userspace, we can hold no locks 120062306a36Sopenharmony_ci * (because we're accessing user memory, and because of linux f_op->poll() 120162306a36Sopenharmony_ci * semantics). All the events that happen during that period of time are 120262306a36Sopenharmony_ci * chained in ep->ovflist and requeued later on. 120362306a36Sopenharmony_ci */ 120462306a36Sopenharmony_ci if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { 120562306a36Sopenharmony_ci if (chain_epi_lockless(epi)) 120662306a36Sopenharmony_ci ep_pm_stay_awake_rcu(epi); 120762306a36Sopenharmony_ci } else if (!ep_is_linked(epi)) { 120862306a36Sopenharmony_ci /* In the usual case, add event to ready list. */ 120962306a36Sopenharmony_ci if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) 121062306a36Sopenharmony_ci ep_pm_stay_awake_rcu(epi); 121162306a36Sopenharmony_ci } 121262306a36Sopenharmony_ci 121362306a36Sopenharmony_ci /* 121462306a36Sopenharmony_ci * Wake up ( if active ) both the eventpoll wait list and the ->poll() 121562306a36Sopenharmony_ci * wait list. 
121662306a36Sopenharmony_ci */ 121762306a36Sopenharmony_ci if (waitqueue_active(&ep->wq)) { 121862306a36Sopenharmony_ci if ((epi->event.events & EPOLLEXCLUSIVE) && 121962306a36Sopenharmony_ci !(pollflags & POLLFREE)) { 122062306a36Sopenharmony_ci switch (pollflags & EPOLLINOUT_BITS) { 122162306a36Sopenharmony_ci case EPOLLIN: 122262306a36Sopenharmony_ci if (epi->event.events & EPOLLIN) 122362306a36Sopenharmony_ci ewake = 1; 122462306a36Sopenharmony_ci break; 122562306a36Sopenharmony_ci case EPOLLOUT: 122662306a36Sopenharmony_ci if (epi->event.events & EPOLLOUT) 122762306a36Sopenharmony_ci ewake = 1; 122862306a36Sopenharmony_ci break; 122962306a36Sopenharmony_ci case 0: 123062306a36Sopenharmony_ci ewake = 1; 123162306a36Sopenharmony_ci break; 123262306a36Sopenharmony_ci } 123362306a36Sopenharmony_ci } 123462306a36Sopenharmony_ci wake_up(&ep->wq); 123562306a36Sopenharmony_ci } 123662306a36Sopenharmony_ci if (waitqueue_active(&ep->poll_wait)) 123762306a36Sopenharmony_ci pwake++; 123862306a36Sopenharmony_ci 123962306a36Sopenharmony_ciout_unlock: 124062306a36Sopenharmony_ci read_unlock_irqrestore(&ep->lock, flags); 124162306a36Sopenharmony_ci 124262306a36Sopenharmony_ci /* We have to call this outside the lock */ 124362306a36Sopenharmony_ci if (pwake) 124462306a36Sopenharmony_ci ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); 124562306a36Sopenharmony_ci 124662306a36Sopenharmony_ci if (!(epi->event.events & EPOLLEXCLUSIVE)) 124762306a36Sopenharmony_ci ewake = 1; 124862306a36Sopenharmony_ci 124962306a36Sopenharmony_ci if (pollflags & POLLFREE) { 125062306a36Sopenharmony_ci /* 125162306a36Sopenharmony_ci * If we race with ep_remove_wait_queue() it can miss 125262306a36Sopenharmony_ci * ->whead = NULL and do another remove_wait_queue() after 125362306a36Sopenharmony_ci * us, so we can't use __remove_wait_queue(). 
	 */
	list_del_init(&wait->entry);
	/*
	 * ->whead != NULL protects us from the race with
	 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
	 * takes whead->lock held by the caller. Once we nullify it,
	 * nothing protects ep/epi or even wait.
	 */
	smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
	}

	return ewake;
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * Invoked through the poll_table machinery (installed by ep_insert() via
 * init_poll_funcptr()): allocate an eppoll_entry, hook ep_poll_callback()
 * into the target file's wait queue head @whead (exclusively when the item
 * was added with EPOLLEXCLUSIVE), and chain the entry onto epi->pwqlist so
 * it can be unhooked later.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
	struct epitem *epi = epq->epi;
	struct eppoll_entry *pwq;

	if (unlikely(!epi))	// an earlier allocation has failed
		return;

	pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
	if (unlikely(!pwq)) {
		/*
		 * A poll_table callback cannot return an error, so report
		 * the allocation failure to ep_insert() by nulling epq->epi
		 * (ep_insert() checks epq.epi after ep_item_poll()).
		 */
		epq->epi = NULL;
		return;
	}

	init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
	pwq->whead = whead;
	pwq->base = epi;
	if (epi->event.events & EPOLLEXCLUSIVE)
		add_wait_queue_exclusive(whead, &pwq->wait);
	else
		add_wait_queue(whead, &pwq->wait);
	/* Push the new hook onto the epitem's singly-linked pwqlist */
	pwq->next = epi->pwqlist;
	epi->pwqlist = pwq;
}

/*
 * Insert @epi into @ep's rbtree of watched items, ordered by ep_cmp_ffd()
 * (the (file, fd) key). Uses the cached-leftmost rbtree flavor, tracking
 * whether the new node becomes the leftmost one. RB tree operations are
 * protected by ep->mtx (see the comment at the call site in ep_insert()).
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
	struct epitem *epic;
	bool leftmost = true;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
		if (kcmp > 0) {
			p = &parent->rb_right;
			leftmost = false;
		} else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}



#define PATH_ARR_SIZE 5
/*
 * These are the number paths of length 1 to 5, that we are allowing to emanate
 * from a single file of interest. For example, we allow 1000 paths of length
 * 1, to emanate from each file of interest.
 * This essentially represents the
 * potential wakeup paths, which need to be limited in order to avoid massive
 * uncontrolled wakeup storms. The common use case should be a single ep which
 * is connected to n file sources. In this case each file source has 1 path
 * of length 1. Thus, the numbers below should be more than sufficient. These
 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epnested_mutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];	/* scratch counters, reset per walk */

/*
 * Account one wakeup path of depth @nests against the per-depth limit.
 * Return: 0 if still within limits, -1 once the limit for that depth is
 * exceeded.
 *
 * NOTE(review): indexes path_count[]/path_limits[] directly with @nests;
 * assumes the caller bounds @nests by EP_MAX_NESTS < PATH_ARR_SIZE —
 * confirm against the EP_MAX_NESTS definition.
 */
static int path_count_inc(int nests)
{
	/* Allow an arbitrary number of depth 1 paths */
	if (nests == 0)
		return 0;

	if (++path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

/* Reset the per-depth path counters before a new reverse-path walk */
static void path_count_init(void)
{
	int i;

	for (i = 0; i < PATH_ARR_SIZE; i++)
		path_count[i] = 0;
}

/*
 * Recursively walk the epitems watching a file "backwards", towards the
 * epoll instances that watch it, counting each terminal wakeup path per
 * depth via path_count_inc(). Runs under rcu_read_lock() (see
 * reverse_path_check()).
 *
 * Return: 0 if every path is within limits, -1 on excess paths or when
 * the nesting is deeper than EP_MAX_NESTS.
 */
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
	int error = 0;
	struct epitem *epi;

	if (depth > EP_MAX_NESTS) /* too deep nesting */
		return -1;

	/* CTL_DEL can remove links here, but that can't increase our count */
	hlist_for_each_entry_rcu(epi, refs, fllink) {
		/* Deliberately shadows the parameter with the next level up */
		struct hlist_head *refs = &epi->ep->refs;
		if (hlist_empty(refs))
			error = path_count_inc(depth);
		else
			error = reverse_path_check_proc(refs, depth + 1);
		if (error != 0)
			break;
	}
	return error;
}

/**
 * reverse_path_check - The tfile_check_list is list of epitem_head, which have
 *                      links that are proposed to be newly added. We need to
 *                      make sure that those added links don't add too many
 *                      paths such that we will spend all our time waking up
 *                      eventpoll objects.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *	   %-1 otherwise.
 */
static int reverse_path_check(void)
{
	struct epitems_head *p;

	/* Walk each file queued on tfile_check_list and verify its paths */
	for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) {
		int error;
		path_count_init();
		rcu_read_lock();
		error = reverse_path_check_proc(&p->epitems, 0);
		rcu_read_unlock();
		if (error)
			return error;
	}
	return 0;
}

/*
 * Register a wakeup source for @epi, named after the watched file's
 * dentry, creating the per-eventpoll "eventpoll" wakeup source first if
 * it does not exist yet. The new source is published to readers with
 * rcu_assign_pointer() (paired with ep_wakeup_source()/RCU readers).
 *
 * Return: 0 on success, -ENOMEM on registration failure.
 */
static int ep_create_wakeup_source(struct epitem *epi)
{
	struct name_snapshot n;
	struct wakeup_source *ws;

	if (!epi->ep->ws) {
		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
		if (!epi->ep->ws)
			return -ENOMEM;
	}

	/* Snapshot the dentry name so we don't race with a rename */
	take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
	ws = wakeup_source_register(NULL, n.name.name);
	release_dentry_name_snapshot(&n);

	if (!ws)
		return -ENOMEM;
	rcu_assign_pointer(epi->ws, ws);

	return 0;
}

/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	RCU_INIT_POINTER(epi->ws, NULL);

	/*
	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
	 * used internally by wakeup_source_remove, too (called by
	 * wakeup_source_unregister), so we cannot use call_rcu
	 */
	synchronize_rcu();
	wakeup_source_unregister(ws);
}

/*
 * Link @epi onto @file's hlist of watching epitems (file->f_ep).
 * For an epoll file the list head lives inside the eventpoll itself
 * (ep->refs); for ordinary files a struct epitems_head is allocated
 * lazily the first time the file is watched.
 *
 * Return: 0 on success, -ENOMEM on allocation failure.
 */
static int attach_epitem(struct file *file, struct epitem *epi)
{
	struct epitems_head *to_free = NULL;
	struct hlist_head *head = NULL;
	struct eventpoll *ep = NULL;

	if (is_file_epoll(file))
		ep = file->private_data;

	if (ep) {
		head = &ep->refs;
	} else if (!READ_ONCE(file->f_ep)) {
allocate:
		to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
		if (!to_free)
			return -ENOMEM;
		head = &to_free->epitems;
	}
	spin_lock(&file->f_lock);
	if (!file->f_ep) {
		/*
		 * The lockless READ_ONCE() above saw f_ep set so we skipped
		 * the allocation, but under the lock it is gone: drop the
		 * lock and allocate a head after all.
		 */
		if (unlikely(!head)) {
			spin_unlock(&file->f_lock);
			goto allocate;
		}
		file->f_ep = head;
		to_free = NULL;
	}
	/* Publish the epitem on the file's watcher list (RCU readers exist) */
	hlist_add_head_rcu(&epi->fllink, file->f_ep);
	spin_unlock(&file->f_lock);
	/* Non-NULL only if we allocated a head but lost the install race */
	free_ephead(to_free);
	return 0;
}

/*
 * ep_insert - add a new watch for @tfile/@fd to @ep (EPOLL_CTL_ADD path).
 *
 * Allocates and initializes the epitem, attaches it to the target file,
 * inserts it into the rbtree, hooks the poll-wait callback and, if the
 * file is already ready, queues the item on the ready list and wakes
 * waiters.
 *
 * Must be called with "mtx" held.
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
		     struct file *tfile, int fd, int full_check)
{
	int error, pwake = 0;
	__poll_t revents;
	struct epitem *epi;
	struct ep_pqueue epq;
	struct eventpoll *tep = NULL;

	if (is_file_epoll(tfile))
		tep = tfile->private_data;

	lockdep_assert_irqs_enabled();

	/* Enforce the per-user limit on the number of watches */
	if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
					    max_user_watches) >= 0))
		return -ENOSPC;
	percpu_counter_inc(&ep->user->epoll_watches);

	if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
		percpu_counter_dec(&ep->user->epoll_watches);
		return -ENOMEM;
	}

	/* Item initialization follows here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->next = EP_UNACTIVE_PTR;

	if (tep)
		mutex_lock_nested(&tep->mtx, 1);
	/* Add the current item to the list of active epoll hook for this file */
	if (unlikely(attach_epitem(tfile, epi) < 0)) {
		if (tep)
			mutex_unlock(&tep->mtx);
		kmem_cache_free(epi_cache, epi);
		percpu_counter_dec(&ep->user->epoll_watches);
		return -ENOMEM;
	}

	if (full_check && !tep)
		list_file(tfile);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);
	if (tep)
		mutex_unlock(&tep->mtx);

	/*
	 * ep_remove_safe() calls in the later error paths can't lead to
	 * ep_free() as the ep file itself still holds an ep reference.
	 */
	ep_get(ep);

	/* now check if we've created too many backpaths */
	if (unlikely(full_check && reverse_path_check())) {
		ep_remove_safe(ep, epi);
		return -EINVAL;
	}

	if (epi->event.events & EPOLLWAKEUP) {
		error = ep_create_wakeup_source(epi);
		if (error) {
			ep_remove_safe(ep, epi);
			return error;
		}
	}

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = ep_item_poll(epi, &epq.pt, 1);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * to high memory pressure (ep_ptable_queue_proc() signals this by
	 * nulling epq.epi).
	 */
	if (unlikely(!epq.epi)) {
		ep_remove_safe(ep, epi);
		return -ENOMEM;
	}

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irq(&ep->lock);

	/* record NAPI ID of new item if present */
	ep_set_busy_poll_napi_id(epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if (revents && !ep_is_linked(epi)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);
		ep_pm_stay_awake(epi);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irq(&ep->lock);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL, 0);

	return 0;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
		     const struct epoll_event *event)
{
	int pwake = 0;
	poll_table pt;

	lockdep_assert_irqs_enabled();

	/* NULL queue proc: we only want the current event bits, no hooking */
	init_poll_funcptr(&pt, NULL);

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events; /* need barrier below */
	epi->event.data = event->data; /* protected by mtx */
	if (epi->event.events & EPOLLWAKEUP) {
		/*
		 * NOTE(review): ep_create_wakeup_source() failure (-ENOMEM)
		 * is ignored here, unlike in ep_insert() — presumably a
		 * deliberate best-effort; confirm before changing.
		 */
		if (!ep_has_wakeup_source(epi))
			ep_create_wakeup_source(epi);
	} else if (ep_has_wakeup_source(epi)) {
		ep_destroy_wakeup_source(epi);
	}

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs.  This ensures
	 *    we do not miss events from ep_poll_callback if an
	 *    event occurs immediately after we call f_op->poll().
	 *    We need this because we did not take ep->lock while
	 *    changing epi above (but ep_poll_callback does take
	 *    ep->lock).
	 *
	 * 2) We also need to ensure we do not miss _past_ events
	 *    when calling f_op->poll().  This barrier also
	 *    pairs with the barrier in wq_has_sleeper (see
	 *    comments for wq_has_sleeper).
	 *
	 * This barrier will now guarantee ep_poll_callback or f_op->poll
	 * (or both) will notice the readiness of an item.
	 */
	smp_mb();

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 */
	if (ep_item_poll(epi, &pt, 1)) {
		write_lock_irq(&ep->lock);
		if (!ep_is_linked(epi)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		write_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(ep, NULL, 0);

	return 0;
}

/*
 * ep_send_events - harvest ready items and copy their events to userspace.
 *
 * Detaches the ready list into a private txlist (ep_start_scan()) under
 * ep->mtx, re-polls each item to get fresh event bits, and writes up to
 * @maxevents entries to the user buffer @events. Level-triggered items
 * are re-queued on the ready list; EPOLLONESHOT items are disarmed.
 *
 * Return: number of events delivered, -EINTR on a pending fatal signal,
 * or -EFAULT if the very first copy-out fails.
 */
static int ep_send_events(struct eventpoll *ep,
			  struct epoll_event __user *events, int maxevents)
{
	struct epitem *epi, *tmp;
	LIST_HEAD(txlist);
	poll_table pt;
	int res = 0;

	/*
	 * Always short-circuit for fatal signals to allow threads to make a
	 * timely exit without the chance of finding more events available and
	 * fetching repeatedly.
	 */
	if (fatal_signal_pending(current))
		return -EINTR;

	init_poll_funcptr(&pt, NULL);

	mutex_lock(&ep->mtx);
	ep_start_scan(ep, &txlist);

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop we are holding ep->mtx.
	 */
	list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
		struct wakeup_source *ws;
		__poll_t revents;

		if (res >= maxevents)
			break;

		/*
		 * Activate ep->ws before deactivating epi->ws to prevent
		 * triggering auto-suspend here (in case we reactivate epi->ws
		 * below).
		 *
		 * This could be rearranged to delay the deactivation of epi->ws
		 * instead, but then epi->ws would temporarily be out of sync
		 * with ep_is_linked().
		 */
		ws = ep_wakeup_source(epi);
		if (ws) {
			if (ws->active)
				__pm_stay_awake(ep->ws);
			__pm_relax(ws);
		}

		list_del_init(&epi->rdllink);

		/*
		 * If the event mask intersects the caller-requested one,
		 * deliver the event to userspace. Again, we are holding ep->mtx,
		 * so no operations coming from userspace can change the item.
		 */
		revents = ep_item_poll(epi, &pt, 1);
		if (!revents)
			continue;

		events = epoll_put_uevent(revents, epi->event.data, events);
		if (!events) {
			/* Copy-out failed: put the item back and report once */
			list_add(&epi->rdllink, &txlist);
			ep_pm_stay_awake(epi);
			if (!res)
				res = -EFAULT;
			break;
		}
		res++;
		if (epi->event.events & EPOLLONESHOT)
			epi->event.events &= EP_PRIVATE_BITS;
		else if (!(epi->event.events & EPOLLET)) {
			/*
			 * If this file has been added with Level
			 * Trigger mode, we need to insert back inside
			 * the ready list, so that the next call to
			 * epoll_wait() will check again the events
			 * availability. At this point, no one can insert
			 * into ep->rdllist besides us. The epoll_ctl()
			 * callers are locked out by us holding "mtx" and the
			 * poll callback will queue them in ep->ovflist.
			 */
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	ep_done_scan(ep, &txlist);
	mutex_unlock(&ep->mtx);

	return res;
}

/*
 * Convert an epoll_wait()-style millisecond timeout into an absolute
 * timespec64 deadline. Returns NULL for negative @ms (block forever),
 * a zeroed *@to for @ms == 0 (don't block), otherwise *@to set to
 * "now + @ms" via timespec64_add_safe().
 */
static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
	struct timespec64 now;

	if (ms < 0)
		return NULL;

	if (!ms) {
		to->tv_sec = 0;
		to->tv_nsec = 0;
		return to;
	}

	to->tv_sec = ms / MSEC_PER_SEC;
	to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);

	ktime_get_ts64(&now);
	*to = timespec64_add_safe(now, *to);
	return to;
}

/*
 * autoremove_wake_function, but remove even on failure to wake up,
 * because we
 * know that default_wake_function/ttwu will only fail if the thread is already
 * woken, and in that case the ep_poll loop will remove the entry anyways, not
 * try to reuse it.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
				       unsigned int mode, int sync, void *key)
{
	/* Wake first, then unconditionally unlink the wait entry */
	int ret = default_wake_function(wq_entry, mode, sync, key);

	/*
	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
	 * iterations see the cause of this wakeup.
	 */
	list_del_init_careful(&wq_entry->entry);
	return ret;
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
182162306a36Sopenharmony_ci * 182262306a36Sopenharmony_ci * Return: the number of ready events which have been fetched, or an 182362306a36Sopenharmony_ci * error code, in case of error. 182462306a36Sopenharmony_ci */ 182562306a36Sopenharmony_cistatic int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 182662306a36Sopenharmony_ci int maxevents, struct timespec64 *timeout) 182762306a36Sopenharmony_ci{ 182862306a36Sopenharmony_ci int res, eavail, timed_out = 0; 182962306a36Sopenharmony_ci u64 slack = 0; 183062306a36Sopenharmony_ci wait_queue_entry_t wait; 183162306a36Sopenharmony_ci ktime_t expires, *to = NULL; 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci lockdep_assert_irqs_enabled(); 183462306a36Sopenharmony_ci 183562306a36Sopenharmony_ci if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { 183662306a36Sopenharmony_ci slack = select_estimate_accuracy(timeout); 183762306a36Sopenharmony_ci to = &expires; 183862306a36Sopenharmony_ci *to = timespec64_to_ktime(*timeout); 183962306a36Sopenharmony_ci } else if (timeout) { 184062306a36Sopenharmony_ci /* 184162306a36Sopenharmony_ci * Avoid the unnecessary trip to the wait queue loop, if the 184262306a36Sopenharmony_ci * caller specified a non blocking operation. 184362306a36Sopenharmony_ci */ 184462306a36Sopenharmony_ci timed_out = 1; 184562306a36Sopenharmony_ci } 184662306a36Sopenharmony_ci 184762306a36Sopenharmony_ci /* 184862306a36Sopenharmony_ci * This call is racy: We may or may not see events that are being added 184962306a36Sopenharmony_ci * to the ready list under the lock (e.g., in IRQ callbacks). For cases 185062306a36Sopenharmony_ci * with a non-zero timeout, this thread will check the ready list under 185162306a36Sopenharmony_ci * lock and will add to the wait queue. For cases with a zero 185262306a36Sopenharmony_ci * timeout, the user by definition should not care and will have to 185362306a36Sopenharmony_ci * recheck again. 
185462306a36Sopenharmony_ci */ 185562306a36Sopenharmony_ci eavail = ep_events_available(ep); 185662306a36Sopenharmony_ci 185762306a36Sopenharmony_ci while (1) { 185862306a36Sopenharmony_ci if (eavail) { 185962306a36Sopenharmony_ci /* 186062306a36Sopenharmony_ci * Try to transfer events to user space. In case we get 186162306a36Sopenharmony_ci * 0 events and there's still timeout left over, we go 186262306a36Sopenharmony_ci * trying again in search of more luck. 186362306a36Sopenharmony_ci */ 186462306a36Sopenharmony_ci res = ep_send_events(ep, events, maxevents); 186562306a36Sopenharmony_ci if (res) 186662306a36Sopenharmony_ci return res; 186762306a36Sopenharmony_ci } 186862306a36Sopenharmony_ci 186962306a36Sopenharmony_ci if (timed_out) 187062306a36Sopenharmony_ci return 0; 187162306a36Sopenharmony_ci 187262306a36Sopenharmony_ci eavail = ep_busy_loop(ep, timed_out); 187362306a36Sopenharmony_ci if (eavail) 187462306a36Sopenharmony_ci continue; 187562306a36Sopenharmony_ci 187662306a36Sopenharmony_ci if (signal_pending(current)) 187762306a36Sopenharmony_ci return -EINTR; 187862306a36Sopenharmony_ci 187962306a36Sopenharmony_ci /* 188062306a36Sopenharmony_ci * Internally init_wait() uses autoremove_wake_function(), 188162306a36Sopenharmony_ci * thus wait entry is removed from the wait queue on each 188262306a36Sopenharmony_ci * wakeup. Why it is important? In case of several waiters 188362306a36Sopenharmony_ci * each new wakeup will hit the next waiter, giving it the 188462306a36Sopenharmony_ci * chance to harvest new event. Otherwise wakeup can be 188562306a36Sopenharmony_ci * lost. This is also good performance-wise, because on 188662306a36Sopenharmony_ci * normal wakeup path no need to call __remove_wait_queue() 188762306a36Sopenharmony_ci * explicitly, thus ep->lock is not taken, which halts the 188862306a36Sopenharmony_ci * event delivery. 
188962306a36Sopenharmony_ci * 189062306a36Sopenharmony_ci * In fact, we now use an even more aggressive function that 189162306a36Sopenharmony_ci * unconditionally removes, because we don't reuse the wait 189262306a36Sopenharmony_ci * entry between loop iterations. This lets us also avoid the 189362306a36Sopenharmony_ci * performance issue if a process is killed, causing all of its 189462306a36Sopenharmony_ci * threads to wake up without being removed normally. 189562306a36Sopenharmony_ci */ 189662306a36Sopenharmony_ci init_wait(&wait); 189762306a36Sopenharmony_ci wait.func = ep_autoremove_wake_function; 189862306a36Sopenharmony_ci 189962306a36Sopenharmony_ci write_lock_irq(&ep->lock); 190062306a36Sopenharmony_ci /* 190162306a36Sopenharmony_ci * Barrierless variant, waitqueue_active() is called under 190262306a36Sopenharmony_ci * the same lock on wakeup ep_poll_callback() side, so it 190362306a36Sopenharmony_ci * is safe to avoid an explicit barrier. 190462306a36Sopenharmony_ci */ 190562306a36Sopenharmony_ci __set_current_state(TASK_INTERRUPTIBLE); 190662306a36Sopenharmony_ci 190762306a36Sopenharmony_ci /* 190862306a36Sopenharmony_ci * Do the final check under the lock. ep_scan_ready_list() 190962306a36Sopenharmony_ci * plays with two lists (->rdllist and ->ovflist) and there 191062306a36Sopenharmony_ci * is always a race when both lists are empty for short 191162306a36Sopenharmony_ci * period of time although events are pending, so lock is 191262306a36Sopenharmony_ci * important. 
191362306a36Sopenharmony_ci */ 191462306a36Sopenharmony_ci eavail = ep_events_available(ep); 191562306a36Sopenharmony_ci if (!eavail) 191662306a36Sopenharmony_ci __add_wait_queue_exclusive(&ep->wq, &wait); 191762306a36Sopenharmony_ci 191862306a36Sopenharmony_ci write_unlock_irq(&ep->lock); 191962306a36Sopenharmony_ci 192062306a36Sopenharmony_ci if (!eavail) 192162306a36Sopenharmony_ci timed_out = !schedule_hrtimeout_range(to, slack, 192262306a36Sopenharmony_ci HRTIMER_MODE_ABS); 192362306a36Sopenharmony_ci __set_current_state(TASK_RUNNING); 192462306a36Sopenharmony_ci 192562306a36Sopenharmony_ci /* 192662306a36Sopenharmony_ci * We were woken up, thus go and try to harvest some events. 192762306a36Sopenharmony_ci * If timed out and still on the wait queue, recheck eavail 192862306a36Sopenharmony_ci * carefully under lock, below. 192962306a36Sopenharmony_ci */ 193062306a36Sopenharmony_ci eavail = 1; 193162306a36Sopenharmony_ci 193262306a36Sopenharmony_ci if (!list_empty_careful(&wait.entry)) { 193362306a36Sopenharmony_ci write_lock_irq(&ep->lock); 193462306a36Sopenharmony_ci /* 193562306a36Sopenharmony_ci * If the thread timed out and is not on the wait queue, 193662306a36Sopenharmony_ci * it means that the thread was woken up after its 193762306a36Sopenharmony_ci * timeout expired before it could reacquire the lock. 193862306a36Sopenharmony_ci * Thus, when wait.entry is empty, it needs to harvest 193962306a36Sopenharmony_ci * events. 
194062306a36Sopenharmony_ci */ 194162306a36Sopenharmony_ci if (timed_out) 194262306a36Sopenharmony_ci eavail = list_empty(&wait.entry); 194362306a36Sopenharmony_ci __remove_wait_queue(&ep->wq, &wait); 194462306a36Sopenharmony_ci write_unlock_irq(&ep->lock); 194562306a36Sopenharmony_ci } 194662306a36Sopenharmony_ci } 194762306a36Sopenharmony_ci} 194862306a36Sopenharmony_ci 194962306a36Sopenharmony_ci/** 195062306a36Sopenharmony_ci * ep_loop_check_proc - verify that adding an epoll file inside another 195162306a36Sopenharmony_ci * epoll structure does not violate the constraints, in 195262306a36Sopenharmony_ci * terms of closed loops, or too deep chains (which can 195362306a36Sopenharmony_ci * result in excessive stack usage). 195462306a36Sopenharmony_ci * 195562306a36Sopenharmony_ci * @ep: the &struct eventpoll to be currently checked. 195662306a36Sopenharmony_ci * @depth: Current depth of the path being checked. 195762306a36Sopenharmony_ci * 195862306a36Sopenharmony_ci * Return: %zero if adding the epoll @file inside current epoll 195962306a36Sopenharmony_ci * structure @ep does not violate the constraints, or %-1 otherwise. 
196062306a36Sopenharmony_ci */ 196162306a36Sopenharmony_cistatic int ep_loop_check_proc(struct eventpoll *ep, int depth) 196262306a36Sopenharmony_ci{ 196362306a36Sopenharmony_ci int error = 0; 196462306a36Sopenharmony_ci struct rb_node *rbp; 196562306a36Sopenharmony_ci struct epitem *epi; 196662306a36Sopenharmony_ci 196762306a36Sopenharmony_ci mutex_lock_nested(&ep->mtx, depth + 1); 196862306a36Sopenharmony_ci ep->gen = loop_check_gen; 196962306a36Sopenharmony_ci for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { 197062306a36Sopenharmony_ci epi = rb_entry(rbp, struct epitem, rbn); 197162306a36Sopenharmony_ci if (unlikely(is_file_epoll(epi->ffd.file))) { 197262306a36Sopenharmony_ci struct eventpoll *ep_tovisit; 197362306a36Sopenharmony_ci ep_tovisit = epi->ffd.file->private_data; 197462306a36Sopenharmony_ci if (ep_tovisit->gen == loop_check_gen) 197562306a36Sopenharmony_ci continue; 197662306a36Sopenharmony_ci if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) 197762306a36Sopenharmony_ci error = -1; 197862306a36Sopenharmony_ci else 197962306a36Sopenharmony_ci error = ep_loop_check_proc(ep_tovisit, depth + 1); 198062306a36Sopenharmony_ci if (error != 0) 198162306a36Sopenharmony_ci break; 198262306a36Sopenharmony_ci } else { 198362306a36Sopenharmony_ci /* 198462306a36Sopenharmony_ci * If we've reached a file that is not associated with 198562306a36Sopenharmony_ci * an ep, then we need to check if the newly added 198662306a36Sopenharmony_ci * links are going to add too many wakeup paths. We do 198762306a36Sopenharmony_ci * this by adding it to the tfile_check_list, if it's 198862306a36Sopenharmony_ci * not already there, and calling reverse_path_check() 198962306a36Sopenharmony_ci * during ep_insert(). 
199062306a36Sopenharmony_ci */ 199162306a36Sopenharmony_ci list_file(epi->ffd.file); 199262306a36Sopenharmony_ci } 199362306a36Sopenharmony_ci } 199462306a36Sopenharmony_ci mutex_unlock(&ep->mtx); 199562306a36Sopenharmony_ci 199662306a36Sopenharmony_ci return error; 199762306a36Sopenharmony_ci} 199862306a36Sopenharmony_ci 199962306a36Sopenharmony_ci/** 200062306a36Sopenharmony_ci * ep_loop_check - Performs a check to verify that adding an epoll file (@to) 200162306a36Sopenharmony_ci * into another epoll file (represented by @ep) does not create 200262306a36Sopenharmony_ci * closed loops or too deep chains. 200362306a36Sopenharmony_ci * 200462306a36Sopenharmony_ci * @ep: Pointer to the epoll we are inserting into. 200562306a36Sopenharmony_ci * @to: Pointer to the epoll to be inserted. 200662306a36Sopenharmony_ci * 200762306a36Sopenharmony_ci * Return: %zero if adding the epoll @to inside the epoll @from 200862306a36Sopenharmony_ci * does not violate the constraints, or %-1 otherwise. 200962306a36Sopenharmony_ci */ 201062306a36Sopenharmony_cistatic int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) 201162306a36Sopenharmony_ci{ 201262306a36Sopenharmony_ci inserting_into = ep; 201362306a36Sopenharmony_ci return ep_loop_check_proc(to, 0); 201462306a36Sopenharmony_ci} 201562306a36Sopenharmony_ci 201662306a36Sopenharmony_cistatic void clear_tfile_check_list(void) 201762306a36Sopenharmony_ci{ 201862306a36Sopenharmony_ci rcu_read_lock(); 201962306a36Sopenharmony_ci while (tfile_check_list != EP_UNACTIVE_PTR) { 202062306a36Sopenharmony_ci struct epitems_head *head = tfile_check_list; 202162306a36Sopenharmony_ci tfile_check_list = head->next; 202262306a36Sopenharmony_ci unlist_file(head); 202362306a36Sopenharmony_ci } 202462306a36Sopenharmony_ci rcu_read_unlock(); 202562306a36Sopenharmony_ci} 202662306a36Sopenharmony_ci 202762306a36Sopenharmony_ci/* 202862306a36Sopenharmony_ci * Open an eventpoll file descriptor. 
202962306a36Sopenharmony_ci */ 203062306a36Sopenharmony_cistatic int do_epoll_create(int flags) 203162306a36Sopenharmony_ci{ 203262306a36Sopenharmony_ci int error, fd; 203362306a36Sopenharmony_ci struct eventpoll *ep = NULL; 203462306a36Sopenharmony_ci struct file *file; 203562306a36Sopenharmony_ci 203662306a36Sopenharmony_ci /* Check the EPOLL_* constant for consistency. */ 203762306a36Sopenharmony_ci BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 203862306a36Sopenharmony_ci 203962306a36Sopenharmony_ci if (flags & ~EPOLL_CLOEXEC) 204062306a36Sopenharmony_ci return -EINVAL; 204162306a36Sopenharmony_ci /* 204262306a36Sopenharmony_ci * Create the internal data structure ("struct eventpoll"). 204362306a36Sopenharmony_ci */ 204462306a36Sopenharmony_ci error = ep_alloc(&ep); 204562306a36Sopenharmony_ci if (error < 0) 204662306a36Sopenharmony_ci return error; 204762306a36Sopenharmony_ci /* 204862306a36Sopenharmony_ci * Creates all the items needed to setup an eventpoll file. That is, 204962306a36Sopenharmony_ci * a file structure and a free file descriptor. 
205062306a36Sopenharmony_ci */ 205162306a36Sopenharmony_ci fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); 205262306a36Sopenharmony_ci if (fd < 0) { 205362306a36Sopenharmony_ci error = fd; 205462306a36Sopenharmony_ci goto out_free_ep; 205562306a36Sopenharmony_ci } 205662306a36Sopenharmony_ci file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, 205762306a36Sopenharmony_ci O_RDWR | (flags & O_CLOEXEC)); 205862306a36Sopenharmony_ci if (IS_ERR(file)) { 205962306a36Sopenharmony_ci error = PTR_ERR(file); 206062306a36Sopenharmony_ci goto out_free_fd; 206162306a36Sopenharmony_ci } 206262306a36Sopenharmony_ci ep->file = file; 206362306a36Sopenharmony_ci fd_install(fd, file); 206462306a36Sopenharmony_ci return fd; 206562306a36Sopenharmony_ci 206662306a36Sopenharmony_ciout_free_fd: 206762306a36Sopenharmony_ci put_unused_fd(fd); 206862306a36Sopenharmony_ciout_free_ep: 206962306a36Sopenharmony_ci ep_clear_and_put(ep); 207062306a36Sopenharmony_ci return error; 207162306a36Sopenharmony_ci} 207262306a36Sopenharmony_ci 207362306a36Sopenharmony_ciSYSCALL_DEFINE1(epoll_create1, int, flags) 207462306a36Sopenharmony_ci{ 207562306a36Sopenharmony_ci return do_epoll_create(flags); 207662306a36Sopenharmony_ci} 207762306a36Sopenharmony_ci 207862306a36Sopenharmony_ciSYSCALL_DEFINE1(epoll_create, int, size) 207962306a36Sopenharmony_ci{ 208062306a36Sopenharmony_ci if (size <= 0) 208162306a36Sopenharmony_ci return -EINVAL; 208262306a36Sopenharmony_ci 208362306a36Sopenharmony_ci return do_epoll_create(0); 208462306a36Sopenharmony_ci} 208562306a36Sopenharmony_ci 208662306a36Sopenharmony_ci#ifdef CONFIG_PM_SLEEP 208762306a36Sopenharmony_cistatic inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) 208862306a36Sopenharmony_ci{ 208962306a36Sopenharmony_ci if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) 209062306a36Sopenharmony_ci epev->events &= ~EPOLLWAKEUP; 209162306a36Sopenharmony_ci} 209262306a36Sopenharmony_ci#else 209362306a36Sopenharmony_cistatic 
inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) 209462306a36Sopenharmony_ci{ 209562306a36Sopenharmony_ci epev->events &= ~EPOLLWAKEUP; 209662306a36Sopenharmony_ci} 209762306a36Sopenharmony_ci#endif 209862306a36Sopenharmony_ci 209962306a36Sopenharmony_cistatic inline int epoll_mutex_lock(struct mutex *mutex, int depth, 210062306a36Sopenharmony_ci bool nonblock) 210162306a36Sopenharmony_ci{ 210262306a36Sopenharmony_ci if (!nonblock) { 210362306a36Sopenharmony_ci mutex_lock_nested(mutex, depth); 210462306a36Sopenharmony_ci return 0; 210562306a36Sopenharmony_ci } 210662306a36Sopenharmony_ci if (mutex_trylock(mutex)) 210762306a36Sopenharmony_ci return 0; 210862306a36Sopenharmony_ci return -EAGAIN; 210962306a36Sopenharmony_ci} 211062306a36Sopenharmony_ci 211162306a36Sopenharmony_ciint do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, 211262306a36Sopenharmony_ci bool nonblock) 211362306a36Sopenharmony_ci{ 211462306a36Sopenharmony_ci int error; 211562306a36Sopenharmony_ci int full_check = 0; 211662306a36Sopenharmony_ci struct fd f, tf; 211762306a36Sopenharmony_ci struct eventpoll *ep; 211862306a36Sopenharmony_ci struct epitem *epi; 211962306a36Sopenharmony_ci struct eventpoll *tep = NULL; 212062306a36Sopenharmony_ci 212162306a36Sopenharmony_ci error = -EBADF; 212262306a36Sopenharmony_ci f = fdget(epfd); 212362306a36Sopenharmony_ci if (!f.file) 212462306a36Sopenharmony_ci goto error_return; 212562306a36Sopenharmony_ci 212662306a36Sopenharmony_ci /* Get the "struct file *" for the target file */ 212762306a36Sopenharmony_ci tf = fdget(fd); 212862306a36Sopenharmony_ci if (!tf.file) 212962306a36Sopenharmony_ci goto error_fput; 213062306a36Sopenharmony_ci 213162306a36Sopenharmony_ci /* The target file descriptor must support poll */ 213262306a36Sopenharmony_ci error = -EPERM; 213362306a36Sopenharmony_ci if (!file_can_poll(tf.file)) 213462306a36Sopenharmony_ci goto error_tgt_fput; 213562306a36Sopenharmony_ci 213662306a36Sopenharmony_ci /* Check if 
EPOLLWAKEUP is allowed */ 213762306a36Sopenharmony_ci if (ep_op_has_event(op)) 213862306a36Sopenharmony_ci ep_take_care_of_epollwakeup(epds); 213962306a36Sopenharmony_ci 214062306a36Sopenharmony_ci /* 214162306a36Sopenharmony_ci * We have to check that the file structure underneath the file descriptor 214262306a36Sopenharmony_ci * the user passed to us _is_ an eventpoll file. And also we do not permit 214362306a36Sopenharmony_ci * adding an epoll file descriptor inside itself. 214462306a36Sopenharmony_ci */ 214562306a36Sopenharmony_ci error = -EINVAL; 214662306a36Sopenharmony_ci if (f.file == tf.file || !is_file_epoll(f.file)) 214762306a36Sopenharmony_ci goto error_tgt_fput; 214862306a36Sopenharmony_ci 214962306a36Sopenharmony_ci /* 215062306a36Sopenharmony_ci * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, 215162306a36Sopenharmony_ci * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. 215262306a36Sopenharmony_ci * Also, we do not currently supported nested exclusive wakeups. 215362306a36Sopenharmony_ci */ 215462306a36Sopenharmony_ci if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { 215562306a36Sopenharmony_ci if (op == EPOLL_CTL_MOD) 215662306a36Sopenharmony_ci goto error_tgt_fput; 215762306a36Sopenharmony_ci if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || 215862306a36Sopenharmony_ci (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) 215962306a36Sopenharmony_ci goto error_tgt_fput; 216062306a36Sopenharmony_ci } 216162306a36Sopenharmony_ci 216262306a36Sopenharmony_ci /* 216362306a36Sopenharmony_ci * At this point it is safe to assume that the "private_data" contains 216462306a36Sopenharmony_ci * our own data structure. 
216562306a36Sopenharmony_ci */ 216662306a36Sopenharmony_ci ep = f.file->private_data; 216762306a36Sopenharmony_ci 216862306a36Sopenharmony_ci /* 216962306a36Sopenharmony_ci * When we insert an epoll file descriptor inside another epoll file 217062306a36Sopenharmony_ci * descriptor, there is the chance of creating closed loops, which are 217162306a36Sopenharmony_ci * better be handled here, than in more critical paths. While we are 217262306a36Sopenharmony_ci * checking for loops we also determine the list of files reachable 217362306a36Sopenharmony_ci * and hang them on the tfile_check_list, so we can check that we 217462306a36Sopenharmony_ci * haven't created too many possible wakeup paths. 217562306a36Sopenharmony_ci * 217662306a36Sopenharmony_ci * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when 217762306a36Sopenharmony_ci * the epoll file descriptor is attaching directly to a wakeup source, 217862306a36Sopenharmony_ci * unless the epoll file descriptor is nested. The purpose of taking the 217962306a36Sopenharmony_ci * 'epnested_mutex' on add is to prevent complex toplogies such as loops and 218062306a36Sopenharmony_ci * deep wakeup paths from forming in parallel through multiple 218162306a36Sopenharmony_ci * EPOLL_CTL_ADD operations. 
218262306a36Sopenharmony_ci */ 218362306a36Sopenharmony_ci error = epoll_mutex_lock(&ep->mtx, 0, nonblock); 218462306a36Sopenharmony_ci if (error) 218562306a36Sopenharmony_ci goto error_tgt_fput; 218662306a36Sopenharmony_ci if (op == EPOLL_CTL_ADD) { 218762306a36Sopenharmony_ci if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || 218862306a36Sopenharmony_ci is_file_epoll(tf.file)) { 218962306a36Sopenharmony_ci mutex_unlock(&ep->mtx); 219062306a36Sopenharmony_ci error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); 219162306a36Sopenharmony_ci if (error) 219262306a36Sopenharmony_ci goto error_tgt_fput; 219362306a36Sopenharmony_ci loop_check_gen++; 219462306a36Sopenharmony_ci full_check = 1; 219562306a36Sopenharmony_ci if (is_file_epoll(tf.file)) { 219662306a36Sopenharmony_ci tep = tf.file->private_data; 219762306a36Sopenharmony_ci error = -ELOOP; 219862306a36Sopenharmony_ci if (ep_loop_check(ep, tep) != 0) 219962306a36Sopenharmony_ci goto error_tgt_fput; 220062306a36Sopenharmony_ci } 220162306a36Sopenharmony_ci error = epoll_mutex_lock(&ep->mtx, 0, nonblock); 220262306a36Sopenharmony_ci if (error) 220362306a36Sopenharmony_ci goto error_tgt_fput; 220462306a36Sopenharmony_ci } 220562306a36Sopenharmony_ci } 220662306a36Sopenharmony_ci 220762306a36Sopenharmony_ci /* 220862306a36Sopenharmony_ci * Try to lookup the file inside our RB tree. Since we grabbed "mtx" 220962306a36Sopenharmony_ci * above, we can be sure to be able to use the item looked up by 221062306a36Sopenharmony_ci * ep_find() till we release the mutex. 
221162306a36Sopenharmony_ci */ 221262306a36Sopenharmony_ci epi = ep_find(ep, tf.file, fd); 221362306a36Sopenharmony_ci 221462306a36Sopenharmony_ci error = -EINVAL; 221562306a36Sopenharmony_ci switch (op) { 221662306a36Sopenharmony_ci case EPOLL_CTL_ADD: 221762306a36Sopenharmony_ci if (!epi) { 221862306a36Sopenharmony_ci epds->events |= EPOLLERR | EPOLLHUP; 221962306a36Sopenharmony_ci error = ep_insert(ep, epds, tf.file, fd, full_check); 222062306a36Sopenharmony_ci } else 222162306a36Sopenharmony_ci error = -EEXIST; 222262306a36Sopenharmony_ci break; 222362306a36Sopenharmony_ci case EPOLL_CTL_DEL: 222462306a36Sopenharmony_ci if (epi) { 222562306a36Sopenharmony_ci /* 222662306a36Sopenharmony_ci * The eventpoll itself is still alive: the refcount 222762306a36Sopenharmony_ci * can't go to zero here. 222862306a36Sopenharmony_ci */ 222962306a36Sopenharmony_ci ep_remove_safe(ep, epi); 223062306a36Sopenharmony_ci error = 0; 223162306a36Sopenharmony_ci } else { 223262306a36Sopenharmony_ci error = -ENOENT; 223362306a36Sopenharmony_ci } 223462306a36Sopenharmony_ci break; 223562306a36Sopenharmony_ci case EPOLL_CTL_MOD: 223662306a36Sopenharmony_ci if (epi) { 223762306a36Sopenharmony_ci if (!(epi->event.events & EPOLLEXCLUSIVE)) { 223862306a36Sopenharmony_ci epds->events |= EPOLLERR | EPOLLHUP; 223962306a36Sopenharmony_ci error = ep_modify(ep, epi, epds); 224062306a36Sopenharmony_ci } 224162306a36Sopenharmony_ci } else 224262306a36Sopenharmony_ci error = -ENOENT; 224362306a36Sopenharmony_ci break; 224462306a36Sopenharmony_ci } 224562306a36Sopenharmony_ci mutex_unlock(&ep->mtx); 224662306a36Sopenharmony_ci 224762306a36Sopenharmony_cierror_tgt_fput: 224862306a36Sopenharmony_ci if (full_check) { 224962306a36Sopenharmony_ci clear_tfile_check_list(); 225062306a36Sopenharmony_ci loop_check_gen++; 225162306a36Sopenharmony_ci mutex_unlock(&epnested_mutex); 225262306a36Sopenharmony_ci } 225362306a36Sopenharmony_ci 225462306a36Sopenharmony_ci fdput(tf); 
225562306a36Sopenharmony_cierror_fput: 225662306a36Sopenharmony_ci fdput(f); 225762306a36Sopenharmony_cierror_return: 225862306a36Sopenharmony_ci 225962306a36Sopenharmony_ci return error; 226062306a36Sopenharmony_ci} 226162306a36Sopenharmony_ci 226262306a36Sopenharmony_ci/* 226362306a36Sopenharmony_ci * The following function implements the controller interface for 226462306a36Sopenharmony_ci * the eventpoll file that enables the insertion/removal/change of 226562306a36Sopenharmony_ci * file descriptors inside the interest set. 226662306a36Sopenharmony_ci */ 226762306a36Sopenharmony_ciSYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, 226862306a36Sopenharmony_ci struct epoll_event __user *, event) 226962306a36Sopenharmony_ci{ 227062306a36Sopenharmony_ci struct epoll_event epds; 227162306a36Sopenharmony_ci 227262306a36Sopenharmony_ci if (ep_op_has_event(op) && 227362306a36Sopenharmony_ci copy_from_user(&epds, event, sizeof(struct epoll_event))) 227462306a36Sopenharmony_ci return -EFAULT; 227562306a36Sopenharmony_ci 227662306a36Sopenharmony_ci return do_epoll_ctl(epfd, op, fd, &epds, false); 227762306a36Sopenharmony_ci} 227862306a36Sopenharmony_ci 227962306a36Sopenharmony_ci/* 228062306a36Sopenharmony_ci * Implement the event wait interface for the eventpoll file. It is the kernel 228162306a36Sopenharmony_ci * part of the user space epoll_wait(2). 
228262306a36Sopenharmony_ci */ 228362306a36Sopenharmony_cistatic int do_epoll_wait(int epfd, struct epoll_event __user *events, 228462306a36Sopenharmony_ci int maxevents, struct timespec64 *to) 228562306a36Sopenharmony_ci{ 228662306a36Sopenharmony_ci int error; 228762306a36Sopenharmony_ci struct fd f; 228862306a36Sopenharmony_ci struct eventpoll *ep; 228962306a36Sopenharmony_ci 229062306a36Sopenharmony_ci /* The maximum number of event must be greater than zero */ 229162306a36Sopenharmony_ci if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 229262306a36Sopenharmony_ci return -EINVAL; 229362306a36Sopenharmony_ci 229462306a36Sopenharmony_ci /* Verify that the area passed by the user is writeable */ 229562306a36Sopenharmony_ci if (!access_ok(events, maxevents * sizeof(struct epoll_event))) 229662306a36Sopenharmony_ci return -EFAULT; 229762306a36Sopenharmony_ci 229862306a36Sopenharmony_ci /* Get the "struct file *" for the eventpoll file */ 229962306a36Sopenharmony_ci f = fdget(epfd); 230062306a36Sopenharmony_ci if (!f.file) 230162306a36Sopenharmony_ci return -EBADF; 230262306a36Sopenharmony_ci 230362306a36Sopenharmony_ci /* 230462306a36Sopenharmony_ci * We have to check that the file structure underneath the fd 230562306a36Sopenharmony_ci * the user passed to us _is_ an eventpoll file. 230662306a36Sopenharmony_ci */ 230762306a36Sopenharmony_ci error = -EINVAL; 230862306a36Sopenharmony_ci if (!is_file_epoll(f.file)) 230962306a36Sopenharmony_ci goto error_fput; 231062306a36Sopenharmony_ci 231162306a36Sopenharmony_ci /* 231262306a36Sopenharmony_ci * At this point it is safe to assume that the "private_data" contains 231362306a36Sopenharmony_ci * our own data structure. 231462306a36Sopenharmony_ci */ 231562306a36Sopenharmony_ci ep = f.file->private_data; 231662306a36Sopenharmony_ci 231762306a36Sopenharmony_ci /* Time to fish for events ... 
*/ 231862306a36Sopenharmony_ci error = ep_poll(ep, events, maxevents, to); 231962306a36Sopenharmony_ci 232062306a36Sopenharmony_cierror_fput: 232162306a36Sopenharmony_ci fdput(f); 232262306a36Sopenharmony_ci return error; 232362306a36Sopenharmony_ci} 232462306a36Sopenharmony_ci 232562306a36Sopenharmony_ciSYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, 232662306a36Sopenharmony_ci int, maxevents, int, timeout) 232762306a36Sopenharmony_ci{ 232862306a36Sopenharmony_ci struct timespec64 to; 232962306a36Sopenharmony_ci 233062306a36Sopenharmony_ci return do_epoll_wait(epfd, events, maxevents, 233162306a36Sopenharmony_ci ep_timeout_to_timespec(&to, timeout)); 233262306a36Sopenharmony_ci} 233362306a36Sopenharmony_ci 233462306a36Sopenharmony_ci/* 233562306a36Sopenharmony_ci * Implement the event wait interface for the eventpoll file. It is the kernel 233662306a36Sopenharmony_ci * part of the user space epoll_pwait(2). 233762306a36Sopenharmony_ci */ 233862306a36Sopenharmony_cistatic int do_epoll_pwait(int epfd, struct epoll_event __user *events, 233962306a36Sopenharmony_ci int maxevents, struct timespec64 *to, 234062306a36Sopenharmony_ci const sigset_t __user *sigmask, size_t sigsetsize) 234162306a36Sopenharmony_ci{ 234262306a36Sopenharmony_ci int error; 234362306a36Sopenharmony_ci 234462306a36Sopenharmony_ci /* 234562306a36Sopenharmony_ci * If the caller wants a certain signal mask to be set during the wait, 234662306a36Sopenharmony_ci * we apply it here. 
234762306a36Sopenharmony_ci */ 234862306a36Sopenharmony_ci error = set_user_sigmask(sigmask, sigsetsize); 234962306a36Sopenharmony_ci if (error) 235062306a36Sopenharmony_ci return error; 235162306a36Sopenharmony_ci 235262306a36Sopenharmony_ci error = do_epoll_wait(epfd, events, maxevents, to); 235362306a36Sopenharmony_ci 235462306a36Sopenharmony_ci restore_saved_sigmask_unless(error == -EINTR); 235562306a36Sopenharmony_ci 235662306a36Sopenharmony_ci return error; 235762306a36Sopenharmony_ci} 235862306a36Sopenharmony_ci 235962306a36Sopenharmony_ciSYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, 236062306a36Sopenharmony_ci int, maxevents, int, timeout, const sigset_t __user *, sigmask, 236162306a36Sopenharmony_ci size_t, sigsetsize) 236262306a36Sopenharmony_ci{ 236362306a36Sopenharmony_ci struct timespec64 to; 236462306a36Sopenharmony_ci 236562306a36Sopenharmony_ci return do_epoll_pwait(epfd, events, maxevents, 236662306a36Sopenharmony_ci ep_timeout_to_timespec(&to, timeout), 236762306a36Sopenharmony_ci sigmask, sigsetsize); 236862306a36Sopenharmony_ci} 236962306a36Sopenharmony_ci 237062306a36Sopenharmony_ciSYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, 237162306a36Sopenharmony_ci int, maxevents, const struct __kernel_timespec __user *, timeout, 237262306a36Sopenharmony_ci const sigset_t __user *, sigmask, size_t, sigsetsize) 237362306a36Sopenharmony_ci{ 237462306a36Sopenharmony_ci struct timespec64 ts, *to = NULL; 237562306a36Sopenharmony_ci 237662306a36Sopenharmony_ci if (timeout) { 237762306a36Sopenharmony_ci if (get_timespec64(&ts, timeout)) 237862306a36Sopenharmony_ci return -EFAULT; 237962306a36Sopenharmony_ci to = &ts; 238062306a36Sopenharmony_ci if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) 238162306a36Sopenharmony_ci return -EINVAL; 238262306a36Sopenharmony_ci } 238362306a36Sopenharmony_ci 238462306a36Sopenharmony_ci return do_epoll_pwait(epfd, events, maxevents, to, 
238562306a36Sopenharmony_ci sigmask, sigsetsize); 238662306a36Sopenharmony_ci} 238762306a36Sopenharmony_ci 238862306a36Sopenharmony_ci#ifdef CONFIG_COMPAT 238962306a36Sopenharmony_cistatic int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, 239062306a36Sopenharmony_ci int maxevents, struct timespec64 *timeout, 239162306a36Sopenharmony_ci const compat_sigset_t __user *sigmask, 239262306a36Sopenharmony_ci compat_size_t sigsetsize) 239362306a36Sopenharmony_ci{ 239462306a36Sopenharmony_ci long err; 239562306a36Sopenharmony_ci 239662306a36Sopenharmony_ci /* 239762306a36Sopenharmony_ci * If the caller wants a certain signal mask to be set during the wait, 239862306a36Sopenharmony_ci * we apply it here. 239962306a36Sopenharmony_ci */ 240062306a36Sopenharmony_ci err = set_compat_user_sigmask(sigmask, sigsetsize); 240162306a36Sopenharmony_ci if (err) 240262306a36Sopenharmony_ci return err; 240362306a36Sopenharmony_ci 240462306a36Sopenharmony_ci err = do_epoll_wait(epfd, events, maxevents, timeout); 240562306a36Sopenharmony_ci 240662306a36Sopenharmony_ci restore_saved_sigmask_unless(err == -EINTR); 240762306a36Sopenharmony_ci 240862306a36Sopenharmony_ci return err; 240962306a36Sopenharmony_ci} 241062306a36Sopenharmony_ci 241162306a36Sopenharmony_ciCOMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, 241262306a36Sopenharmony_ci struct epoll_event __user *, events, 241362306a36Sopenharmony_ci int, maxevents, int, timeout, 241462306a36Sopenharmony_ci const compat_sigset_t __user *, sigmask, 241562306a36Sopenharmony_ci compat_size_t, sigsetsize) 241662306a36Sopenharmony_ci{ 241762306a36Sopenharmony_ci struct timespec64 to; 241862306a36Sopenharmony_ci 241962306a36Sopenharmony_ci return do_compat_epoll_pwait(epfd, events, maxevents, 242062306a36Sopenharmony_ci ep_timeout_to_timespec(&to, timeout), 242162306a36Sopenharmony_ci sigmask, sigsetsize); 242262306a36Sopenharmony_ci} 242362306a36Sopenharmony_ci 242462306a36Sopenharmony_ciCOMPAT_SYSCALL_DEFINE6(epoll_pwait2, 
int, epfd, 242562306a36Sopenharmony_ci struct epoll_event __user *, events, 242662306a36Sopenharmony_ci int, maxevents, 242762306a36Sopenharmony_ci const struct __kernel_timespec __user *, timeout, 242862306a36Sopenharmony_ci const compat_sigset_t __user *, sigmask, 242962306a36Sopenharmony_ci compat_size_t, sigsetsize) 243062306a36Sopenharmony_ci{ 243162306a36Sopenharmony_ci struct timespec64 ts, *to = NULL; 243262306a36Sopenharmony_ci 243362306a36Sopenharmony_ci if (timeout) { 243462306a36Sopenharmony_ci if (get_timespec64(&ts, timeout)) 243562306a36Sopenharmony_ci return -EFAULT; 243662306a36Sopenharmony_ci to = &ts; 243762306a36Sopenharmony_ci if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) 243862306a36Sopenharmony_ci return -EINVAL; 243962306a36Sopenharmony_ci } 244062306a36Sopenharmony_ci 244162306a36Sopenharmony_ci return do_compat_epoll_pwait(epfd, events, maxevents, to, 244262306a36Sopenharmony_ci sigmask, sigsetsize); 244362306a36Sopenharmony_ci} 244462306a36Sopenharmony_ci 244562306a36Sopenharmony_ci#endif 244662306a36Sopenharmony_ci 244762306a36Sopenharmony_cistatic int __init eventpoll_init(void) 244862306a36Sopenharmony_ci{ 244962306a36Sopenharmony_ci struct sysinfo si; 245062306a36Sopenharmony_ci 245162306a36Sopenharmony_ci si_meminfo(&si); 245262306a36Sopenharmony_ci /* 245362306a36Sopenharmony_ci * Allows top 4% of lomem to be allocated for epoll watches (per user). 
245462306a36Sopenharmony_ci */ 245562306a36Sopenharmony_ci max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / 245662306a36Sopenharmony_ci EP_ITEM_COST; 245762306a36Sopenharmony_ci BUG_ON(max_user_watches < 0); 245862306a36Sopenharmony_ci 245962306a36Sopenharmony_ci /* 246062306a36Sopenharmony_ci * We can have many thousands of epitems, so prevent this from 246162306a36Sopenharmony_ci * using an extra cache line on 64-bit (and smaller) CPUs 246262306a36Sopenharmony_ci */ 246362306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); 246462306a36Sopenharmony_ci 246562306a36Sopenharmony_ci /* Allocates slab cache used to allocate "struct epitem" items */ 246662306a36Sopenharmony_ci epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 246762306a36Sopenharmony_ci 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); 246862306a36Sopenharmony_ci 246962306a36Sopenharmony_ci /* Allocates slab cache used to allocate "struct eppoll_entry" */ 247062306a36Sopenharmony_ci pwq_cache = kmem_cache_create("eventpoll_pwq", 247162306a36Sopenharmony_ci sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); 247262306a36Sopenharmony_ci epoll_sysctls_init(); 247362306a36Sopenharmony_ci 247462306a36Sopenharmony_ci ephead_cache = kmem_cache_create("ep_head", 247562306a36Sopenharmony_ci sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); 247662306a36Sopenharmony_ci 247762306a36Sopenharmony_ci return 0; 247862306a36Sopenharmony_ci} 247962306a36Sopenharmony_cifs_initcall(eventpoll_init); 2480