// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009	 Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, which might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * Then we also need a global mutex to serialize eventpoll_release_file()
 * and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is also acquired when inserting an epoll fd onto another epoll
 * fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */
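
/*
 * As a purely illustrative userspace sketch of the nested-add case the
 * ordering rules above describe (names and flags are examples only):
 *
 *   efd1 = epoll_create1(0);
 *   efd2 = epoll_create1(0);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, some_fd, &ev);
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, efd2, &ev);
 *
 * The last call nests efd2's "ep->mtx" under efd1's "ep->mtx", which is the
 * e1-before-e2 ordering (and the recursion-depth lockdep subkey scheme)
 * discussed above.
 */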

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
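
/*
 * Note: epoll_wait(2) callers are bounded by this value; a maxevents outside
 * the range 1..EP_MAX_EVENTS is rejected with -EINVAL later in this file
 * (do_epoll_wait()), which keeps the event copy to user space within
 * INT_MAX bytes.
 */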

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/*
 * Structure used to track possible nested calls, to detect overly deep
 * recursion and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as a collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * singly linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* Lock which protects rdllist and ovflist */
	rwlock_t lock;

	/* RB tree root used to store monitored fd structs */
	struct rb_root_cached rbr;

	/*
	 * This is a singly linked list that chains all the "struct epitem" that
	 * had events queued by the poll callback while we were transferring
	 * ready events to userspace without holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	u64 gen;

#ifdef CONFIG_NET_RX_BUSY_POLL
	/* used to track busy poll napi_id */
	unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* tracks wakeup nests for lockdep validation */
	u8 nests;
#endif
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_entry_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
	int res;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &long_zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */
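
/*
 * The limit above is exposed as /proc/sys/fs/epoll/max_user_watches. A
 * minimal userspace sketch for reading it (illustrative only, error
 * handling omitted):
 *
 *   int fd = open("/proc/sys/fs/epoll/max_user_watches", O_RDONLY);
 *   char buf[32];
 *   ssize_t n = read(fd, buf, sizeof(buf) - 1);
 *   buf[n > 0 ? n : 0] = '\0';
 *   printf("max_user_watches: %s", buf);
 *   close(fd);
 */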

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Set up the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct epitem *epi)
{
	return !list_empty(&epi->rdllink);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Returns: a value different from zero if ready events are available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty_careful(&ep->rdllist) ||
		READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
	struct eventpoll *ep = p;

	return ep_events_available(ep) || busy_loop_timeout(start_time);
}

/*
 * Busy poll if busy polling is globally enabled, a supporting socket was
 * found and there are no events. The busy loop returns once need_resched()
 * or ep_events_available() becomes true.
 *
 * We must do our busy polling with irqs enabled.
 */
static void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
	unsigned int napi_id = READ_ONCE(ep->napi_id);

	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
	if (ep->napi_id)
		ep->napi_id = 0;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
	struct eventpoll *ep;
	unsigned int napi_id;
	struct socket *sock;
	struct sock *sk;
	int err;

	if (!net_busy_loop_on())
		return;

	sock = sock_from_file(epi->ffd.file, &err);
	if (!sock)
		return;

	sk = sock->sk;
	if (!sk)
		return;

	napi_id = READ_ONCE(sk->sk_napi_id);
	ep = epi->ep;

	/* Non-NAPI IDs can be rejected
	 *	or
	 * Nothing to do if we already have this ID
	 */
	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
		return;

	/* record NAPI ID for use in next busy poll */
	ep->napi_id = napi_id;
}

#else

static inline void ep_busy_loop(struct eventpoll *ep, int nonblock)
{
}

static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
{
}

static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (in the sense of the same cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
			/*
			 * Oops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}

/*
 * As described in commit 0ccf831cb ("lockdep: annotate epoll"), the use of
 * wait queues used by epoll is done in a very controlled
 * manner. Wake ups can nest inside each other, but are never done
 * with the same locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked the recursion
 * constraints. That is, no more than EP_MAX_POLLWAKE_NESTS, to
 * avoid stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
			     unsigned pollflags)
{
	struct eventpoll *ep_src;
	unsigned long flags;
	u8 nests = 0;

	/*
	 * To set the subclass or nesting level for spin_lock_irqsave_nested()
	 * it might be natural to create a per-cpu nest count. However, since
	 * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
	 * schedule() in the -rt kernel, the per-cpu variables are no longer
	 * protected. Thus, we are introducing a per-eventpoll nest field.
	 * If we are not being called from ep_poll_callback(), epi is NULL and
	 * we are at the first level of nesting, 0. Otherwise, we are being
	 * called from ep_poll_callback() and if a previous wakeup source is
	 * not an epoll file itself, we are at depth 1 since the wakeup source
	 * is depth 0. If the wakeup source is a previous epoll file in the
	 * wakeup chain then we use its nests value and record ours as
	 * nests + 1. The previous epoll file nests value is stable since it is
	 * already holding its own poll_wait.lock.
	 */
	if (epi) {
		if ((is_file_epoll(epi->ffd.file))) {
			ep_src = epi->ffd.file->private_data;
			nests = ep_src->nests;
		} else {
			nests = 1;
		}
	}
	spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
	ep->nests = nests + 1;
	wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
	ep->nests = 0;
	spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
			     unsigned pollflags)
{
	wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/*
	 * If it is cleared by POLLFREE, it should be rcu-safe.
	 * If we read NULL we need a barrier paired with
	 * smp_store_release() in ep_poll_callback(), otherwise
	 * we rely on whead->lock.
	 */
	whead = smp_load_acquire(&pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
 * ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while (!list_empty(lsthead)) {
		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

		list_del(&pwq->llink);
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}

/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (ws)
		__pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
	return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible
 *                      for the scan code to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static __poll_t ep_scan_ready_list(struct eventpoll *ep,
			      __poll_t (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	__poll_t res;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	lockdep_assert_irqs_enabled();

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping without locks are not lost. We cannot
	 * have the poll callback queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	write_lock_irq(&ep->lock);
	list_splice_init(&ep->rdllist, &txlist);
	WRITE_ONCE(ep->ovflist, NULL);
	write_unlock_irq(&ep->lock);

	/*
	 * Now call the callback function.
	 */
	res = (*sproc)(ep, &txlist, priv);

	write_lock_irq(&ep->lock);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(epi)) {
			/*
			 * ->ovflist is LIFO, so we have to reverse it in order
			 * to keep it in FIFO order.
			 */
			list_add(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
	}

	write_unlock_irq(&ep->lock);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	return res;
}

static void epi_rcu_free(struct rcu_head *head)
{
	struct epitem *epi = container_of(head, struct epitem, rcu);
	kmem_cache_free(epi_cache, epi);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	struct file *file = epi->ffd.file;

	lockdep_assert_irqs_enabled();

	/*
	 * Removes poll wait queue hooks.
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase_cached(&epi->rbn, &ep->rbr);

	write_lock_irq(&ep->lock);
	if (ep_is_linked(epi))
		list_del_init(&epi->rdllink);
	write_unlock_irq(&ep->lock);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
	 * use of the rbn field.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}

static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting for this file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(ep, NULL, 0);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->mtx" here because the epoll file
	 * is on its way to being removed and no one has references to it
	 * anymore. The only hit might come from eventpoll_release_file(), but
	 * holding "epmutex" is sufficient here.
	 */
	mutex_lock(&epmutex);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Walks through the whole tree by freeing each "struct epitem". At this
	 * point we are sure no poll callbacks will be lingering around, and also by
	 * holding "epmutex" we can be sure that no file cleanup code will hit
	 * us during this operation. So we can avoid the lock on "ep->lock".
	 * We do not need to lock ep->mtx, either, we only do it to prevent
	 * a lockdep warning.
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first_cached(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_free(ep);

	return 0;
}

static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt);

/*
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
				 int depth)
{
	struct eventpoll *ep;
	bool locked;

	pt->_key = epi->event.events;
	if (!is_file_epoll(epi->ffd.file))
		return vfs_poll(epi->ffd.file, pt) & epi->event.events;

	ep = epi->ffd.file->private_data;
	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
	locked = pt && (pt->_qproc == ep_ptable_queue_proc);

	return ep_scan_ready_list(epi->ffd.file->private_data,
				  ep_read_events_proc, &depth, depth,
				  locked) & epi->event.events;
}

static __poll_t ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *epi, *tmp;
	poll_table pt;
	int depth = *(int *)priv;

	init_poll_funcptr(&pt, NULL);
	depth++;

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (ep_item_poll(epi, &pt, depth)) {
			return EPOLLIN | EPOLLRDNORM;
		} else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as the
			 * caller-requested events go. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}

static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;
	int depth = 0;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list.
	 */
	return ep_scan_ready_list(ep, ep_read_events_proc,
				  &depth, depth, false);
}

#ifdef CONFIG_PROC_FS
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
		struct inode *inode = file_inode(epi->ffd.file);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx "
			   " pos:%lli ino:%lx sdev:%x\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data,
			   (long long)epi->ffd.file->f_pos,
			   inode->i_ino, inode->i_sb->s_dev);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
#endif
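
/*
 * A /proc/<pid>/fdinfo/<epfd> line produced by ep_show_fdinfo() above might
 * look like the following (all values are illustrative only):
 *
 *   tfd:        5 events:       19 data:     55d3a6b0c2a0  pos:0 ino:23f2 sdev:8
 */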

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need this facility to correctly clean up files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi, *next;

	/*
	 * We don't need to take "file->f_lock" here, because we're in the
	 * "struct file" cleanup path, which means no one is using this file
	 * anymore. So, for example, epoll_ctl() cannot hit here since if we
	 * reach this point, the file counter already went to zero and fget()
	 * would fail. The only hit might come from ep_free(), but holding
	 * the mutex will correctly serialize the operation. We do need to
	 * acquire "ep->mtx" after "epmutex" because ep_remove() requires it
	 * when called from anywhere but ep_free().
	 *
	 * Besides, ep_remove() acquires "file->f_lock", so we can't hold it
	 * here.
	 */
10048c2ecf20Sopenharmony_ci	mutex_lock(&epmutex);
10058c2ecf20Sopenharmony_ci	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
10068c2ecf20Sopenharmony_ci		ep = epi->ep;
10078c2ecf20Sopenharmony_ci		mutex_lock_nested(&ep->mtx, 0);
10088c2ecf20Sopenharmony_ci		ep_remove(ep, epi);
10098c2ecf20Sopenharmony_ci		mutex_unlock(&ep->mtx);
10108c2ecf20Sopenharmony_ci	}
10118c2ecf20Sopenharmony_ci	mutex_unlock(&epmutex);
10128c2ecf20Sopenharmony_ci}
10138c2ecf20Sopenharmony_ci
10148c2ecf20Sopenharmony_cistatic int ep_alloc(struct eventpoll **pep)
10158c2ecf20Sopenharmony_ci{
10168c2ecf20Sopenharmony_ci	int error;
10178c2ecf20Sopenharmony_ci	struct user_struct *user;
10188c2ecf20Sopenharmony_ci	struct eventpoll *ep;
10198c2ecf20Sopenharmony_ci
10208c2ecf20Sopenharmony_ci	user = get_current_user();
10218c2ecf20Sopenharmony_ci	error = -ENOMEM;
10228c2ecf20Sopenharmony_ci	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
10238c2ecf20Sopenharmony_ci	if (unlikely(!ep))
10248c2ecf20Sopenharmony_ci		goto free_uid;
10258c2ecf20Sopenharmony_ci
10268c2ecf20Sopenharmony_ci	mutex_init(&ep->mtx);
10278c2ecf20Sopenharmony_ci	rwlock_init(&ep->lock);
10288c2ecf20Sopenharmony_ci	init_waitqueue_head(&ep->wq);
10298c2ecf20Sopenharmony_ci	init_waitqueue_head(&ep->poll_wait);
10308c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&ep->rdllist);
10318c2ecf20Sopenharmony_ci	ep->rbr = RB_ROOT_CACHED;
10328c2ecf20Sopenharmony_ci	ep->ovflist = EP_UNACTIVE_PTR;
10338c2ecf20Sopenharmony_ci	ep->user = user;
10348c2ecf20Sopenharmony_ci
10358c2ecf20Sopenharmony_ci	*pep = ep;
10368c2ecf20Sopenharmony_ci
10378c2ecf20Sopenharmony_ci	return 0;
10388c2ecf20Sopenharmony_ci
10398c2ecf20Sopenharmony_cifree_uid:
10408c2ecf20Sopenharmony_ci	free_uid(user);
10418c2ecf20Sopenharmony_ci	return error;
10428c2ecf20Sopenharmony_ci}
10438c2ecf20Sopenharmony_ci
10448c2ecf20Sopenharmony_ci/*
10458c2ecf20Sopenharmony_ci * Search the file inside the eventpoll tree. The RB tree operations
10468c2ecf20Sopenharmony_ci * are protected by the "mtx" mutex, and ep_find() must be called with
10478c2ecf20Sopenharmony_ci * "mtx" held.
10488c2ecf20Sopenharmony_ci */
10498c2ecf20Sopenharmony_cistatic struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
10508c2ecf20Sopenharmony_ci{
10518c2ecf20Sopenharmony_ci	int kcmp;
10528c2ecf20Sopenharmony_ci	struct rb_node *rbp;
10538c2ecf20Sopenharmony_ci	struct epitem *epi, *epir = NULL;
10548c2ecf20Sopenharmony_ci	struct epoll_filefd ffd;
10558c2ecf20Sopenharmony_ci
10568c2ecf20Sopenharmony_ci	ep_set_ffd(&ffd, file, fd);
10578c2ecf20Sopenharmony_ci	for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
10588c2ecf20Sopenharmony_ci		epi = rb_entry(rbp, struct epitem, rbn);
10598c2ecf20Sopenharmony_ci		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
10608c2ecf20Sopenharmony_ci		if (kcmp > 0)
10618c2ecf20Sopenharmony_ci			rbp = rbp->rb_right;
10628c2ecf20Sopenharmony_ci		else if (kcmp < 0)
10638c2ecf20Sopenharmony_ci			rbp = rbp->rb_left;
10648c2ecf20Sopenharmony_ci		else {
10658c2ecf20Sopenharmony_ci			epir = epi;
10668c2ecf20Sopenharmony_ci			break;
10678c2ecf20Sopenharmony_ci		}
10688c2ecf20Sopenharmony_ci	}
10698c2ecf20Sopenharmony_ci
10708c2ecf20Sopenharmony_ci	return epir;
10718c2ecf20Sopenharmony_ci}
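
/*
 * For illustration: the tree is keyed on the {file *, fd} pair set up by
 * ep_set_ffd(), so the same underlying struct file registered through two
 * different descriptors (e.g. after dup()) yields two distinct epitems.
 * A lookup therefore has to supply both halves of the key, sketched as:
 *
 *	mutex_lock(&ep->mtx);
 *	epi = ep_find(ep, tfile, fd);
 *	mutex_unlock(&ep->mtx);
 *
 * where a NULL result means {tfile, fd} was never added to this instance.
 */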
10728c2ecf20Sopenharmony_ci
10738c2ecf20Sopenharmony_ci#ifdef CONFIG_KCMP
10748c2ecf20Sopenharmony_cistatic struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
10758c2ecf20Sopenharmony_ci{
10768c2ecf20Sopenharmony_ci	struct rb_node *rbp;
10778c2ecf20Sopenharmony_ci	struct epitem *epi;
10788c2ecf20Sopenharmony_ci
10798c2ecf20Sopenharmony_ci	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
10808c2ecf20Sopenharmony_ci		epi = rb_entry(rbp, struct epitem, rbn);
10818c2ecf20Sopenharmony_ci		if (epi->ffd.fd == tfd) {
10828c2ecf20Sopenharmony_ci			if (toff == 0)
10838c2ecf20Sopenharmony_ci				return epi;
10848c2ecf20Sopenharmony_ci			else
10858c2ecf20Sopenharmony_ci				toff--;
10868c2ecf20Sopenharmony_ci		}
10878c2ecf20Sopenharmony_ci		cond_resched();
10888c2ecf20Sopenharmony_ci	}
10898c2ecf20Sopenharmony_ci
10908c2ecf20Sopenharmony_ci	return NULL;
10918c2ecf20Sopenharmony_ci}
10928c2ecf20Sopenharmony_ci
10938c2ecf20Sopenharmony_cistruct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
10948c2ecf20Sopenharmony_ci				     unsigned long toff)
10958c2ecf20Sopenharmony_ci{
10968c2ecf20Sopenharmony_ci	struct file *file_raw;
10978c2ecf20Sopenharmony_ci	struct eventpoll *ep;
10988c2ecf20Sopenharmony_ci	struct epitem *epi;
10998c2ecf20Sopenharmony_ci
11008c2ecf20Sopenharmony_ci	if (!is_file_epoll(file))
11018c2ecf20Sopenharmony_ci		return ERR_PTR(-EINVAL);
11028c2ecf20Sopenharmony_ci
11038c2ecf20Sopenharmony_ci	ep = file->private_data;
11048c2ecf20Sopenharmony_ci
11058c2ecf20Sopenharmony_ci	mutex_lock(&ep->mtx);
11068c2ecf20Sopenharmony_ci	epi = ep_find_tfd(ep, tfd, toff);
11078c2ecf20Sopenharmony_ci	if (epi)
11088c2ecf20Sopenharmony_ci		file_raw = epi->ffd.file;
11098c2ecf20Sopenharmony_ci	else
11108c2ecf20Sopenharmony_ci		file_raw = ERR_PTR(-ENOENT);
11118c2ecf20Sopenharmony_ci	mutex_unlock(&ep->mtx);
11128c2ecf20Sopenharmony_ci
11138c2ecf20Sopenharmony_ci	return file_raw;
11148c2ecf20Sopenharmony_ci}
11158c2ecf20Sopenharmony_ci#endif /* CONFIG_KCMP */
11168c2ecf20Sopenharmony_ci
11178c2ecf20Sopenharmony_ci/**
11188c2ecf20Sopenharmony_ci * Adds a new entry to the tail of the list in a lockless way, i.e.
11198c2ecf20Sopenharmony_ci * multiple CPUs are allowed to call this function concurrently.
11208c2ecf20Sopenharmony_ci *
11218c2ecf20Sopenharmony_ci * Beware: it is necessary to prevent any other modifications of the
11228c2ecf20Sopenharmony_ci *         existing list until all changes are completed, in other words
11238c2ecf20Sopenharmony_ci *         concurrent list_add_tail_lockless() calls should be protected
11248c2ecf20Sopenharmony_ci *         with a read lock, where write lock acts as a barrier which
11258c2ecf20Sopenharmony_ci *         makes sure all list_add_tail_lockless() calls are fully
11268c2ecf20Sopenharmony_ci *         completed.
11278c2ecf20Sopenharmony_ci *
11288c2ecf20Sopenharmony_ci *        Also, an element can be locklessly added to the list only in one
11298c2ecf20Sopenharmony_ci *        direction, i.e. either to the tail or to the head; otherwise
11308c2ecf20Sopenharmony_ci *        concurrent access will corrupt the list.
11318c2ecf20Sopenharmony_ci *
11328c2ecf20Sopenharmony_ci * Returns %false if the element has already been added to the list, %true
11338c2ecf20Sopenharmony_ci * otherwise.
11348c2ecf20Sopenharmony_ci */
11358c2ecf20Sopenharmony_cistatic inline bool list_add_tail_lockless(struct list_head *new,
11368c2ecf20Sopenharmony_ci					  struct list_head *head)
11378c2ecf20Sopenharmony_ci{
11388c2ecf20Sopenharmony_ci	struct list_head *prev;
11398c2ecf20Sopenharmony_ci
11408c2ecf20Sopenharmony_ci	/*
11418c2ecf20Sopenharmony_ci	 * This is a simple 'new->next = head' operation, but cmpxchg()
11428c2ecf20Sopenharmony_ci	 * is used in order to detect that the same element has just been
11438c2ecf20Sopenharmony_ci	 * added to the list from another CPU: the winner observes
11448c2ecf20Sopenharmony_ci	 * new->next == new.
11458c2ecf20Sopenharmony_ci	 */
11468c2ecf20Sopenharmony_ci	if (cmpxchg(&new->next, new, head) != new)
11478c2ecf20Sopenharmony_ci		return false;
11488c2ecf20Sopenharmony_ci
11498c2ecf20Sopenharmony_ci	/*
11508c2ecf20Sopenharmony_ci	 * Initially ->next of a new element must be updated with the head
11518c2ecf20Sopenharmony_ci	 * (we are inserting to the tail) and only then pointers are atomically
11528c2ecf20Sopenharmony_ci	 * exchanged.  XCHG guarantees memory ordering, thus ->next should be
11538c2ecf20Sopenharmony_ci	 * updated before pointers are actually swapped and pointers are
11548c2ecf20Sopenharmony_ci	 * swapped before prev->next is updated.
11558c2ecf20Sopenharmony_ci	 */
11568c2ecf20Sopenharmony_ci
11578c2ecf20Sopenharmony_ci	prev = xchg(&head->prev, new);
11588c2ecf20Sopenharmony_ci
11598c2ecf20Sopenharmony_ci	/*
11608c2ecf20Sopenharmony_ci	 * It is safe to modify prev->next and new->prev, because a new element
11618c2ecf20Sopenharmony_ci	 * is added only to the tail and new->next is updated before XCHG.
11628c2ecf20Sopenharmony_ci	 */
11638c2ecf20Sopenharmony_ci
11648c2ecf20Sopenharmony_ci	prev->next = new;
11658c2ecf20Sopenharmony_ci	new->prev = prev;
11668c2ecf20Sopenharmony_ci
11678c2ecf20Sopenharmony_ci	return true;
11688c2ecf20Sopenharmony_ci}
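
/*
 * The intended calling pattern, as used by ep_poll_callback() below: all
 * concurrent adders hold ep->lock for reading, while the draining side
 * (ep_scan_ready_list()) takes it for writing to fence them out.  A minimal
 * sketch:
 *
 *	read_lock_irqsave(&ep->lock, flags);
 *	if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
 *		ep_pm_stay_awake_rcu(epi);
 *	read_unlock_irqrestore(&ep->lock, flags);
 */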
11698c2ecf20Sopenharmony_ci
11708c2ecf20Sopenharmony_ci/**
11718c2ecf20Sopenharmony_ci * Chains a new epi entry onto the ep->ovflist in a lockless way,
11728c2ecf20Sopenharmony_ci * i.e. multiple CPUs are allowed to call this function concurrently.
11738c2ecf20Sopenharmony_ci *
11748c2ecf20Sopenharmony_ci * Returns %false if the epi element has already been chained, %true otherwise.
11758c2ecf20Sopenharmony_ci */
11768c2ecf20Sopenharmony_cistatic inline bool chain_epi_lockless(struct epitem *epi)
11778c2ecf20Sopenharmony_ci{
11788c2ecf20Sopenharmony_ci	struct eventpoll *ep = epi->ep;
11798c2ecf20Sopenharmony_ci
11808c2ecf20Sopenharmony_ci	/* Fast preliminary check */
11818c2ecf20Sopenharmony_ci	if (epi->next != EP_UNACTIVE_PTR)
11828c2ecf20Sopenharmony_ci		return false;
11838c2ecf20Sopenharmony_ci
11848c2ecf20Sopenharmony_ci	/* Check that the same epi has not been just chained from another CPU */
11858c2ecf20Sopenharmony_ci	if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
11868c2ecf20Sopenharmony_ci		return false;
11878c2ecf20Sopenharmony_ci
11888c2ecf20Sopenharmony_ci	/* Atomically exchange tail */
11898c2ecf20Sopenharmony_ci	epi->next = xchg(&ep->ovflist, epi);
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_ci	return true;
11928c2ecf20Sopenharmony_ci}
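
/*
 * To illustrate the xchg() above: if ep->ovflist currently points to epi_A
 * and two CPUs chain epi_B and epi_C concurrently, each caller atomically
 * takes the previous list value as its ->next, e.g. (one possible order):
 *
 *	epi_B->next = epi_A;	ep->ovflist = epi_B;
 *	epi_C->next = epi_B;	ep->ovflist = epi_C;
 *
 * so every chained epi lands on the singly linked ovflist exactly once,
 * in whatever order the xchg() calls happen to be serialized.
 */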
11938c2ecf20Sopenharmony_ci
11948c2ecf20Sopenharmony_ci/*
11958c2ecf20Sopenharmony_ci * This is the callback that is passed to the wait queue wakeup
11968c2ecf20Sopenharmony_ci * mechanism. It is called by the stored file descriptors when they
11978c2ecf20Sopenharmony_ci * have events to report.
11988c2ecf20Sopenharmony_ci *
11998c2ecf20Sopenharmony_ci * This callback takes a read lock in order not to contend with concurrent
12008c2ecf20Sopenharmony_ci * events from other file descriptors, thus all modifications to ->rdllist
12018c2ecf20Sopenharmony_ci * or ->ovflist are lockless.  Read lock is paired with the write lock from
12028c2ecf20Sopenharmony_ci * ep_scan_ready_list(), which stops all list modifications and guarantees
12038c2ecf20Sopenharmony_ci * that lists state is seen correctly.
12048c2ecf20Sopenharmony_ci *
12058c2ecf20Sopenharmony_ci * Another thing worth mentioning is that ep_poll_callback() can be called
12068c2ecf20Sopenharmony_ci * concurrently for the same @epi from different CPUs if the poll table was
12078c2ecf20Sopenharmony_ci * initialized with several wait queue entries.  Wakeups from different CPUs
12088c2ecf20Sopenharmony_ci * on a single wait queue are serialized by wq.lock, but the case when
12098c2ecf20Sopenharmony_ci * multiple wait queues are used has to be detected separately.  This is
12108c2ecf20Sopenharmony_ci * done using a cmpxchg() operation.
12118c2ecf20Sopenharmony_ci */
12128c2ecf20Sopenharmony_cistatic int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
12138c2ecf20Sopenharmony_ci{
12148c2ecf20Sopenharmony_ci	int pwake = 0;
12158c2ecf20Sopenharmony_ci	struct epitem *epi = ep_item_from_wait(wait);
12168c2ecf20Sopenharmony_ci	struct eventpoll *ep = epi->ep;
12178c2ecf20Sopenharmony_ci	__poll_t pollflags = key_to_poll(key);
12188c2ecf20Sopenharmony_ci	unsigned long flags;
12198c2ecf20Sopenharmony_ci	int ewake = 0;
12208c2ecf20Sopenharmony_ci
12218c2ecf20Sopenharmony_ci	read_lock_irqsave(&ep->lock, flags);
12228c2ecf20Sopenharmony_ci
12238c2ecf20Sopenharmony_ci	ep_set_busy_poll_napi_id(epi);
12248c2ecf20Sopenharmony_ci
12258c2ecf20Sopenharmony_ci	/*
12268c2ecf20Sopenharmony_ci	 * If the event mask does not contain any poll(2) event, we consider the
12278c2ecf20Sopenharmony_ci	 * descriptor to be disabled. This condition is likely the effect of the
12288c2ecf20Sopenharmony_ci	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
12298c2ecf20Sopenharmony_ci	 * until the next EPOLL_CTL_MOD is issued.
12308c2ecf20Sopenharmony_ci	 */
12318c2ecf20Sopenharmony_ci	if (!(epi->event.events & ~EP_PRIVATE_BITS))
12328c2ecf20Sopenharmony_ci		goto out_unlock;
12338c2ecf20Sopenharmony_ci
12348c2ecf20Sopenharmony_ci	/*
12358c2ecf20Sopenharmony_ci	 * Check the events coming with the callback. At this stage, not
12368c2ecf20Sopenharmony_ci	 * every device reports the events in the "key" parameter of the
12378c2ecf20Sopenharmony_ci	 * callback. We need to be able to handle both cases here, hence the
12388c2ecf20Sopenharmony_ci	 * test for "key" != NULL before the event match test.
12398c2ecf20Sopenharmony_ci	 */
12408c2ecf20Sopenharmony_ci	if (pollflags && !(pollflags & epi->event.events))
12418c2ecf20Sopenharmony_ci		goto out_unlock;
12428c2ecf20Sopenharmony_ci
12438c2ecf20Sopenharmony_ci	/*
12448c2ecf20Sopenharmony_ci	 * If we are transferring events to userspace, we can hold no locks
12458c2ecf20Sopenharmony_ci	 * (because we're accessing user memory, and because of linux f_op->poll()
12468c2ecf20Sopenharmony_ci	 * semantics). All the events that happen during that period of time are
12478c2ecf20Sopenharmony_ci	 * chained in ep->ovflist and requeued later on.
12488c2ecf20Sopenharmony_ci	 */
12498c2ecf20Sopenharmony_ci	if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
12508c2ecf20Sopenharmony_ci		if (chain_epi_lockless(epi))
12518c2ecf20Sopenharmony_ci			ep_pm_stay_awake_rcu(epi);
12528c2ecf20Sopenharmony_ci	} else if (!ep_is_linked(epi)) {
12538c2ecf20Sopenharmony_ci		/* In the usual case, add event to ready list. */
12548c2ecf20Sopenharmony_ci		if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
12558c2ecf20Sopenharmony_ci			ep_pm_stay_awake_rcu(epi);
12568c2ecf20Sopenharmony_ci	}
12578c2ecf20Sopenharmony_ci
12588c2ecf20Sopenharmony_ci	/*
12598c2ecf20Sopenharmony_ci	 * Wake up (if active) both the eventpoll wait list and the ->poll()
12608c2ecf20Sopenharmony_ci	 * wait list.
12618c2ecf20Sopenharmony_ci	 */
12628c2ecf20Sopenharmony_ci	if (waitqueue_active(&ep->wq)) {
12638c2ecf20Sopenharmony_ci		if ((epi->event.events & EPOLLEXCLUSIVE) &&
12648c2ecf20Sopenharmony_ci					!(pollflags & POLLFREE)) {
12658c2ecf20Sopenharmony_ci			switch (pollflags & EPOLLINOUT_BITS) {
12668c2ecf20Sopenharmony_ci			case EPOLLIN:
12678c2ecf20Sopenharmony_ci				if (epi->event.events & EPOLLIN)
12688c2ecf20Sopenharmony_ci					ewake = 1;
12698c2ecf20Sopenharmony_ci				break;
12708c2ecf20Sopenharmony_ci			case EPOLLOUT:
12718c2ecf20Sopenharmony_ci				if (epi->event.events & EPOLLOUT)
12728c2ecf20Sopenharmony_ci					ewake = 1;
12738c2ecf20Sopenharmony_ci				break;
12748c2ecf20Sopenharmony_ci			case 0:
12758c2ecf20Sopenharmony_ci				ewake = 1;
12768c2ecf20Sopenharmony_ci				break;
12778c2ecf20Sopenharmony_ci			}
12788c2ecf20Sopenharmony_ci		}
12798c2ecf20Sopenharmony_ci		wake_up(&ep->wq);
12808c2ecf20Sopenharmony_ci	}
12818c2ecf20Sopenharmony_ci	if (waitqueue_active(&ep->poll_wait))
12828c2ecf20Sopenharmony_ci		pwake++;
12838c2ecf20Sopenharmony_ci
12848c2ecf20Sopenharmony_ciout_unlock:
12858c2ecf20Sopenharmony_ci	read_unlock_irqrestore(&ep->lock, flags);
12868c2ecf20Sopenharmony_ci
12878c2ecf20Sopenharmony_ci	/* We have to call this outside the lock */
12888c2ecf20Sopenharmony_ci	if (pwake)
12898c2ecf20Sopenharmony_ci		ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);
12908c2ecf20Sopenharmony_ci
12918c2ecf20Sopenharmony_ci	if (!(epi->event.events & EPOLLEXCLUSIVE))
12928c2ecf20Sopenharmony_ci		ewake = 1;
12938c2ecf20Sopenharmony_ci
12948c2ecf20Sopenharmony_ci	if (pollflags & POLLFREE) {
12958c2ecf20Sopenharmony_ci		/*
12968c2ecf20Sopenharmony_ci		 * If we race with ep_remove_wait_queue() it can miss
12978c2ecf20Sopenharmony_ci		 * ->whead = NULL and do another remove_wait_queue() after
12988c2ecf20Sopenharmony_ci		 * us, so we can't use __remove_wait_queue().
12998c2ecf20Sopenharmony_ci		 */
13008c2ecf20Sopenharmony_ci		list_del_init(&wait->entry);
13018c2ecf20Sopenharmony_ci		/*
13028c2ecf20Sopenharmony_ci		 * ->whead != NULL protects us from the race with ep_free()
13038c2ecf20Sopenharmony_ci		 * or ep_remove(), ep_remove_wait_queue() takes whead->lock
13048c2ecf20Sopenharmony_ci		 * held by the caller. Once we nullify it, nothing protects
13058c2ecf20Sopenharmony_ci		 * ep/epi or even wait.
13068c2ecf20Sopenharmony_ci		 */
13078c2ecf20Sopenharmony_ci		smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
13088c2ecf20Sopenharmony_ci	}
13098c2ecf20Sopenharmony_ci
13108c2ecf20Sopenharmony_ci	return ewake;
13118c2ecf20Sopenharmony_ci}
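
/*
 * A rough illustration of the ewake logic above, assuming the item was
 * registered with EPOLLEXCLUSIVE | EPOLLIN (the values are examples only):
 *
 *	wakeup key == EPOLLIN			-> ewake == 1
 *	wakeup key == 0 (no key passed)		-> ewake == 1 (the "case 0" branch)
 *	wakeup key == EPOLLOUT			-> ewake == 0
 *
 * Because the wait entry was queued with add_wait_queue_exclusive() (see
 * ep_ptable_queue_proc() below), a zero return tells the waker that this
 * waiter did not consume the exclusive wakeup, so the next exclusive
 * waiter on the source's wait queue can still be woken.
 */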
13128c2ecf20Sopenharmony_ci
13138c2ecf20Sopenharmony_ci/*
13148c2ecf20Sopenharmony_ci * This is the callback that is used to add our wait queue to the
13158c2ecf20Sopenharmony_ci * target file wakeup lists.
13168c2ecf20Sopenharmony_ci */
13178c2ecf20Sopenharmony_cistatic void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
13188c2ecf20Sopenharmony_ci				 poll_table *pt)
13198c2ecf20Sopenharmony_ci{
13208c2ecf20Sopenharmony_ci	struct epitem *epi = ep_item_from_epqueue(pt);
13218c2ecf20Sopenharmony_ci	struct eppoll_entry *pwq;
13228c2ecf20Sopenharmony_ci
13238c2ecf20Sopenharmony_ci	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
13248c2ecf20Sopenharmony_ci		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
13258c2ecf20Sopenharmony_ci		pwq->whead = whead;
13268c2ecf20Sopenharmony_ci		pwq->base = epi;
13278c2ecf20Sopenharmony_ci		if (epi->event.events & EPOLLEXCLUSIVE)
13288c2ecf20Sopenharmony_ci			add_wait_queue_exclusive(whead, &pwq->wait);
13298c2ecf20Sopenharmony_ci		else
13308c2ecf20Sopenharmony_ci			add_wait_queue(whead, &pwq->wait);
13318c2ecf20Sopenharmony_ci		list_add_tail(&pwq->llink, &epi->pwqlist);
13328c2ecf20Sopenharmony_ci		epi->nwait++;
13338c2ecf20Sopenharmony_ci	} else {
13348c2ecf20Sopenharmony_ci		/* We have to signal that an error occurred */
13358c2ecf20Sopenharmony_ci		epi->nwait = -1;
13368c2ecf20Sopenharmony_ci	}
13378c2ecf20Sopenharmony_ci}
13388c2ecf20Sopenharmony_ci
13398c2ecf20Sopenharmony_cistatic void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
13408c2ecf20Sopenharmony_ci{
13418c2ecf20Sopenharmony_ci	int kcmp;
13428c2ecf20Sopenharmony_ci	struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL;
13438c2ecf20Sopenharmony_ci	struct epitem *epic;
13448c2ecf20Sopenharmony_ci	bool leftmost = true;
13458c2ecf20Sopenharmony_ci
13468c2ecf20Sopenharmony_ci	while (*p) {
13478c2ecf20Sopenharmony_ci		parent = *p;
13488c2ecf20Sopenharmony_ci		epic = rb_entry(parent, struct epitem, rbn);
13498c2ecf20Sopenharmony_ci		kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd);
13508c2ecf20Sopenharmony_ci		if (kcmp > 0) {
13518c2ecf20Sopenharmony_ci			p = &parent->rb_right;
13528c2ecf20Sopenharmony_ci			leftmost = false;
13538c2ecf20Sopenharmony_ci		} else
13548c2ecf20Sopenharmony_ci			p = &parent->rb_left;
13558c2ecf20Sopenharmony_ci	}
13568c2ecf20Sopenharmony_ci	rb_link_node(&epi->rbn, parent, p);
13578c2ecf20Sopenharmony_ci	rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
13588c2ecf20Sopenharmony_ci}
13598c2ecf20Sopenharmony_ci
13608c2ecf20Sopenharmony_ci
13618c2ecf20Sopenharmony_ci
13628c2ecf20Sopenharmony_ci#define PATH_ARR_SIZE 5
13638c2ecf20Sopenharmony_ci/*
13648c2ecf20Sopenharmony_ci * These are the numbers of paths of length 1 to 5 that we allow to emanate
13658c2ecf20Sopenharmony_ci * from a single file of interest. For example, we allow 1000 paths of length
13668c2ecf20Sopenharmony_ci * 1 to emanate from each file of interest. This essentially represents the
13678c2ecf20Sopenharmony_ci * potential wakeup paths, which need to be limited in order to avoid massive
13688c2ecf20Sopenharmony_ci * uncontrolled wakeup storms. The common use case should be a single ep which
13698c2ecf20Sopenharmony_ci * is connected to n file sources. In this case each file source has 1 path
13708c2ecf20Sopenharmony_ci * of length 1. Thus, the numbers below should be more than sufficient. These
13718c2ecf20Sopenharmony_ci * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
13728c2ecf20Sopenharmony_ci * and delete can't add additional paths. Protected by the epmutex.
13738c2ecf20Sopenharmony_ci */
13748c2ecf20Sopenharmony_cistatic const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
13758c2ecf20Sopenharmony_cistatic int path_count[PATH_ARR_SIZE];
13768c2ecf20Sopenharmony_ci
13778c2ecf20Sopenharmony_cistatic int path_count_inc(int nests)
13788c2ecf20Sopenharmony_ci{
13798c2ecf20Sopenharmony_ci	/* Allow an arbitrary number of depth 1 paths */
13808c2ecf20Sopenharmony_ci	if (nests == 0)
13818c2ecf20Sopenharmony_ci		return 0;
13828c2ecf20Sopenharmony_ci
13838c2ecf20Sopenharmony_ci	if (++path_count[nests] > path_limits[nests])
13848c2ecf20Sopenharmony_ci		return -1;
13858c2ecf20Sopenharmony_ci	return 0;
13868c2ecf20Sopenharmony_ci}
13878c2ecf20Sopenharmony_ci
13888c2ecf20Sopenharmony_cistatic void path_count_init(void)
13898c2ecf20Sopenharmony_ci{
13908c2ecf20Sopenharmony_ci	int i;
13918c2ecf20Sopenharmony_ci
13928c2ecf20Sopenharmony_ci	for (i = 0; i < PATH_ARR_SIZE; i++)
13938c2ecf20Sopenharmony_ci		path_count[i] = 0;
13948c2ecf20Sopenharmony_ci}
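
/*
 * For instance, path_limits[1] == 500 means that, after path_count_init(),
 * path_count_inc(1) succeeds 500 times and fails on the 501st call;
 * reverse_path_check_proc() below turns such a failure into an error.
 * The nests == 0 case is deliberately never limited, per the "arbitrary
 * number of depth 1 paths" comment in path_count_inc() above.
 */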
13958c2ecf20Sopenharmony_ci
13968c2ecf20Sopenharmony_cistatic int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
13978c2ecf20Sopenharmony_ci{
13988c2ecf20Sopenharmony_ci	int error = 0;
13998c2ecf20Sopenharmony_ci	struct file *file = priv;
14008c2ecf20Sopenharmony_ci	struct file *child_file;
14018c2ecf20Sopenharmony_ci	struct epitem *epi;
14028c2ecf20Sopenharmony_ci
14038c2ecf20Sopenharmony_ci	/* CTL_DEL can remove links here, but that can't increase our count */
14048c2ecf20Sopenharmony_ci	rcu_read_lock();
14058c2ecf20Sopenharmony_ci	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
14068c2ecf20Sopenharmony_ci		child_file = epi->ep->file;
14078c2ecf20Sopenharmony_ci		if (is_file_epoll(child_file)) {
14088c2ecf20Sopenharmony_ci			if (list_empty(&child_file->f_ep_links)) {
14098c2ecf20Sopenharmony_ci				if (path_count_inc(call_nests)) {
14108c2ecf20Sopenharmony_ci					error = -1;
14118c2ecf20Sopenharmony_ci					break;
14128c2ecf20Sopenharmony_ci				}
14138c2ecf20Sopenharmony_ci			} else {
14148c2ecf20Sopenharmony_ci				error = ep_call_nested(&poll_loop_ncalls,
14158c2ecf20Sopenharmony_ci							reverse_path_check_proc,
14168c2ecf20Sopenharmony_ci							child_file, child_file,
14178c2ecf20Sopenharmony_ci							current);
14188c2ecf20Sopenharmony_ci			}
14198c2ecf20Sopenharmony_ci			if (error != 0)
14208c2ecf20Sopenharmony_ci				break;
14218c2ecf20Sopenharmony_ci		} else {
14228c2ecf20Sopenharmony_ci			printk(KERN_ERR "reverse_path_check_proc: "
14238c2ecf20Sopenharmony_ci				"file is not an ep!\n");
14248c2ecf20Sopenharmony_ci		}
14258c2ecf20Sopenharmony_ci	}
14268c2ecf20Sopenharmony_ci	rcu_read_unlock();
14278c2ecf20Sopenharmony_ci	return error;
14288c2ecf20Sopenharmony_ci}
14298c2ecf20Sopenharmony_ci
14308c2ecf20Sopenharmony_ci/**
14318c2ecf20Sopenharmony_ci * reverse_path_check - The tfile_check_list is a list of file *, which have
14328c2ecf20Sopenharmony_ci *                      links that are proposed to be newly added. We need to
14338c2ecf20Sopenharmony_ci *                      make sure that those added links don't add too many
14348c2ecf20Sopenharmony_ci *                      paths such that we will spend all our time waking up
14358c2ecf20Sopenharmony_ci *                      eventpoll objects.
14368c2ecf20Sopenharmony_ci *
14378c2ecf20Sopenharmony_ci * Returns: Returns zero if the proposed links don't create too many paths,
14388c2ecf20Sopenharmony_ci *	    -1 otherwise.
14398c2ecf20Sopenharmony_ci */
14408c2ecf20Sopenharmony_cistatic int reverse_path_check(void)
14418c2ecf20Sopenharmony_ci{
14428c2ecf20Sopenharmony_ci	int error = 0;
14438c2ecf20Sopenharmony_ci	struct file *current_file;
14448c2ecf20Sopenharmony_ci
14458c2ecf20Sopenharmony_ci	/* let's call this for all tfiles */
14468c2ecf20Sopenharmony_ci	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
14478c2ecf20Sopenharmony_ci		path_count_init();
14488c2ecf20Sopenharmony_ci		error = ep_call_nested(&poll_loop_ncalls,
14498c2ecf20Sopenharmony_ci					reverse_path_check_proc, current_file,
14508c2ecf20Sopenharmony_ci					current_file, current);
14518c2ecf20Sopenharmony_ci		if (error)
14528c2ecf20Sopenharmony_ci			break;
14538c2ecf20Sopenharmony_ci	}
14548c2ecf20Sopenharmony_ci	return error;
14558c2ecf20Sopenharmony_ci}
14568c2ecf20Sopenharmony_ci
14578c2ecf20Sopenharmony_cistatic int ep_create_wakeup_source(struct epitem *epi)
14588c2ecf20Sopenharmony_ci{
14598c2ecf20Sopenharmony_ci	struct name_snapshot n;
14608c2ecf20Sopenharmony_ci	struct wakeup_source *ws;
14618c2ecf20Sopenharmony_ci
14628c2ecf20Sopenharmony_ci	if (!epi->ep->ws) {
14638c2ecf20Sopenharmony_ci		epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
14648c2ecf20Sopenharmony_ci		if (!epi->ep->ws)
14658c2ecf20Sopenharmony_ci			return -ENOMEM;
14668c2ecf20Sopenharmony_ci	}
14678c2ecf20Sopenharmony_ci
14688c2ecf20Sopenharmony_ci	take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
14698c2ecf20Sopenharmony_ci	ws = wakeup_source_register(NULL, n.name.name);
14708c2ecf20Sopenharmony_ci	release_dentry_name_snapshot(&n);
14718c2ecf20Sopenharmony_ci
14728c2ecf20Sopenharmony_ci	if (!ws)
14738c2ecf20Sopenharmony_ci		return -ENOMEM;
14748c2ecf20Sopenharmony_ci	rcu_assign_pointer(epi->ws, ws);
14758c2ecf20Sopenharmony_ci
14768c2ecf20Sopenharmony_ci	return 0;
14778c2ecf20Sopenharmony_ci}
14788c2ecf20Sopenharmony_ci
14798c2ecf20Sopenharmony_ci/* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */
14808c2ecf20Sopenharmony_cistatic noinline void ep_destroy_wakeup_source(struct epitem *epi)
14818c2ecf20Sopenharmony_ci{
14828c2ecf20Sopenharmony_ci	struct wakeup_source *ws = ep_wakeup_source(epi);
14838c2ecf20Sopenharmony_ci
14848c2ecf20Sopenharmony_ci	RCU_INIT_POINTER(epi->ws, NULL);
14858c2ecf20Sopenharmony_ci
14868c2ecf20Sopenharmony_ci	/*
14878c2ecf20Sopenharmony_ci	 * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
14888c2ecf20Sopenharmony_ci	 * used internally by wakeup_source_remove, too (called by
14898c2ecf20Sopenharmony_ci	 * wakeup_source_unregister), so we cannot use call_rcu
14908c2ecf20Sopenharmony_ci	 */
14918c2ecf20Sopenharmony_ci	synchronize_rcu();
14928c2ecf20Sopenharmony_ci	wakeup_source_unregister(ws);
14938c2ecf20Sopenharmony_ci}
14948c2ecf20Sopenharmony_ci
14958c2ecf20Sopenharmony_ci/*
14968c2ecf20Sopenharmony_ci * Must be called with "mtx" held.
14978c2ecf20Sopenharmony_ci */
14988c2ecf20Sopenharmony_cistatic int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
14998c2ecf20Sopenharmony_ci		     struct file *tfile, int fd, int full_check)
15008c2ecf20Sopenharmony_ci{
15018c2ecf20Sopenharmony_ci	int error, pwake = 0;
15028c2ecf20Sopenharmony_ci	__poll_t revents;
15038c2ecf20Sopenharmony_ci	long user_watches;
15048c2ecf20Sopenharmony_ci	struct epitem *epi;
15058c2ecf20Sopenharmony_ci	struct ep_pqueue epq;
15068c2ecf20Sopenharmony_ci
15078c2ecf20Sopenharmony_ci	lockdep_assert_irqs_enabled();
15088c2ecf20Sopenharmony_ci
15098c2ecf20Sopenharmony_ci	user_watches = atomic_long_read(&ep->user->epoll_watches);
15108c2ecf20Sopenharmony_ci	if (unlikely(user_watches >= max_user_watches))
15118c2ecf20Sopenharmony_ci		return -ENOSPC;
15128c2ecf20Sopenharmony_ci	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
15138c2ecf20Sopenharmony_ci		return -ENOMEM;
15148c2ecf20Sopenharmony_ci
15158c2ecf20Sopenharmony_ci	/* Item initialization follows here ... */
15168c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&epi->rdllink);
15178c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&epi->fllink);
15188c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&epi->pwqlist);
15198c2ecf20Sopenharmony_ci	epi->ep = ep;
15208c2ecf20Sopenharmony_ci	ep_set_ffd(&epi->ffd, tfile, fd);
15218c2ecf20Sopenharmony_ci	epi->event = *event;
15228c2ecf20Sopenharmony_ci	epi->nwait = 0;
15238c2ecf20Sopenharmony_ci	epi->next = EP_UNACTIVE_PTR;
15248c2ecf20Sopenharmony_ci	if (epi->event.events & EPOLLWAKEUP) {
15258c2ecf20Sopenharmony_ci		error = ep_create_wakeup_source(epi);
15268c2ecf20Sopenharmony_ci		if (error)
15278c2ecf20Sopenharmony_ci			goto error_create_wakeup_source;
15288c2ecf20Sopenharmony_ci	} else {
15298c2ecf20Sopenharmony_ci		RCU_INIT_POINTER(epi->ws, NULL);
15308c2ecf20Sopenharmony_ci	}
15318c2ecf20Sopenharmony_ci
15328c2ecf20Sopenharmony_ci	/* Add the current item to the list of active epoll hooks for this file */
15338c2ecf20Sopenharmony_ci	spin_lock(&tfile->f_lock);
15348c2ecf20Sopenharmony_ci	list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
15358c2ecf20Sopenharmony_ci	spin_unlock(&tfile->f_lock);
15368c2ecf20Sopenharmony_ci
15378c2ecf20Sopenharmony_ci	/*
15388c2ecf20Sopenharmony_ci	 * Add the current item to the RB tree. All RB tree operations are
15398c2ecf20Sopenharmony_ci	 * protected by "mtx", and ep_insert() is called with "mtx" held.
15408c2ecf20Sopenharmony_ci	 */
15418c2ecf20Sopenharmony_ci	ep_rbtree_insert(ep, epi);
15428c2ecf20Sopenharmony_ci
15438c2ecf20Sopenharmony_ci	/* now check if we've created too many backpaths */
15448c2ecf20Sopenharmony_ci	error = -EINVAL;
15458c2ecf20Sopenharmony_ci	if (full_check && reverse_path_check())
15468c2ecf20Sopenharmony_ci		goto error_remove_epi;
15478c2ecf20Sopenharmony_ci
15488c2ecf20Sopenharmony_ci	/* Initialize the poll table using the queue callback */
15498c2ecf20Sopenharmony_ci	epq.epi = epi;
15508c2ecf20Sopenharmony_ci	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
15518c2ecf20Sopenharmony_ci
15528c2ecf20Sopenharmony_ci	/*
15538c2ecf20Sopenharmony_ci	 * Attach the item to the poll hooks and get current event bits.
15548c2ecf20Sopenharmony_ci	 * We can safely use the file* here because its usage count has
15558c2ecf20Sopenharmony_ci	 * been increased by the caller of this function. Note that after
15568c2ecf20Sopenharmony_ci	 * this operation completes, the poll callback can start hitting
15578c2ecf20Sopenharmony_ci	 * the new item.
15588c2ecf20Sopenharmony_ci	 */
15598c2ecf20Sopenharmony_ci	revents = ep_item_poll(epi, &epq.pt, 1);
15608c2ecf20Sopenharmony_ci
15618c2ecf20Sopenharmony_ci	/*
15628c2ecf20Sopenharmony_ci	 * We have to check if something went wrong during the poll wait queue
15638c2ecf20Sopenharmony_ci	 * install process, namely whether an allocation for a wait queue failed
15648c2ecf20Sopenharmony_ci	 * due to high memory pressure.
15658c2ecf20Sopenharmony_ci	 */
15668c2ecf20Sopenharmony_ci	error = -ENOMEM;
15678c2ecf20Sopenharmony_ci	if (epi->nwait < 0)
15688c2ecf20Sopenharmony_ci		goto error_unregister;
15698c2ecf20Sopenharmony_ci
15708c2ecf20Sopenharmony_ci	/* We have to drop the new item inside our item list to keep track of it */
15718c2ecf20Sopenharmony_ci	write_lock_irq(&ep->lock);
15728c2ecf20Sopenharmony_ci
15738c2ecf20Sopenharmony_ci	/* record NAPI ID of new item if present */
15748c2ecf20Sopenharmony_ci	ep_set_busy_poll_napi_id(epi);
15758c2ecf20Sopenharmony_ci
15768c2ecf20Sopenharmony_ci	/* If the file is already "ready" we drop it inside the ready list */
15778c2ecf20Sopenharmony_ci	if (revents && !ep_is_linked(epi)) {
15788c2ecf20Sopenharmony_ci		list_add_tail(&epi->rdllink, &ep->rdllist);
15798c2ecf20Sopenharmony_ci		ep_pm_stay_awake(epi);
15808c2ecf20Sopenharmony_ci
15818c2ecf20Sopenharmony_ci		/* Notify waiting tasks that events are available */
15828c2ecf20Sopenharmony_ci		if (waitqueue_active(&ep->wq))
15838c2ecf20Sopenharmony_ci			wake_up(&ep->wq);
15848c2ecf20Sopenharmony_ci		if (waitqueue_active(&ep->poll_wait))
15858c2ecf20Sopenharmony_ci			pwake++;
15868c2ecf20Sopenharmony_ci	}
15878c2ecf20Sopenharmony_ci
15888c2ecf20Sopenharmony_ci	write_unlock_irq(&ep->lock);
15898c2ecf20Sopenharmony_ci
15908c2ecf20Sopenharmony_ci	atomic_long_inc(&ep->user->epoll_watches);
15918c2ecf20Sopenharmony_ci
15928c2ecf20Sopenharmony_ci	/* We have to call this outside the lock */
15938c2ecf20Sopenharmony_ci	if (pwake)
15948c2ecf20Sopenharmony_ci		ep_poll_safewake(ep, NULL, 0);
15958c2ecf20Sopenharmony_ci
15968c2ecf20Sopenharmony_ci	return 0;
15978c2ecf20Sopenharmony_ci
15988c2ecf20Sopenharmony_cierror_unregister:
15998c2ecf20Sopenharmony_ci	ep_unregister_pollwait(ep, epi);
16008c2ecf20Sopenharmony_cierror_remove_epi:
16018c2ecf20Sopenharmony_ci	spin_lock(&tfile->f_lock);
16028c2ecf20Sopenharmony_ci	list_del_rcu(&epi->fllink);
16038c2ecf20Sopenharmony_ci	spin_unlock(&tfile->f_lock);
16048c2ecf20Sopenharmony_ci
16058c2ecf20Sopenharmony_ci	rb_erase_cached(&epi->rbn, &ep->rbr);
16068c2ecf20Sopenharmony_ci
16078c2ecf20Sopenharmony_ci	/*
16088c2ecf20Sopenharmony_ci	 * We need to do this because an event could have arrived on some
16098c2ecf20Sopenharmony_ci	 * allocated wait queue. Note that we don't care about the ep->ovflist
16108c2ecf20Sopenharmony_ci	 * list, since that is used/cleaned only inside a section bound by "mtx".
16118c2ecf20Sopenharmony_ci	 * And ep_insert() is called with "mtx" held.
16128c2ecf20Sopenharmony_ci	 */
16138c2ecf20Sopenharmony_ci	write_lock_irq(&ep->lock);
16148c2ecf20Sopenharmony_ci	if (ep_is_linked(epi))
16158c2ecf20Sopenharmony_ci		list_del_init(&epi->rdllink);
16168c2ecf20Sopenharmony_ci	write_unlock_irq(&ep->lock);
16178c2ecf20Sopenharmony_ci
16188c2ecf20Sopenharmony_ci	wakeup_source_unregister(ep_wakeup_source(epi));
16198c2ecf20Sopenharmony_ci
16208c2ecf20Sopenharmony_cierror_create_wakeup_source:
16218c2ecf20Sopenharmony_ci	kmem_cache_free(epi_cache, epi);
16228c2ecf20Sopenharmony_ci
16238c2ecf20Sopenharmony_ci	return error;
16248c2ecf20Sopenharmony_ci}
16258c2ecf20Sopenharmony_ci
16268c2ecf20Sopenharmony_ci/*
16278c2ecf20Sopenharmony_ci * Modify the interest event mask by dropping an event if the new mask
16288c2ecf20Sopenharmony_ci * has a match in the current file status. Must be called with "mtx" held.
16298c2ecf20Sopenharmony_ci */
16308c2ecf20Sopenharmony_cistatic int ep_modify(struct eventpoll *ep, struct epitem *epi,
16318c2ecf20Sopenharmony_ci		     const struct epoll_event *event)
16328c2ecf20Sopenharmony_ci{
16338c2ecf20Sopenharmony_ci	int pwake = 0;
16348c2ecf20Sopenharmony_ci	poll_table pt;
16358c2ecf20Sopenharmony_ci
16368c2ecf20Sopenharmony_ci	lockdep_assert_irqs_enabled();
16378c2ecf20Sopenharmony_ci
16388c2ecf20Sopenharmony_ci	init_poll_funcptr(&pt, NULL);
16398c2ecf20Sopenharmony_ci
16408c2ecf20Sopenharmony_ci	/*
16418c2ecf20Sopenharmony_ci	 * Set the new event interest mask before calling f_op->poll();
16428c2ecf20Sopenharmony_ci	 * otherwise we might miss an event that happens between the
16438c2ecf20Sopenharmony_ci	 * f_op->poll() call and the new event set registering.
16448c2ecf20Sopenharmony_ci	 */
16458c2ecf20Sopenharmony_ci	epi->event.events = event->events; /* need barrier below */
16468c2ecf20Sopenharmony_ci	epi->event.data = event->data; /* protected by mtx */
16478c2ecf20Sopenharmony_ci	if (epi->event.events & EPOLLWAKEUP) {
16488c2ecf20Sopenharmony_ci		if (!ep_has_wakeup_source(epi))
16498c2ecf20Sopenharmony_ci			ep_create_wakeup_source(epi);
16508c2ecf20Sopenharmony_ci	} else if (ep_has_wakeup_source(epi)) {
16518c2ecf20Sopenharmony_ci		ep_destroy_wakeup_source(epi);
16528c2ecf20Sopenharmony_ci	}
16538c2ecf20Sopenharmony_ci
16548c2ecf20Sopenharmony_ci	/*
16558c2ecf20Sopenharmony_ci	 * The following barrier has two effects:
16568c2ecf20Sopenharmony_ci	 *
16578c2ecf20Sopenharmony_ci	 * 1) Flush epi changes above to other CPUs.  This ensures
16588c2ecf20Sopenharmony_ci	 *    we do not miss events from ep_poll_callback if an
16598c2ecf20Sopenharmony_ci	 *    event occurs immediately after we call f_op->poll().
16608c2ecf20Sopenharmony_ci	 *    We need this because we did not take ep->lock while
16618c2ecf20Sopenharmony_ci	 *    changing epi above (but ep_poll_callback does take
16628c2ecf20Sopenharmony_ci	 *    ep->lock).
16638c2ecf20Sopenharmony_ci	 *
16648c2ecf20Sopenharmony_ci	 * 2) We also need to ensure we do not miss _past_ events
16658c2ecf20Sopenharmony_ci	 *    when calling f_op->poll().  This barrier also
16668c2ecf20Sopenharmony_ci	 *    pairs with the barrier in wq_has_sleeper (see
16678c2ecf20Sopenharmony_ci	 *    comments for wq_has_sleeper).
16688c2ecf20Sopenharmony_ci	 *
16698c2ecf20Sopenharmony_ci	 * This barrier will now guarantee ep_poll_callback or f_op->poll
16708c2ecf20Sopenharmony_ci	 * (or both) will notice the readiness of an item.
16718c2ecf20Sopenharmony_ci	 */
16728c2ecf20Sopenharmony_ci	smp_mb();
16738c2ecf20Sopenharmony_ci
16748c2ecf20Sopenharmony_ci	/*
16758c2ecf20Sopenharmony_ci	 * Get current event bits. We can safely use the file* here because
16768c2ecf20Sopenharmony_ci	 * its usage count has been increased by the caller of this function.
16778c2ecf20Sopenharmony_ci	 * If the item is "hot" and it is not registered inside the ready
16788c2ecf20Sopenharmony_ci	 * list, push it inside.
16798c2ecf20Sopenharmony_ci	 */
16808c2ecf20Sopenharmony_ci	if (ep_item_poll(epi, &pt, 1)) {
16818c2ecf20Sopenharmony_ci		write_lock_irq(&ep->lock);
16828c2ecf20Sopenharmony_ci		if (!ep_is_linked(epi)) {
16838c2ecf20Sopenharmony_ci			list_add_tail(&epi->rdllink, &ep->rdllist);
16848c2ecf20Sopenharmony_ci			ep_pm_stay_awake(epi);
16858c2ecf20Sopenharmony_ci
16868c2ecf20Sopenharmony_ci			/* Notify waiting tasks that events are available */
16878c2ecf20Sopenharmony_ci			if (waitqueue_active(&ep->wq))
16888c2ecf20Sopenharmony_ci				wake_up(&ep->wq);
16898c2ecf20Sopenharmony_ci			if (waitqueue_active(&ep->poll_wait))
16908c2ecf20Sopenharmony_ci				pwake++;
16918c2ecf20Sopenharmony_ci		}
16928c2ecf20Sopenharmony_ci		write_unlock_irq(&ep->lock);
16938c2ecf20Sopenharmony_ci	}
16948c2ecf20Sopenharmony_ci
16958c2ecf20Sopenharmony_ci	/* We have to call this outside the lock */
16968c2ecf20Sopenharmony_ci	if (pwake)
16978c2ecf20Sopenharmony_ci		ep_poll_safewake(ep, NULL, 0);
16988c2ecf20Sopenharmony_ci
16998c2ecf20Sopenharmony_ci	return 0;
17008c2ecf20Sopenharmony_ci}
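
/*
 * From userspace, ep_modify() is what re-arms an EPOLLONESHOT descriptor
 * after it has fired and been masked down to EP_PRIVATE_BITS (see
 * ep_send_events_proc() below).  An illustrative snippet, with error
 * handling omitted and "fd"/"epfd" used only as example descriptors:
 *
 *	struct epoll_event ev = { .events = EPOLLIN | EPOLLONESHOT,
 *				  .data.fd = fd };
 *	epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
 */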
17018c2ecf20Sopenharmony_ci
17028c2ecf20Sopenharmony_cistatic __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
17038c2ecf20Sopenharmony_ci			       void *priv)
17048c2ecf20Sopenharmony_ci{
17058c2ecf20Sopenharmony_ci	struct ep_send_events_data *esed = priv;
17068c2ecf20Sopenharmony_ci	__poll_t revents;
17078c2ecf20Sopenharmony_ci	struct epitem *epi, *tmp;
17088c2ecf20Sopenharmony_ci	struct epoll_event __user *uevent = esed->events;
17098c2ecf20Sopenharmony_ci	struct wakeup_source *ws;
17108c2ecf20Sopenharmony_ci	poll_table pt;
17118c2ecf20Sopenharmony_ci
17128c2ecf20Sopenharmony_ci	init_poll_funcptr(&pt, NULL);
17138c2ecf20Sopenharmony_ci	esed->res = 0;
17148c2ecf20Sopenharmony_ci
17158c2ecf20Sopenharmony_ci	/*
17168c2ecf20Sopenharmony_ci	 * We can loop without lock because we are passed a task private list.
17178c2ecf20Sopenharmony_ci	 * Items cannot vanish during the loop because ep_scan_ready_list() is
17188c2ecf20Sopenharmony_ci	 * holding "mtx" during this call.
17198c2ecf20Sopenharmony_ci	 */
17208c2ecf20Sopenharmony_ci	lockdep_assert_held(&ep->mtx);
17218c2ecf20Sopenharmony_ci
17228c2ecf20Sopenharmony_ci	list_for_each_entry_safe(epi, tmp, head, rdllink) {
17238c2ecf20Sopenharmony_ci		if (esed->res >= esed->maxevents)
17248c2ecf20Sopenharmony_ci			break;
17258c2ecf20Sopenharmony_ci
17268c2ecf20Sopenharmony_ci		/*
17278c2ecf20Sopenharmony_ci		 * Activate ep->ws before deactivating epi->ws to prevent
17288c2ecf20Sopenharmony_ci		 * triggering auto-suspend here (in case we reactivate epi->ws
17298c2ecf20Sopenharmony_ci		 * below).
17308c2ecf20Sopenharmony_ci		 *
17318c2ecf20Sopenharmony_ci		 * This could be rearranged to delay the deactivation of epi->ws
17328c2ecf20Sopenharmony_ci		 * instead, but then epi->ws would temporarily be out of sync
17338c2ecf20Sopenharmony_ci		 * with ep_is_linked().
17348c2ecf20Sopenharmony_ci		 */
17358c2ecf20Sopenharmony_ci		ws = ep_wakeup_source(epi);
17368c2ecf20Sopenharmony_ci		if (ws) {
17378c2ecf20Sopenharmony_ci			if (ws->active)
17388c2ecf20Sopenharmony_ci				__pm_stay_awake(ep->ws);
17398c2ecf20Sopenharmony_ci			__pm_relax(ws);
17408c2ecf20Sopenharmony_ci		}
17418c2ecf20Sopenharmony_ci
17428c2ecf20Sopenharmony_ci		list_del_init(&epi->rdllink);
17438c2ecf20Sopenharmony_ci
17448c2ecf20Sopenharmony_ci		/*
17458c2ecf20Sopenharmony_ci		 * If the event mask intersects the caller-requested one,
17468c2ecf20Sopenharmony_ci		 * deliver the event to userspace. Again, ep_scan_ready_list()
17478c2ecf20Sopenharmony_ci		 * is holding ep->mtx, so no operations coming from userspace
17488c2ecf20Sopenharmony_ci		 * can change the item.
17498c2ecf20Sopenharmony_ci		 */
17508c2ecf20Sopenharmony_ci		revents = ep_item_poll(epi, &pt, 1);
17518c2ecf20Sopenharmony_ci		if (!revents)
17528c2ecf20Sopenharmony_ci			continue;
17538c2ecf20Sopenharmony_ci
17548c2ecf20Sopenharmony_ci		if (__put_user(revents, &uevent->events) ||
17558c2ecf20Sopenharmony_ci		    __put_user(epi->event.data, &uevent->data)) {
17568c2ecf20Sopenharmony_ci			list_add(&epi->rdllink, head);
17578c2ecf20Sopenharmony_ci			ep_pm_stay_awake(epi);
17588c2ecf20Sopenharmony_ci			if (!esed->res)
17598c2ecf20Sopenharmony_ci				esed->res = -EFAULT;
17608c2ecf20Sopenharmony_ci			return 0;
17618c2ecf20Sopenharmony_ci		}
17628c2ecf20Sopenharmony_ci		esed->res++;
17638c2ecf20Sopenharmony_ci		uevent++;
17648c2ecf20Sopenharmony_ci		if (epi->event.events & EPOLLONESHOT)
17658c2ecf20Sopenharmony_ci			epi->event.events &= EP_PRIVATE_BITS;
17668c2ecf20Sopenharmony_ci		else if (!(epi->event.events & EPOLLET)) {
17678c2ecf20Sopenharmony_ci			/*
17688c2ecf20Sopenharmony_ci			 * If this file has been added with Level
17698c2ecf20Sopenharmony_ci			 * Trigger mode, we need to insert back inside
17708c2ecf20Sopenharmony_ci			 * the ready list, so that the next call to
17718c2ecf20Sopenharmony_ci			 * epoll_wait() will check again the events
17728c2ecf20Sopenharmony_ci			 * availability. At this point, no one can insert
17738c2ecf20Sopenharmony_ci			 * into ep->rdllist besides us. The epoll_ctl()
17748c2ecf20Sopenharmony_ci			 * callers are locked out by
17758c2ecf20Sopenharmony_ci			 * ep_scan_ready_list() holding "mtx" and the
17768c2ecf20Sopenharmony_ci			 * poll callback will queue them in ep->ovflist.
17778c2ecf20Sopenharmony_ci			 */
17788c2ecf20Sopenharmony_ci			list_add_tail(&epi->rdllink, &ep->rdllist);
17798c2ecf20Sopenharmony_ci			ep_pm_stay_awake(epi);
17808c2ecf20Sopenharmony_ci		}
17818c2ecf20Sopenharmony_ci	}
17828c2ecf20Sopenharmony_ci
17838c2ecf20Sopenharmony_ci	return 0;
17848c2ecf20Sopenharmony_ci}
17858c2ecf20Sopenharmony_ci
17868c2ecf20Sopenharmony_cistatic int ep_send_events(struct eventpoll *ep,
17878c2ecf20Sopenharmony_ci			  struct epoll_event __user *events, int maxevents)
17888c2ecf20Sopenharmony_ci{
17898c2ecf20Sopenharmony_ci	struct ep_send_events_data esed;
17908c2ecf20Sopenharmony_ci
17918c2ecf20Sopenharmony_ci	esed.maxevents = maxevents;
17928c2ecf20Sopenharmony_ci	esed.events = events;
17938c2ecf20Sopenharmony_ci
17948c2ecf20Sopenharmony_ci	ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
17958c2ecf20Sopenharmony_ci	return esed.res;
17968c2ecf20Sopenharmony_ci}
17978c2ecf20Sopenharmony_ci
17988c2ecf20Sopenharmony_cistatic inline struct timespec64 ep_set_mstimeout(long ms)
17998c2ecf20Sopenharmony_ci{
18008c2ecf20Sopenharmony_ci	struct timespec64 now, ts = {
18018c2ecf20Sopenharmony_ci		.tv_sec = ms / MSEC_PER_SEC,
18028c2ecf20Sopenharmony_ci		.tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
18038c2ecf20Sopenharmony_ci	};
18048c2ecf20Sopenharmony_ci
18058c2ecf20Sopenharmony_ci	ktime_get_ts64(&now);
18068c2ecf20Sopenharmony_ci	return timespec64_add_safe(now, ts);
18078c2ecf20Sopenharmony_ci}
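
/*
 * For example, a 1300 ms timeout becomes
 * { .tv_sec = 1, .tv_nsec = 300 * NSEC_PER_MSEC } and is then added to the
 * current CLOCK_MONOTONIC time, so ep_poll() gets an absolute expiry it can
 * hand to schedule_hrtimeout_range(..., HRTIMER_MODE_ABS).
 */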
18088c2ecf20Sopenharmony_ci
18098c2ecf20Sopenharmony_ci/*
18108c2ecf20Sopenharmony_ci * autoremove_wake_function, but remove even on failure to wake up, because we
18118c2ecf20Sopenharmony_ci * know that default_wake_function/ttwu will only fail if the thread is already
18128c2ecf20Sopenharmony_ci * woken, and in that case the ep_poll loop will remove the entry anyway, not
18138c2ecf20Sopenharmony_ci * try to reuse it.
18148c2ecf20Sopenharmony_ci */
18158c2ecf20Sopenharmony_cistatic int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
18168c2ecf20Sopenharmony_ci				       unsigned int mode, int sync, void *key)
18178c2ecf20Sopenharmony_ci{
18188c2ecf20Sopenharmony_ci	int ret = default_wake_function(wq_entry, mode, sync, key);
18198c2ecf20Sopenharmony_ci
18208c2ecf20Sopenharmony_ci	/*
18218c2ecf20Sopenharmony_ci	 * Pairs with list_empty_careful in ep_poll, and ensures future loop
18228c2ecf20Sopenharmony_ci	 * iterations see the cause of this wakeup.
18238c2ecf20Sopenharmony_ci	 */
18248c2ecf20Sopenharmony_ci	list_del_init_careful(&wq_entry->entry);
18258c2ecf20Sopenharmony_ci	return ret;
18268c2ecf20Sopenharmony_ci}
18278c2ecf20Sopenharmony_ci
18288c2ecf20Sopenharmony_ci/**
18298c2ecf20Sopenharmony_ci * ep_poll - Retrieves ready events, and delivers them to the caller supplied
18308c2ecf20Sopenharmony_ci *           event buffer.
18318c2ecf20Sopenharmony_ci *
18328c2ecf20Sopenharmony_ci * @ep: Pointer to the eventpoll context.
18338c2ecf20Sopenharmony_ci * @events: Pointer to the userspace buffer where the ready events should be
18348c2ecf20Sopenharmony_ci *          stored.
18358c2ecf20Sopenharmony_ci * @maxevents: Size (in terms of number of events) of the caller event buffer.
18368c2ecf20Sopenharmony_ci * @timeout: Maximum timeout for the ready events fetch operation, in
18378c2ecf20Sopenharmony_ci *           milliseconds. If the @timeout is zero, the function will not block,
18388c2ecf20Sopenharmony_ci *           while if the @timeout is less than zero, the function will block
18398c2ecf20Sopenharmony_ci *           until at least one event has been retrieved (or an error
18408c2ecf20Sopenharmony_ci *           occurred).
18418c2ecf20Sopenharmony_ci *
18428c2ecf20Sopenharmony_ci * Returns: Returns the number of ready events which have been fetched, or an
18438c2ecf20Sopenharmony_ci *          error code, in case of error.
18448c2ecf20Sopenharmony_ci */
18458c2ecf20Sopenharmony_cistatic int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
18468c2ecf20Sopenharmony_ci		   int maxevents, long timeout)
18478c2ecf20Sopenharmony_ci{
18488c2ecf20Sopenharmony_ci	int res = 0, eavail, timed_out = 0;
18498c2ecf20Sopenharmony_ci	u64 slack = 0;
18508c2ecf20Sopenharmony_ci	wait_queue_entry_t wait;
18518c2ecf20Sopenharmony_ci	ktime_t expires, *to = NULL;
18528c2ecf20Sopenharmony_ci
18538c2ecf20Sopenharmony_ci	lockdep_assert_irqs_enabled();
18548c2ecf20Sopenharmony_ci
18558c2ecf20Sopenharmony_ci	if (timeout > 0) {
18568c2ecf20Sopenharmony_ci		struct timespec64 end_time = ep_set_mstimeout(timeout);
18578c2ecf20Sopenharmony_ci
18588c2ecf20Sopenharmony_ci		slack = select_estimate_accuracy(&end_time);
18598c2ecf20Sopenharmony_ci		to = &expires;
18608c2ecf20Sopenharmony_ci		*to = timespec64_to_ktime(end_time);
18618c2ecf20Sopenharmony_ci	} else if (timeout == 0) {
18628c2ecf20Sopenharmony_ci		/*
18638c2ecf20Sopenharmony_ci		 * Avoid the unnecessary trip to the wait queue loop, if the
18648c2ecf20Sopenharmony_ci		 * caller specified a non-blocking operation. We still need the
18658c2ecf20Sopenharmony_ci		 * lock because we could race with the irq callback and not see
18668c2ecf20Sopenharmony_ci		 * an epi being added to the ready list, thus incorrectly
18678c2ecf20Sopenharmony_ci		 * returning 0 back to userspace.
18688c2ecf20Sopenharmony_ci		 */
18698c2ecf20Sopenharmony_ci		timed_out = 1;
18708c2ecf20Sopenharmony_ci
18718c2ecf20Sopenharmony_ci		write_lock_irq(&ep->lock);
18728c2ecf20Sopenharmony_ci		eavail = ep_events_available(ep);
18738c2ecf20Sopenharmony_ci		write_unlock_irq(&ep->lock);
18748c2ecf20Sopenharmony_ci
18758c2ecf20Sopenharmony_ci		goto send_events;
18768c2ecf20Sopenharmony_ci	}
18778c2ecf20Sopenharmony_ci
18788c2ecf20Sopenharmony_cifetch_events:
18798c2ecf20Sopenharmony_ci
18808c2ecf20Sopenharmony_ci	if (!ep_events_available(ep))
18818c2ecf20Sopenharmony_ci		ep_busy_loop(ep, timed_out);
18828c2ecf20Sopenharmony_ci
18838c2ecf20Sopenharmony_ci	eavail = ep_events_available(ep);
18848c2ecf20Sopenharmony_ci	if (eavail)
18858c2ecf20Sopenharmony_ci		goto send_events;
18868c2ecf20Sopenharmony_ci
18878c2ecf20Sopenharmony_ci	/*
18888c2ecf20Sopenharmony_ci	 * Busy poll timed out.  Drop NAPI ID for now, we can add
18898c2ecf20Sopenharmony_ci	 * it back in when we have moved a socket with a valid NAPI
18908c2ecf20Sopenharmony_ci	 * ID onto the ready list.
18918c2ecf20Sopenharmony_ci	 */
18928c2ecf20Sopenharmony_ci	ep_reset_busy_poll_napi_id(ep);
18938c2ecf20Sopenharmony_ci
18948c2ecf20Sopenharmony_ci	do {
18958c2ecf20Sopenharmony_ci		/*
18968c2ecf20Sopenharmony_ci		 * Internally init_wait() uses autoremove_wake_function(),
18978c2ecf20Sopenharmony_ci		 * thus wait entry is removed from the wait queue on each
18988c2ecf20Sopenharmony_ci		 * wakeup. Why is this important? In case of several waiters,
18998c2ecf20Sopenharmony_ci		 * each new wakeup will hit the next waiter, giving it the
19008c2ecf20Sopenharmony_ci		 * chance to harvest new events. Otherwise a wakeup can be
19018c2ecf20Sopenharmony_ci		 * lost. This is also good performance-wise, because on the
19028c2ecf20Sopenharmony_ci		 * normal wakeup path there is no need to call
19038c2ecf20Sopenharmony_ci		 * __remove_wait_queue() explicitly, so ep->lock is not taken,
19048c2ecf20Sopenharmony_ci		 * which would otherwise stall event delivery.
19058c2ecf20Sopenharmony_ci		 *
19068c2ecf20Sopenharmony_ci		 * In fact, we now use an even more aggressive function that
19078c2ecf20Sopenharmony_ci		 * unconditionally removes, because we don't reuse the wait
19088c2ecf20Sopenharmony_ci		 * entry between loop iterations. This lets us also avoid the
19098c2ecf20Sopenharmony_ci		 * performance issue if a process is killed, causing all of its
19108c2ecf20Sopenharmony_ci		 * threads to wake up without being removed normally.
19118c2ecf20Sopenharmony_ci		 */
19128c2ecf20Sopenharmony_ci		init_wait(&wait);
19138c2ecf20Sopenharmony_ci		wait.func = ep_autoremove_wake_function;
19148c2ecf20Sopenharmony_ci
19158c2ecf20Sopenharmony_ci		write_lock_irq(&ep->lock);
19168c2ecf20Sopenharmony_ci		/*
19178c2ecf20Sopenharmony_ci		 * Barrierless variant, waitqueue_active() is called under
19188c2ecf20Sopenharmony_ci		 * the same lock on wakeup ep_poll_callback() side, so it
19198c2ecf20Sopenharmony_ci		 * is safe to avoid an explicit barrier.
19208c2ecf20Sopenharmony_ci		 */
19218c2ecf20Sopenharmony_ci		__set_current_state(TASK_INTERRUPTIBLE);
19228c2ecf20Sopenharmony_ci
19238c2ecf20Sopenharmony_ci		/*
19248c2ecf20Sopenharmony_ci		 * Do the final check under the lock. ep_scan_ready_list()
19258c2ecf20Sopenharmony_ci		 * plays with two lists (->rdllist and ->ovflist) and there
19268c2ecf20Sopenharmony_ci		 * is always a race when both lists are empty for a short
19278c2ecf20Sopenharmony_ci		 * period of time although events are pending, so the lock is
19288c2ecf20Sopenharmony_ci		 * important.
19298c2ecf20Sopenharmony_ci		 */
19308c2ecf20Sopenharmony_ci		eavail = ep_events_available(ep);
19318c2ecf20Sopenharmony_ci		if (!eavail) {
19328c2ecf20Sopenharmony_ci			if (signal_pending(current))
19338c2ecf20Sopenharmony_ci				res = -EINTR;
19348c2ecf20Sopenharmony_ci			else
19358c2ecf20Sopenharmony_ci				__add_wait_queue_exclusive(&ep->wq, &wait);
19368c2ecf20Sopenharmony_ci		}
19378c2ecf20Sopenharmony_ci		write_unlock_irq(&ep->lock);
19388c2ecf20Sopenharmony_ci
19398c2ecf20Sopenharmony_ci		if (!eavail && !res)
19408c2ecf20Sopenharmony_ci			timed_out = !schedule_hrtimeout_range(to, slack,
19418c2ecf20Sopenharmony_ci							      HRTIMER_MODE_ABS);
19428c2ecf20Sopenharmony_ci
19438c2ecf20Sopenharmony_ci		/*
19448c2ecf20Sopenharmony_ci		 * We were woken up, thus go and try to harvest some events.
19458c2ecf20Sopenharmony_ci		 * If timed out and still on the wait queue, recheck eavail
19468c2ecf20Sopenharmony_ci		 * carefully under lock, below.
19478c2ecf20Sopenharmony_ci		 */
19488c2ecf20Sopenharmony_ci		eavail = 1;
19498c2ecf20Sopenharmony_ci	} while (0);
19508c2ecf20Sopenharmony_ci
19518c2ecf20Sopenharmony_ci	__set_current_state(TASK_RUNNING);
19528c2ecf20Sopenharmony_ci
19538c2ecf20Sopenharmony_ci	if (!list_empty_careful(&wait.entry)) {
19548c2ecf20Sopenharmony_ci		write_lock_irq(&ep->lock);
19558c2ecf20Sopenharmony_ci		/*
19568c2ecf20Sopenharmony_ci		 * If the thread timed out and is not on the wait queue, it
19578c2ecf20Sopenharmony_ci		 * means that the thread was woken up after its timeout expired
19588c2ecf20Sopenharmony_ci		 * before it could reacquire the lock. Thus, when wait.entry is
19598c2ecf20Sopenharmony_ci		 * empty, it needs to harvest events.
19608c2ecf20Sopenharmony_ci		 */
19618c2ecf20Sopenharmony_ci		if (timed_out)
19628c2ecf20Sopenharmony_ci			eavail = list_empty(&wait.entry);
19638c2ecf20Sopenharmony_ci		__remove_wait_queue(&ep->wq, &wait);
19648c2ecf20Sopenharmony_ci		write_unlock_irq(&ep->lock);
19658c2ecf20Sopenharmony_ci	}
19668c2ecf20Sopenharmony_ci
19678c2ecf20Sopenharmony_cisend_events:
19688c2ecf20Sopenharmony_ci	if (fatal_signal_pending(current)) {
19698c2ecf20Sopenharmony_ci		/*
19708c2ecf20Sopenharmony_ci		 * Always short-circuit for fatal signals to allow
19718c2ecf20Sopenharmony_ci		 * threads to make a timely exit without the chance of
19728c2ecf20Sopenharmony_ci		 * finding more events available and fetching
19738c2ecf20Sopenharmony_ci		 * repeatedly.
19748c2ecf20Sopenharmony_ci		 */
19758c2ecf20Sopenharmony_ci		res = -EINTR;
19768c2ecf20Sopenharmony_ci	}
19778c2ecf20Sopenharmony_ci	/*
19788c2ecf20Sopenharmony_ci	 * Try to transfer events to user space. In case we get 0 events and
19798c2ecf20Sopenharmony_ci	 * there's still timeout left over, we try again in search of
19808c2ecf20Sopenharmony_ci	 * more luck.
19818c2ecf20Sopenharmony_ci	 */
19828c2ecf20Sopenharmony_ci	if (!res && eavail &&
19838c2ecf20Sopenharmony_ci	    !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
19848c2ecf20Sopenharmony_ci		goto fetch_events;
19858c2ecf20Sopenharmony_ci
19868c2ecf20Sopenharmony_ci	return res;
19878c2ecf20Sopenharmony_ci}
19888c2ecf20Sopenharmony_ci
19898c2ecf20Sopenharmony_ci/**
19908c2ecf20Sopenharmony_ci * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
19918c2ecf20Sopenharmony_ci *                      API, to verify that adding an epoll file inside another
19928c2ecf20Sopenharmony_ci *                      epoll structure, does not violate the constraints, in
19938c2ecf20Sopenharmony_ci *                      epoll structure does not violate the constraints, in
19948c2ecf20Sopenharmony_ci *                      result in excessive stack usage).
19958c2ecf20Sopenharmony_ci *
19968c2ecf20Sopenharmony_ci * @priv: Pointer to the epoll file to be currently checked.
19978c2ecf20Sopenharmony_ci * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
19988c2ecf20Sopenharmony_ci *          data structure pointer.
19998c2ecf20Sopenharmony_ci * @call_nests: Current depth of the @ep_call_nested() call stack.
20008c2ecf20Sopenharmony_ci *
20018c2ecf20Sopenharmony_ci * Returns: Returns zero if adding the epoll @file inside current epoll
20028c2ecf20Sopenharmony_ci *          structure @ep does not violate the constraints, or -1 otherwise.
20038c2ecf20Sopenharmony_ci */
20048c2ecf20Sopenharmony_cistatic int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
20058c2ecf20Sopenharmony_ci{
20068c2ecf20Sopenharmony_ci	int error = 0;
20078c2ecf20Sopenharmony_ci	struct file *file = priv;
20088c2ecf20Sopenharmony_ci	struct eventpoll *ep = file->private_data;
20098c2ecf20Sopenharmony_ci	struct eventpoll *ep_tovisit;
20108c2ecf20Sopenharmony_ci	struct rb_node *rbp;
20118c2ecf20Sopenharmony_ci	struct epitem *epi;
20128c2ecf20Sopenharmony_ci
20138c2ecf20Sopenharmony_ci	mutex_lock_nested(&ep->mtx, call_nests + 1);
20148c2ecf20Sopenharmony_ci	ep->gen = loop_check_gen;
20158c2ecf20Sopenharmony_ci	for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
20168c2ecf20Sopenharmony_ci		epi = rb_entry(rbp, struct epitem, rbn);
20178c2ecf20Sopenharmony_ci		if (unlikely(is_file_epoll(epi->ffd.file))) {
20188c2ecf20Sopenharmony_ci			ep_tovisit = epi->ffd.file->private_data;
20198c2ecf20Sopenharmony_ci			if (ep_tovisit->gen == loop_check_gen)
20208c2ecf20Sopenharmony_ci				continue;
20218c2ecf20Sopenharmony_ci			error = ep_call_nested(&poll_loop_ncalls,
20228c2ecf20Sopenharmony_ci					ep_loop_check_proc, epi->ffd.file,
20238c2ecf20Sopenharmony_ci					ep_tovisit, current);
20248c2ecf20Sopenharmony_ci			if (error != 0)
20258c2ecf20Sopenharmony_ci				break;
20268c2ecf20Sopenharmony_ci		} else {
20278c2ecf20Sopenharmony_ci			/*
20288c2ecf20Sopenharmony_ci			 * If we've reached a file that is not associated with
20298c2ecf20Sopenharmony_ci			 * an ep, then we need to check if the newly added
20308c2ecf20Sopenharmony_ci			 * links are going to add too many wakeup paths. We do
20318c2ecf20Sopenharmony_ci			 * this by adding it to the tfile_check_list, if it's
20328c2ecf20Sopenharmony_ci			 * not already there, and calling reverse_path_check()
20338c2ecf20Sopenharmony_ci			 * during ep_insert().
20348c2ecf20Sopenharmony_ci			 */
20358c2ecf20Sopenharmony_ci			if (list_empty(&epi->ffd.file->f_tfile_llink)) {
20368c2ecf20Sopenharmony_ci				if (get_file_rcu(epi->ffd.file))
20378c2ecf20Sopenharmony_ci					list_add(&epi->ffd.file->f_tfile_llink,
20388c2ecf20Sopenharmony_ci						 &tfile_check_list);
20398c2ecf20Sopenharmony_ci			}
20408c2ecf20Sopenharmony_ci		}
20418c2ecf20Sopenharmony_ci	}
20428c2ecf20Sopenharmony_ci	mutex_unlock(&ep->mtx);
20438c2ecf20Sopenharmony_ci
20448c2ecf20Sopenharmony_ci	return error;
20458c2ecf20Sopenharmony_ci}
20468c2ecf20Sopenharmony_ci
20478c2ecf20Sopenharmony_ci/**
20488c2ecf20Sopenharmony_ci * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
20498c2ecf20Sopenharmony_ci *                 inside another epoll file (represented by @ep) does not create
20508c2ecf20Sopenharmony_ci *                 closed loops or too deep chains.
20518c2ecf20Sopenharmony_ci *
20528c2ecf20Sopenharmony_ci * @ep: Pointer to the epoll private data structure.
20538c2ecf20Sopenharmony_ci * @file: Pointer to the epoll file to be checked.
20548c2ecf20Sopenharmony_ci *
20558c2ecf20Sopenharmony_ci * Returns: Returns zero if adding the epoll @file inside current epoll
20568c2ecf20Sopenharmony_ci *          structure @ep does not violate the constraints, or -1 otherwise.
20578c2ecf20Sopenharmony_ci */
20588c2ecf20Sopenharmony_cistatic int ep_loop_check(struct eventpoll *ep, struct file *file)
20598c2ecf20Sopenharmony_ci{
20608c2ecf20Sopenharmony_ci	return ep_call_nested(&poll_loop_ncalls,
20618c2ecf20Sopenharmony_ci			      ep_loop_check_proc, file, ep, current);
20628c2ecf20Sopenharmony_ci}

static void clear_tfile_check_list(void)
{
	struct file *file;

	/* first clear the tfile_check_list */
	while (!list_empty(&tfile_check_list)) {
		file = list_first_entry(&tfile_check_list, struct file,
					f_tfile_llink);
		list_del_init(&file->f_tfile_llink);
		fput(file);
	}
	INIT_LIST_HEAD(&tfile_check_list);
}

/*
 * Open an eventpoll file descriptor.
 */
static int do_epoll_create(int flags)
{
	int error, fd;
	struct eventpoll *ep = NULL;
	struct file *file;

	/* Check the EPOLL_* constant for consistency.  */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;
	/*
	 * Create the internal data structure ("struct eventpoll").
	 */
	error = ep_alloc(&ep);
	if (error < 0)
		return error;
	/*
	 * Create all the items needed to set up an eventpoll file. That is,
	 * a file structure and a free file descriptor.
	 */
	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0) {
		error = fd;
		goto out_free_ep;
	}
	file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_fd;
	}
	ep->file = file;
	fd_install(fd, file);
	return fd;

out_free_fd:
	put_unused_fd(fd);
out_free_ep:
	ep_free(ep);
	return error;
}

SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	return do_epoll_create(flags);
}

SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size <= 0)
		return -EINVAL;

	return do_epoll_create(0);
}
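
/*
 * Illustrative sketch (not part of the kernel build): minimal user-space use
 * of the two creation entry points above. As implemented in the legacy
 * syscall, the size argument of epoll_create() is only checked to be
 * positive and is otherwise ignored.
 *
 *	#include <sys/epoll.h>
 *
 *	int make_epoll_fd(void)
 *	{
 *		int epfd = epoll_create1(EPOLL_CLOEXEC);	// preferred interface
 *
 *		if (epfd < 0)
 *			epfd = epoll_create(1);		// legacy fallback, size is a hint only
 *		return epfd;
 *	}
 */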

static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
				   bool nonblock)
{
	if (!nonblock) {
		mutex_lock_nested(mutex, depth);
		return 0;
	}
	if (mutex_trylock(mutex))
		return 0;
	return -EAGAIN;
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
		 bool nonblock)
{
	int error;
	int full_check = 0;
	struct fd f, tf;
	struct eventpoll *ep;
	struct epitem *epi;
	struct eventpoll *tep = NULL;

	error = -EBADF;
	f = fdget(epfd);
	if (!f.file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tf = fdget(fd);
	if (!tf.file)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!file_can_poll(tf.file))
		goto error_tgt_fput;

	/* Check if EPOLLWAKEUP is allowed */
	if (ep_op_has_event(op))
		ep_take_care_of_epollwakeup(epds);

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. We also do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (f.file == tf.file || !is_file_epoll(f.file))
		goto error_tgt_fput;

	/*
	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
	 * Also, we do not currently support nested exclusive wakeups.
	 */
	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
		if (op == EPOLL_CTL_MOD)
			goto error_tgt_fput;
		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
			goto error_tgt_fput;
	}
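
	/*
	 * Illustrative sketch (not part of the kernel build): how the checks
	 * above look from user space. The descriptor names are hypothetical.
	 *
	 *	struct epoll_event ev = { .events = EPOLLIN | EPOLLEXCLUSIVE };
	 *
	 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);	// accepted
	 *	epoll_ctl(epfd, EPOLL_CTL_MOD, sock_fd, &ev);	// rejected: -EINVAL
	 *	epoll_ctl(epfd, EPOLL_CTL_ADD, other_epfd, &ev); // rejected: nested exclusive
	 */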

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/*
	 * When we insert an epoll file descriptor inside another epoll file
	 * descriptor, there is the chance of creating closed loops, which are
	 * better handled here than in more critical paths. While we are
	 * checking for loops we also determine the list of files reachable
	 * and hang them on the tfile_check_list, so we can check that we
	 * haven't created too many possible wakeup paths.
	 *
	 * We do not need to take the global 'epmutex' on EPOLL_CTL_ADD when
	 * the epoll file descriptor is attaching directly to a wakeup source,
	 * unless the epoll file descriptor is nested. The purpose of taking the
	 * 'epmutex' on add is to prevent complex topologies such as loops and
	 * deep wakeup paths from forming in parallel through multiple
	 * EPOLL_CTL_ADD operations.
	 */
	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
	if (error)
		goto error_tgt_fput;
	if (op == EPOLL_CTL_ADD) {
		if (!list_empty(&f.file->f_ep_links) ||
				ep->gen == loop_check_gen ||
						is_file_epoll(tf.file)) {
			mutex_unlock(&ep->mtx);
			error = epoll_mutex_lock(&epmutex, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			loop_check_gen++;
			full_check = 1;
			if (is_file_epoll(tf.file)) {
				error = -ELOOP;
				if (ep_loop_check(ep, tf.file) != 0)
					goto error_tgt_fput;
			} else {
				get_file(tf.file);
				list_add(&tf.file->f_tfile_llink,
							&tfile_check_list);
			}
			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
			if (error)
				goto error_tgt_fput;
			if (is_file_epoll(tf.file)) {
				tep = tf.file->private_data;
				error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
				if (error) {
					mutex_unlock(&ep->mtx);
					goto error_tgt_fput;
				}
			}
		}
	}

	/*
	 * Try to look up the file inside our RB tree. Since we grabbed "mtx"
	 * above, we can be sure that the item returned by ep_find() stays
	 * valid until we release the mutex.
	 */
	epi = ep_find(ep, tf.file, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds->events |= EPOLLERR | EPOLLHUP;
			error = ep_insert(ep, epds, tf.file, fd, full_check);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
				epds->events |= EPOLLERR | EPOLLHUP;
				error = ep_modify(ep, epi, epds);
			}
		} else
			error = -ENOENT;
		break;
	}
	if (tep != NULL)
		mutex_unlock(&tep->mtx);
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (full_check) {
		clear_tfile_check_list();
		loop_check_gen++;
		mutex_unlock(&epmutex);
	}

	fdput(tf);
error_fput:
	fdput(f);
error_return:

	return error;
}
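
/*
 * Illustrative sketch (not part of the kernel build): a typical user-space
 * sequence driving the three operations handled above. The descriptor names
 * are hypothetical.
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event ev = {
 *		.events = EPOLLIN,
 *		.data.fd = sock_fd,
 *	};
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);	// insert: -EEXIST if already present
 *	ev.events = EPOLLIN | EPOLLOUT;
 *	epoll_ctl(epfd, EPOLL_CTL_MOD, sock_fd, &ev);	// modify: -ENOENT if never added
 *	epoll_ctl(epfd, EPOLL_CTL_DEL, sock_fd, NULL);	// remove: the event argument is ignored
 */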

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	struct epoll_event epds;

	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		return -EFAULT;

	return do_epoll_ctl(epfd, op, fd, &epds, false);
}

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
			 int maxevents, int timeout)
{
	int error;
	struct fd f;
	struct eventpoll *ep;

	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writable */
	if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
		return -EFAULT;

	/* Get the "struct file *" for the eventpoll file */
	f = fdget(epfd);
	if (!f.file)
		return -EBADF;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!is_file_epoll(f.file))
		goto error_fput;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = f.file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

error_fput:
	fdput(f);
	return error;
}

SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout)
{
	return do_epoll_wait(epfd, events, maxevents, timeout);
}
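
/*
 * Illustrative sketch (not part of the kernel build): the canonical user-space
 * wait loop that ends up in do_epoll_wait() above. handle_io() is a
 * hypothetical callback.
 *
 *	#include <sys/epoll.h>
 *
 *	struct epoll_event events[64];
 *
 *	for (;;) {
 *		int n = epoll_wait(epfd, events, 64, -1);	// -1: block indefinitely
 *
 *		if (n < 0)
 *			break;					// or retry on EINTR
 *		for (int i = 0; i < n; i++)
 *			handle_io(events[i].data.fd, events[i].events);
 *	}
 */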

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_pwait(2).
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	int error;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	error = set_user_sigmask(sigmask, sigsetsize);
	if (error)
		return error;

	error = do_epoll_wait(epfd, events, maxevents, timeout);
	restore_saved_sigmask_unless(error == -EINTR);

	return error;
}
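
/*
 * Illustrative sketch (not part of the kernel build): epoll_pwait() installs
 * the caller-supplied signal mask for the duration of the wait and restores
 * the old one afterwards, which is what the set_user_sigmask() /
 * restore_saved_sigmask_unless() pair above implements.
 *
 *	#include <sys/epoll.h>
 *	#include <signal.h>
 *
 *	sigset_t mask;
 *	struct epoll_event events[16];
 *
 *	sigemptyset(&mask);
 *	sigaddset(&mask, SIGINT);
 *	// The wait runs with exactly this mask: SIGINT is blocked for its
 *	// duration, everything else is deliverable.
 *	epoll_pwait(epfd, events, 16, -1, &mask);
 */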

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
			struct epoll_event __user *, events,
			int, maxevents, int, timeout,
			const compat_sigset_t __user *, sigmask,
			compat_size_t, sigsetsize)
{
	long err;

	/*
	 * If the caller wants a certain signal mask to be set during the wait,
	 * we apply it here.
	 */
	err = set_compat_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;

	err = do_epoll_wait(epfd, events, maxevents, timeout);
	restore_saved_sigmask_unless(err == -EINTR);

	return err;
}
#endif

static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allow the top 4% of low memory to be allocated for epoll watches
	 * (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);
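
	/*
	 * Illustrative arithmetic (an approximation, not authoritative): the
	 * expression above reserves 1/25th (4%) of low memory. With 1 GiB of
	 * low memory that is about 43 MB; assuming EP_ITEM_COST is on the
	 * order of 200 bytes per watch (an epitem plus its poll hook), the
	 * per-user limit works out to roughly 200,000 watches.
	 */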

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loop checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
		sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);