1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Shared application/kernel submission and completion ring pairs, for
4 * supporting fast/efficient IO.
5 *
6 * A note on the read/write ordering memory barriers that are matched between
7 * the application and kernel side.
8 *
9 * After the application reads the CQ ring tail, it must use an
10 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
11 * before writing the tail (using smp_load_acquire to read the tail will
12 * do). It also needs a smp_mb() before updating CQ head (ordering the
13 * entry load(s) with the head store), pairing with an implicit barrier
14 * through a control-dependency in io_get_cqe (smp_store_release to
15 * store head will do). Failure to do so could lead to reading invalid
16 * CQ entries.
17 *
18 * Likewise, the application must use an appropriate smp_wmb() before
19 * writing the SQ tail (ordering SQ entry stores with the tail store),
20 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
21 * to store the tail will do). And it needs a barrier ordering the SQ
22 * head load before writing new SQ entries (smp_load_acquire to read
23 * head will do).
24 *
25 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
26 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
27 * updating the SQ tail; a full memory barrier smp_mb() is needed
28 * between.
29 *
30 * Also see the sketch below, and the examples in the liburing library:
31 *
32 *	git://git.kernel.dk/liburing
33 *
34 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
35 * from data shared between the kernel and application. This is done both
36 * for ordering purposes and to ensure that once a value is loaded from
37 * data that the application could potentially modify, it remains stable.
38 *
39 * Copyright (C) 2018-2019 Jens Axboe
40 * Copyright (c) 2018-2019 Christoph Hellwig
41 */
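/*
 * Illustrative userspace sketch of the ordering rules above (roughly what
 * liburing does; the ring/array/mask names are hypothetical stand-ins for
 * the fields published via io_sqring_offsets/io_cqring_offsets):
 *
 *	// submit: fill the SQE and its index slot, then publish the SQ tail
 *	sqes[idx] = ...;
 *	sq_array[tail & sq_ring_mask] = idx;
 *	smp_store_release(sq_tail, tail + 1);	// pairs with kernel acquire
 *
 *	// complete: acquire the CQ tail, copy the CQE, then release the head
 *	while (head != smp_load_acquire(cq_tail)) {
 *		cqe = cqes[head & cq_ring_mask];
 *		smp_store_release(cq_head, ++head);
 *	}
 */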
42#include <linux/kernel.h>
43#include <linux/init.h>
44#include <linux/errno.h>
45#include <linux/syscalls.h>
46#include <linux/compat.h>
47#include <net/compat.h>
48#include <linux/refcount.h>
49#include <linux/uio.h>
50#include <linux/bits.h>
51
52#include <linux/sched/signal.h>
53#include <linux/fs.h>
54#include <linux/file.h>
55#include <linux/fdtable.h>
56#include <linux/mm.h>
57#include <linux/mman.h>
58#include <linux/percpu.h>
59#include <linux/slab.h>
60#include <linux/blkdev.h>
61#include <linux/bvec.h>
62#include <linux/net.h>
63#include <net/sock.h>
64#include <net/af_unix.h>
65#include <net/scm.h>
66#include <linux/anon_inodes.h>
67#include <linux/sched/mm.h>
68#include <linux/uaccess.h>
69#include <linux/nospec.h>
70#include <linux/sizes.h>
71#include <linux/hugetlb.h>
72#include <linux/highmem.h>
73#include <linux/namei.h>
74#include <linux/fsnotify.h>
75#include <linux/fadvise.h>
76#include <linux/eventpoll.h>
77#include <linux/splice.h>
78#include <linux/task_work.h>
79#include <linux/pagemap.h>
80#include <linux/io_uring.h>
81#include <linux/tracehook.h>
82
83#define CREATE_TRACE_POINTS
84#include <trace/events/io_uring.h>
85
86#include <uapi/linux/io_uring.h>
87
88#include "../fs/internal.h"
89#include "io-wq.h"
90
91#define IORING_MAX_ENTRIES	32768
92#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
94
95/* only define max values; these only cap user-supplied counts */
96#define IORING_MAX_FIXED_FILES	(1U << 15)
97#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
98				 IORING_REGISTER_LAST + IORING_OP_LAST)
99
100#define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
101#define IO_RSRC_TAG_TABLE_MAX	(1U << IO_RSRC_TAG_TABLE_SHIFT)
102#define IO_RSRC_TAG_TABLE_MASK	(IO_RSRC_TAG_TABLE_MAX - 1)
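/*
 * The rsrc tags form a two-level table: each second-level table holds one
 * page worth of u64 tags (PAGE_SHIFT - 3 == log2(PAGE_SIZE / sizeof(u64)),
 * e.g. 512 entries with 4K pages), so a resource index splits into a table
 * number (index >> SHIFT) and an offset within that table (index & MASK).
 */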
103
104#define IORING_MAX_REG_BUFFERS	(1U << 14)
105
106#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK|	\
107				IOSQE_IO_HARDLINK | IOSQE_ASYNC | \
108				IOSQE_BUFFER_SELECT)
109#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
110				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS)
111
112#define IO_TCTX_REFS_CACHE_NR	(1U << 10)
113
114struct io_uring {
115	u32 head ____cacheline_aligned_in_smp;
116	u32 tail ____cacheline_aligned_in_smp;
117};
118
119/*
120 * This data is shared with the application through the mmap at offsets
121 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
122 *
123 * The offsets to the member fields are published through struct
124 * io_sqring_offsets when calling io_uring_setup.
125 */
126struct io_rings {
127	/*
128	 * Head and tail offsets into the ring; the offsets need to be
129	 * masked to get valid indices.
130	 *
131	 * The kernel controls the head of the sq ring and the tail of the cq
132	 * ring, and the application controls the tail of the sq ring and the
133	 * head of the cq ring.
134	 */
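	/* e.g. consumers read cqes[cq.head & cq_ring_mask], then advance cq.head */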
135	struct io_uring		sq, cq;
136	/*
137	 * Bitmasks to apply to head and tail offsets (constant, equals
138	 * ring_entries - 1)
139	 */
140	u32			sq_ring_mask, cq_ring_mask;
141	/* Ring sizes (constant, power of 2) */
142	u32			sq_ring_entries, cq_ring_entries;
143	/*
144	 * Number of invalid entries dropped by the kernel due to an
145	 * invalid index stored in the array.
146	 *
147	 * Written by the kernel, shouldn't be modified by the
148	 * application (i.e. get number of "new events" by comparing to
149	 * cached value).
150	 *
151	 * After the application has read a new SQ head value, this
152	 * counter includes all submissions that were dropped while
153	 * advancing to the new SQ head (and possibly more).
154	 */
155	u32			sq_dropped;
156	/*
157	 * Runtime SQ flags
158	 *
159	 * Written by the kernel, shouldn't be modified by the
160	 * application.
161	 *
162	 * The application needs a full memory barrier before checking
163	 * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
164	 */
165	u32			sq_flags;
166	/*
167	 * Runtime CQ flags
168	 *
169	 * Written by the application, shouldn't be modified by the
170	 * kernel.
171	 */
172	u32			cq_flags;
173	/*
174	 * Number of completion events lost because the queue was full;
175	 * this should be avoided by the application by making sure
176	 * there are not more requests pending than there is space in
177	 * the completion queue.
178	 *
179	 * Written by the kernel, shouldn't be modified by the
180	 * application (i.e. get number of "new events" by comparing to
181	 * cached value).
182	 *
183	 * As completion events come in out of order this counter is not
184	 * ordered with any other data.
185	 */
186	u32			cq_overflow;
187	/*
188	 * Ring buffer of completion events.
189	 *
190	 * The kernel writes completion events fresh every time they are
191	 * produced, so the application is allowed to modify pending
192	 * entries.
193	 */
194	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
195};
196
197enum io_uring_cmd_flags {
198	IO_URING_F_NONBLOCK		= 1,
199	IO_URING_F_COMPLETE_DEFER	= 2,
200};
201
202struct io_mapped_ubuf {
203	u64		ubuf;
204	u64		ubuf_end;
205	unsigned int	nr_bvecs;
206	unsigned long	acct_pages;
207	struct bio_vec	bvec[];
208};
209
210struct io_ring_ctx;
211
212struct io_overflow_cqe {
213	struct io_uring_cqe cqe;
214	struct list_head list;
215};
216
217struct io_fixed_file {
218	/* file * with additional FFS_* flags */
219	unsigned long file_ptr;
220};
221
222struct io_rsrc_put {
223	struct list_head list;
224	u64 tag;
225	union {
226		void *rsrc;
227		struct file *file;
228		struct io_mapped_ubuf *buf;
229	};
230};
231
232struct io_file_table {
233	struct io_fixed_file *files;
234};
235
236struct io_rsrc_node {
237	struct percpu_ref		refs;
238	struct list_head		node;
239	struct list_head		rsrc_list;
240	struct io_rsrc_data		*rsrc_data;
241	struct llist_node		llist;
242	bool				done;
243};
244
245typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
246
247struct io_rsrc_data {
248	struct io_ring_ctx		*ctx;
249
250	u64				**tags;
251	unsigned int			nr;
252	rsrc_put_fn			*do_put;
253	atomic_t			refs;
254	struct completion		done;
255	bool				quiesce;
256};
257
258struct io_buffer {
259	struct list_head list;
260	__u64 addr;
261	__u32 len;
262	__u16 bid;
263};
264
265struct io_restriction {
266	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
267	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
268	u8 sqe_flags_allowed;
269	u8 sqe_flags_required;
270	bool registered;
271};
272
273enum {
274	IO_SQ_THREAD_SHOULD_STOP = 0,
275	IO_SQ_THREAD_SHOULD_PARK,
276};
277
278struct io_sq_data {
279	refcount_t		refs;
280	atomic_t		park_pending;
281	struct mutex		lock;
282
283	/* ctx's that are using this sqd */
284	struct list_head	ctx_list;
285
286	struct task_struct	*thread;
287	struct wait_queue_head	wait;
288
289	unsigned		sq_thread_idle;
290	int			sq_cpu;
291	pid_t			task_pid;
292	pid_t			task_tgid;
293
294	unsigned long		state;
295	struct completion	exited;
296};
297
298#define IO_COMPL_BATCH			32
299#define IO_REQ_CACHE_SIZE		32
300#define IO_REQ_ALLOC_BATCH		8
301
302struct io_submit_link {
303	struct io_kiocb		*head;
304	struct io_kiocb		*last;
305};
306
307struct io_submit_state {
308	struct blk_plug		plug;
309	struct io_submit_link	link;
310
311	/*
312	 * io_kiocb alloc cache
313	 */
314	void			*reqs[IO_REQ_CACHE_SIZE];
315	unsigned int		free_reqs;
316
317	bool			plug_started;
318
319	/*
320	 * Batch completion logic
321	 */
322	struct io_kiocb		*compl_reqs[IO_COMPL_BATCH];
323	unsigned int		compl_nr;
324	/* inline/task_work completion list, under ->uring_lock */
325	struct list_head	free_list;
326
327	unsigned int		ios_left;
328};
329
330struct io_ring_ctx {
331	/* const or read-mostly hot data */
332	struct {
333		struct percpu_ref	refs;
334
335		struct io_rings		*rings;
336		unsigned int		flags;
337		unsigned int		compat: 1;
338		unsigned int		drain_next: 1;
339		unsigned int		eventfd_async: 1;
340		unsigned int		restricted: 1;
341		unsigned int		off_timeout_used: 1;
342		unsigned int		drain_active: 1;
343	} ____cacheline_aligned_in_smp;
344
345	/* submission data */
346	struct {
347		struct mutex		uring_lock;
348
349		/*
350		 * Ring buffer of indices into array of io_uring_sqe, which is
351		 * mmapped by the application using the IORING_OFF_SQES offset.
352		 *
353		 * This indirection could e.g. be used to assign fixed
354		 * io_uring_sqe entries to operations and only submit them to
355		 * the queue when needed.
356		 *
357		 * The kernel modifies neither the indices array nor the entries
358		 * array.
359		 */
360		u32			*sq_array;
361		struct io_uring_sqe	*sq_sqes;
362		unsigned		cached_sq_head;
363		unsigned		sq_entries;
364		struct list_head	defer_list;
365
366		/*
367		 * Fixed resources fast path, should be accessed only under
368		 * uring_lock, and updated through io_uring_register(2)
369		 */
370		struct io_rsrc_node	*rsrc_node;
371		struct io_file_table	file_table;
372		unsigned		nr_user_files;
373		unsigned		nr_user_bufs;
374		struct io_mapped_ubuf	**user_bufs;
375
376		struct io_submit_state	submit_state;
377		struct list_head	timeout_list;
378		struct list_head	ltimeout_list;
379		struct list_head	cq_overflow_list;
380		struct xarray		io_buffers;
381		struct xarray		personalities;
382		u32			pers_next;
383		unsigned		sq_thread_idle;
384	} ____cacheline_aligned_in_smp;
385
386	/* IRQ completion list, under ->completion_lock */
387	struct list_head	locked_free_list;
388	unsigned int		locked_free_nr;
389
390	const struct cred	*sq_creds;	/* cred used for __io_sq_thread() */
391	struct io_sq_data	*sq_data;	/* if using sq thread polling */
392
393	struct wait_queue_head	sqo_sq_wait;
394	struct list_head	sqd_list;
395
396	unsigned long		check_cq_overflow;
397
398	struct {
399		unsigned		cached_cq_tail;
400		unsigned		cq_entries;
401		struct eventfd_ctx	*cq_ev_fd;
402		struct wait_queue_head	poll_wait;
403		struct wait_queue_head	cq_wait;
404		unsigned		cq_extra;
405		atomic_t		cq_timeouts;
406		unsigned		cq_last_tm_flush;
407	} ____cacheline_aligned_in_smp;
408
409	struct {
410		spinlock_t		completion_lock;
411
412		spinlock_t		timeout_lock;
413
414		/*
415		 * ->iopoll_list is protected by the ctx->uring_lock for
416		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
417		 * For SQPOLL, only the single threaded io_sq_thread() will
418		 * manipulate the list, hence no extra locking is needed there.
419		 */
420		struct list_head	iopoll_list;
421		struct hlist_head	*cancel_hash;
422		unsigned		cancel_hash_bits;
423		bool			poll_multi_queue;
424	} ____cacheline_aligned_in_smp;
425
426	struct io_restriction		restrictions;
427
428	/* slow path rsrc auxiliary data, used by update/register */
429	struct {
430		struct io_rsrc_node		*rsrc_backup_node;
431		struct io_mapped_ubuf		*dummy_ubuf;
432		struct io_rsrc_data		*file_data;
433		struct io_rsrc_data		*buf_data;
434
435		struct delayed_work		rsrc_put_work;
436		struct llist_head		rsrc_put_llist;
437		struct list_head		rsrc_ref_list;
438		spinlock_t			rsrc_ref_lock;
439	};
440
441	/* Keep this last, we don't need it for the fast path */
442	struct {
443		#if defined(CONFIG_UNIX)
444			struct socket		*ring_sock;
445		#endif
446		/* hashed buffered write serialization */
447		struct io_wq_hash		*hash_map;
448
449		/* Only used for accounting purposes */
450		struct user_struct		*user;
451		struct mm_struct		*mm_account;
452
453		/* ctx exit and cancelation */
454		struct llist_head		fallback_llist;
455		struct delayed_work		fallback_work;
456		struct work_struct		exit_work;
457		struct list_head		tctx_list;
458		struct completion		ref_comp;
459		u32				iowq_limits[2];
460		bool				iowq_limits_set;
461	};
462};
463
464struct io_uring_task {
465	/* submission side */
466	int			cached_refs;
467	struct xarray		xa;
468	struct wait_queue_head	wait;
469	const struct io_ring_ctx *last;
470	struct io_wq		*io_wq;
471	struct percpu_counter	inflight;
472	atomic_t		inflight_tracked;
473	atomic_t		in_idle;
474
475	spinlock_t		task_lock;
476	struct io_wq_work_list	task_list;
477	struct callback_head	task_work;
478	bool			task_running;
479};
480
481/*
482 * First field must be the file pointer in all the
483 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
484 */
485struct io_poll_iocb {
486	struct file			*file;
487	struct wait_queue_head		*head;
488	__poll_t			events;
489	int				retries;
490	struct wait_queue_entry		wait;
491};
492
493struct io_poll_update {
494	struct file			*file;
495	u64				old_user_data;
496	u64				new_user_data;
497	__poll_t			events;
498	bool				update_events;
499	bool				update_user_data;
500};
501
502struct io_close {
503	struct file			*file;
504	int				fd;
505	u32				file_slot;
506};
507
508struct io_timeout_data {
509	struct io_kiocb			*req;
510	struct hrtimer			timer;
511	struct timespec64		ts;
512	enum hrtimer_mode		mode;
513	u32				flags;
514};
515
516struct io_accept {
517	struct file			*file;
518	struct sockaddr __user		*addr;
519	int __user			*addr_len;
520	int				flags;
521	u32				file_slot;
522	unsigned long			nofile;
523};
524
525struct io_sync {
526	struct file			*file;
527	loff_t				len;
528	loff_t				off;
529	int				flags;
530	int				mode;
531};
532
533struct io_cancel {
534	struct file			*file;
535	u64				addr;
536};
537
538struct io_timeout {
539	struct file			*file;
540	u32				off;
541	u32				target_seq;
542	struct list_head		list;
543	/* head of the link, used by linked timeouts only */
544	struct io_kiocb			*head;
545	/* for linked completions */
546	struct io_kiocb			*prev;
547};
548
549struct io_timeout_rem {
550	struct file			*file;
551	u64				addr;
552
553	/* timeout update */
554	struct timespec64		ts;
555	u32				flags;
556	bool				ltimeout;
557};
558
559struct io_rw {
560	/* NOTE: kiocb has the file as the first member, so don't do it here */
561	struct kiocb			kiocb;
562	u64				addr;
563	u64				len;
564};
565
566struct io_connect {
567	struct file			*file;
568	struct sockaddr __user		*addr;
569	int				addr_len;
570};
571
572struct io_sr_msg {
573	struct file			*file;
574	union {
575		struct compat_msghdr __user	*umsg_compat;
576		struct user_msghdr __user	*umsg;
577		void __user			*buf;
578	};
579	int				msg_flags;
580	int				bgid;
581	size_t				len;
582	size_t				done_io;
583	struct io_buffer		*kbuf;
584	void __user			*msg_control;
585};
586
587struct io_open {
588	struct file			*file;
589	int				dfd;
590	u32				file_slot;
591	struct filename			*filename;
592	struct open_how			how;
593	unsigned long			nofile;
594};
595
596struct io_rsrc_update {
597	struct file			*file;
598	u64				arg;
599	u32				nr_args;
600	u32				offset;
601};
602
603struct io_fadvise {
604	struct file			*file;
605	u64				offset;
606	u32				len;
607	u32				advice;
608};
609
610struct io_madvise {
611	struct file			*file;
612	u64				addr;
613	u32				len;
614	u32				advice;
615};
616
617struct io_epoll {
618	struct file			*file;
619	int				epfd;
620	int				op;
621	int				fd;
622	struct epoll_event		event;
623};
624
625struct io_splice {
626	struct file			*file_out;
627	loff_t				off_out;
628	loff_t				off_in;
629	u64				len;
630	int				splice_fd_in;
631	unsigned int			flags;
632};
633
634struct io_provide_buf {
635	struct file			*file;
636	__u64				addr;
637	__u32				len;
638	__u32				bgid;
639	__u16				nbufs;
640	__u16				bid;
641};
642
643struct io_statx {
644	struct file			*file;
645	int				dfd;
646	unsigned int			mask;
647	unsigned int			flags;
648	const char __user		*filename;
649	struct statx __user		*buffer;
650};
651
652struct io_shutdown {
653	struct file			*file;
654	int				how;
655};
656
657struct io_rename {
658	struct file			*file;
659	int				old_dfd;
660	int				new_dfd;
661	struct filename			*oldpath;
662	struct filename			*newpath;
663	int				flags;
664};
665
666struct io_unlink {
667	struct file			*file;
668	int				dfd;
669	int				flags;
670	struct filename			*filename;
671};
672
673struct io_mkdir {
674	struct file			*file;
675	int				dfd;
676	umode_t				mode;
677	struct filename			*filename;
678};
679
680struct io_symlink {
681	struct file			*file;
682	int				new_dfd;
683	struct filename			*oldpath;
684	struct filename			*newpath;
685};
686
687struct io_hardlink {
688	struct file			*file;
689	int				old_dfd;
690	int				new_dfd;
691	struct filename			*oldpath;
692	struct filename			*newpath;
693	int				flags;
694};
695
696struct io_completion {
697	struct file			*file;
698	u32				cflags;
699};
700
701struct io_async_connect {
702	struct sockaddr_storage		address;
703};
704
705struct io_async_msghdr {
706	struct iovec			fast_iov[UIO_FASTIOV];
707	/* points to an allocated iov, if NULL we use fast_iov instead */
708	struct iovec			*free_iov;
709	struct sockaddr __user		*uaddr;
710	struct msghdr			msg;
711	struct sockaddr_storage		addr;
712};
713
714struct io_async_rw {
715	struct iovec			fast_iov[UIO_FASTIOV];
716	const struct iovec		*free_iovec;
717	struct iov_iter			iter;
718	struct iov_iter_state		iter_state;
719	size_t				bytes_done;
720	struct wait_page_queue		wpq;
721};
722
723enum {
724	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
725	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
726	REQ_F_LINK_BIT		= IOSQE_IO_LINK_BIT,
727	REQ_F_HARDLINK_BIT	= IOSQE_IO_HARDLINK_BIT,
728	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
729	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
730
731	/* first byte is taken by user flags, shift it to not overlap */
732	REQ_F_FAIL_BIT		= 8,
733	REQ_F_INFLIGHT_BIT,
734	REQ_F_CUR_POS_BIT,
735	REQ_F_NOWAIT_BIT,
736	REQ_F_LINK_TIMEOUT_BIT,
737	REQ_F_NEED_CLEANUP_BIT,
738	REQ_F_POLLED_BIT,
739	REQ_F_BUFFER_SELECTED_BIT,
740	REQ_F_COMPLETE_INLINE_BIT,
741	REQ_F_REISSUE_BIT,
742	REQ_F_CREDS_BIT,
743	REQ_F_REFCOUNT_BIT,
744	REQ_F_ARM_LTIMEOUT_BIT,
745	REQ_F_PARTIAL_IO_BIT,
746	/* keep async read/write and isreg together and in order */
747	REQ_F_NOWAIT_READ_BIT,
748	REQ_F_NOWAIT_WRITE_BIT,
749	REQ_F_ISREG_BIT,
750
751	/* not a real bit, just to check we're not overflowing the space */
752	__REQ_F_LAST_BIT,
753};
754
755enum {
756	/* ctx owns file */
757	REQ_F_FIXED_FILE	= BIT(REQ_F_FIXED_FILE_BIT),
758	/* drain existing IO first */
759	REQ_F_IO_DRAIN		= BIT(REQ_F_IO_DRAIN_BIT),
760	/* linked sqes */
761	REQ_F_LINK		= BIT(REQ_F_LINK_BIT),
762	/* doesn't sever on completion < 0 */
763	REQ_F_HARDLINK		= BIT(REQ_F_HARDLINK_BIT),
764	/* IOSQE_ASYNC */
765	REQ_F_FORCE_ASYNC	= BIT(REQ_F_FORCE_ASYNC_BIT),
766	/* IOSQE_BUFFER_SELECT */
767	REQ_F_BUFFER_SELECT	= BIT(REQ_F_BUFFER_SELECT_BIT),
768
769	/* fail rest of links */
770	REQ_F_FAIL		= BIT(REQ_F_FAIL_BIT),
771	/* on inflight list, should be cancelled and waited on exit reliably */
772	REQ_F_INFLIGHT		= BIT(REQ_F_INFLIGHT_BIT),
773	/* read/write uses file position */
774	REQ_F_CUR_POS		= BIT(REQ_F_CUR_POS_BIT),
775	/* must not punt to workers */
776	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
777	/* has or had linked timeout */
778	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
779	/* needs cleanup */
780	REQ_F_NEED_CLEANUP	= BIT(REQ_F_NEED_CLEANUP_BIT),
781	/* already went through poll handler */
782	REQ_F_POLLED		= BIT(REQ_F_POLLED_BIT),
783	/* buffer already selected */
784	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
785	/* completion is deferred through io_comp_state */
786	REQ_F_COMPLETE_INLINE	= BIT(REQ_F_COMPLETE_INLINE_BIT),
787	/* caller should reissue async */
788	REQ_F_REISSUE		= BIT(REQ_F_REISSUE_BIT),
789	/* supports async reads */
790	REQ_F_NOWAIT_READ	= BIT(REQ_F_NOWAIT_READ_BIT),
791	/* supports async writes */
792	REQ_F_NOWAIT_WRITE	= BIT(REQ_F_NOWAIT_WRITE_BIT),
793	/* regular file */
794	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
795	/* has creds assigned */
796	REQ_F_CREDS		= BIT(REQ_F_CREDS_BIT),
797	/* skip refcounting if not set */
798	REQ_F_REFCOUNT		= BIT(REQ_F_REFCOUNT_BIT),
799	/* there is a linked timeout that has to be armed */
800	REQ_F_ARM_LTIMEOUT	= BIT(REQ_F_ARM_LTIMEOUT_BIT),
801	/* request has already done partial IO */
802	REQ_F_PARTIAL_IO	= BIT(REQ_F_PARTIAL_IO_BIT),
803};
804
805struct async_poll {
806	struct io_poll_iocb	poll;
807	struct io_poll_iocb	*double_poll;
808};
809
810typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
811
812struct io_task_work {
813	union {
814		struct io_wq_work_node	node;
815		struct llist_node	fallback_node;
816	};
817	io_req_tw_func_t		func;
818};
819
820enum {
821	IORING_RSRC_FILE		= 0,
822	IORING_RSRC_BUFFER		= 1,
823};
824
825/*
826 * NOTE! Each of the iocb union members has the file pointer
827 * as the first entry in their struct definition. So you can
828 * access the file pointer through any of the sub-structs,
829 * or directly as just 'ki_filp' in this struct.
830 */
831struct io_kiocb {
832	union {
833		struct file		*file;
834		struct io_rw		rw;
835		struct io_poll_iocb	poll;
836		struct io_poll_update	poll_update;
837		struct io_accept	accept;
838		struct io_sync		sync;
839		struct io_cancel	cancel;
840		struct io_timeout	timeout;
841		struct io_timeout_rem	timeout_rem;
842		struct io_connect	connect;
843		struct io_sr_msg	sr_msg;
844		struct io_open		open;
845		struct io_close		close;
846		struct io_rsrc_update	rsrc_update;
847		struct io_fadvise	fadvise;
848		struct io_madvise	madvise;
849		struct io_epoll		epoll;
850		struct io_splice	splice;
851		struct io_provide_buf	pbuf;
852		struct io_statx		statx;
853		struct io_shutdown	shutdown;
854		struct io_rename	rename;
855		struct io_unlink	unlink;
856		struct io_mkdir		mkdir;
857		struct io_symlink	symlink;
858		struct io_hardlink	hardlink;
859		/* use only after cleaning per-op data, see io_clean_op() */
860		struct io_completion	compl;
861	};
862
863	/* opcode allocated if it needs to store data for async defer */
864	void				*async_data;
865	u8				opcode;
866	/* polled IO has completed */
867	u8				iopoll_completed;
868
869	u16				buf_index;
870	u32				result;
871
872	struct io_ring_ctx		*ctx;
873	unsigned int			flags;
874	atomic_t			refs;
875	struct task_struct		*task;
876	u64				user_data;
877
878	struct io_kiocb			*link;
879	struct percpu_ref		*fixed_rsrc_refs;
880
881	/* used with ctx->iopoll_list with reads/writes */
882	struct list_head		inflight_entry;
883	struct io_task_work		io_task_work;
884	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
885	struct hlist_node		hash_node;
886	struct async_poll		*apoll;
887	struct io_wq_work		work;
888	const struct cred		*creds;
889
890	/* store used ubuf, so we can prevent reloading */
891	struct io_mapped_ubuf		*imu;
892	/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
893	struct io_buffer		*kbuf;
894	atomic_t			poll_refs;
895};
896
897struct io_tctx_node {
898	struct list_head	ctx_node;
899	struct task_struct	*task;
900	struct io_ring_ctx	*ctx;
901};
902
903struct io_defer_entry {
904	struct list_head	list;
905	struct io_kiocb		*req;
906	u32			seq;
907};
908
909struct io_op_def {
910	/* needs req->file assigned */
911	unsigned		needs_file : 1;
912	/* hash wq insertion if file is a regular file */
913	unsigned		hash_reg_file : 1;
914	/* unbound wq insertion if file is a non-regular file */
915	unsigned		unbound_nonreg_file : 1;
916	/* opcode is not supported by this kernel */
917	unsigned		not_supported : 1;
918	/* set if opcode supports polled "wait" */
919	unsigned		pollin : 1;
920	unsigned		pollout : 1;
921	/* op supports buffer selection */
922	unsigned		buffer_select : 1;
923	/* do async prep if the request is going to be punted */
924	unsigned		needs_async_setup : 1;
925	/* should use block plugging */
926	unsigned		plug : 1;
927	/* size of async data needed, if any */
928	unsigned short		async_size;
929};
930
931static const struct io_op_def io_op_defs[] = {
932	[IORING_OP_NOP] = {},
933	[IORING_OP_READV] = {
934		.needs_file		= 1,
935		.unbound_nonreg_file	= 1,
936		.pollin			= 1,
937		.buffer_select		= 1,
938		.needs_async_setup	= 1,
939		.plug			= 1,
940		.async_size		= sizeof(struct io_async_rw),
941	},
942	[IORING_OP_WRITEV] = {
943		.needs_file		= 1,
944		.hash_reg_file		= 1,
945		.unbound_nonreg_file	= 1,
946		.pollout		= 1,
947		.needs_async_setup	= 1,
948		.plug			= 1,
949		.async_size		= sizeof(struct io_async_rw),
950	},
951	[IORING_OP_FSYNC] = {
952		.needs_file		= 1,
953	},
954	[IORING_OP_READ_FIXED] = {
955		.needs_file		= 1,
956		.unbound_nonreg_file	= 1,
957		.pollin			= 1,
958		.plug			= 1,
959		.async_size		= sizeof(struct io_async_rw),
960	},
961	[IORING_OP_WRITE_FIXED] = {
962		.needs_file		= 1,
963		.hash_reg_file		= 1,
964		.unbound_nonreg_file	= 1,
965		.pollout		= 1,
966		.plug			= 1,
967		.async_size		= sizeof(struct io_async_rw),
968	},
969	[IORING_OP_POLL_ADD] = {
970		.needs_file		= 1,
971		.unbound_nonreg_file	= 1,
972	},
973	[IORING_OP_POLL_REMOVE] = {},
974	[IORING_OP_SYNC_FILE_RANGE] = {
975		.needs_file		= 1,
976	},
977	[IORING_OP_SENDMSG] = {
978		.needs_file		= 1,
979		.unbound_nonreg_file	= 1,
980		.pollout		= 1,
981		.needs_async_setup	= 1,
982		.async_size		= sizeof(struct io_async_msghdr),
983	},
984	[IORING_OP_RECVMSG] = {
985		.needs_file		= 1,
986		.unbound_nonreg_file	= 1,
987		.pollin			= 1,
988		.buffer_select		= 1,
989		.needs_async_setup	= 1,
990		.async_size		= sizeof(struct io_async_msghdr),
991	},
992	[IORING_OP_TIMEOUT] = {
993		.async_size		= sizeof(struct io_timeout_data),
994	},
995	[IORING_OP_TIMEOUT_REMOVE] = {
996		/* used by timeout updates' prep() */
997	},
998	[IORING_OP_ACCEPT] = {
999		.needs_file		= 1,
1000		.unbound_nonreg_file	= 1,
1001		.pollin			= 1,
1002	},
1003	[IORING_OP_ASYNC_CANCEL] = {},
1004	[IORING_OP_LINK_TIMEOUT] = {
1005		.async_size		= sizeof(struct io_timeout_data),
1006	},
1007	[IORING_OP_CONNECT] = {
1008		.needs_file		= 1,
1009		.unbound_nonreg_file	= 1,
1010		.pollout		= 1,
1011		.needs_async_setup	= 1,
1012		.async_size		= sizeof(struct io_async_connect),
1013	},
1014	[IORING_OP_FALLOCATE] = {
1015		.needs_file		= 1,
1016	},
1017	[IORING_OP_OPENAT] = {},
1018	[IORING_OP_CLOSE] = {},
1019	[IORING_OP_FILES_UPDATE] = {},
1020	[IORING_OP_STATX] = {},
1021	[IORING_OP_READ] = {
1022		.needs_file		= 1,
1023		.unbound_nonreg_file	= 1,
1024		.pollin			= 1,
1025		.buffer_select		= 1,
1026		.plug			= 1,
1027		.async_size		= sizeof(struct io_async_rw),
1028	},
1029	[IORING_OP_WRITE] = {
1030		.needs_file		= 1,
1031		.hash_reg_file		= 1,
1032		.unbound_nonreg_file	= 1,
1033		.pollout		= 1,
1034		.plug			= 1,
1035		.async_size		= sizeof(struct io_async_rw),
1036	},
1037	[IORING_OP_FADVISE] = {
1038		.needs_file		= 1,
1039	},
1040	[IORING_OP_MADVISE] = {},
1041	[IORING_OP_SEND] = {
1042		.needs_file		= 1,
1043		.unbound_nonreg_file	= 1,
1044		.pollout		= 1,
1045	},
1046	[IORING_OP_RECV] = {
1047		.needs_file		= 1,
1048		.unbound_nonreg_file	= 1,
1049		.pollin			= 1,
1050		.buffer_select		= 1,
1051	},
1052	[IORING_OP_OPENAT2] = {
1053	},
1054	[IORING_OP_EPOLL_CTL] = {
1055		.unbound_nonreg_file	= 1,
1056	},
1057	[IORING_OP_SPLICE] = {
1058		.needs_file		= 1,
1059		.hash_reg_file		= 1,
1060		.unbound_nonreg_file	= 1,
1061	},
1062	[IORING_OP_PROVIDE_BUFFERS] = {},
1063	[IORING_OP_REMOVE_BUFFERS] = {},
1064	[IORING_OP_TEE] = {
1065		.needs_file		= 1,
1066		.hash_reg_file		= 1,
1067		.unbound_nonreg_file	= 1,
1068	},
1069	[IORING_OP_SHUTDOWN] = {
1070		.needs_file		= 1,
1071	},
1072	[IORING_OP_RENAMEAT] = {},
1073	[IORING_OP_UNLINKAT] = {},
1074};
1075
1076/* requests with any of those set should undergo io_disarm_next() */
1077#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
1078
1079static bool io_disarm_next(struct io_kiocb *req);
1080static void io_uring_del_tctx_node(unsigned long index);
1081static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
1082					 struct task_struct *task,
1083					 bool cancel_all);
1084static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
1085
1086static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags);
1087
1088static void io_put_req(struct io_kiocb *req);
1089static void io_put_req_deferred(struct io_kiocb *req);
1090static void io_dismantle_req(struct io_kiocb *req);
1091static void io_queue_linked_timeout(struct io_kiocb *req);
1092static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
1093				     struct io_uring_rsrc_update2 *up,
1094				     unsigned nr_args);
1095static void io_clean_op(struct io_kiocb *req);
1096static struct file *io_file_get(struct io_ring_ctx *ctx,
1097				struct io_kiocb *req, int fd, bool fixed,
1098				unsigned int issue_flags);
1099static void __io_queue_sqe(struct io_kiocb *req);
1100static void io_rsrc_put_work(struct work_struct *work);
1101
1102static void io_req_task_queue(struct io_kiocb *req);
1103static void io_submit_flush_completions(struct io_ring_ctx *ctx);
1104static int io_req_prep_async(struct io_kiocb *req);
1105
1106static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
1107				 unsigned int issue_flags, u32 slot_index);
1108static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
1109
1110static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
1111
1112static struct kmem_cache *req_cachep;
1113
1114static const struct file_operations io_uring_fops;
1115
1116struct sock *io_uring_get_socket(struct file *file)
1117{
1118#if defined(CONFIG_UNIX)
1119	if (file->f_op == &io_uring_fops) {
1120		struct io_ring_ctx *ctx = file->private_data;
1121
1122		return ctx->ring_sock->sk;
1123	}
1124#endif
1125	return NULL;
1126}
1127EXPORT_SYMBOL(io_uring_get_socket);
1128
1129static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
1130{
1131	if (!*locked) {
1132		mutex_lock(&ctx->uring_lock);
1133		*locked = true;
1134	}
1135}
1136
1137#define io_for_each_link(pos, head) \
1138	for (pos = (head); pos; pos = pos->link)
1139
1140/*
1141 * Shamelessly stolen from the mm implementation of page reference checking,
1142 * see commit f958d7b528b1 for details.
1143 */
1144#define req_ref_zero_or_close_to_overflow(req)	\
1145	((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u)
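/*
 * The "+ 127u <= 127u" trick folds two checks into one compare: it is true
 * for a refcount of exactly zero and for values within 127 of wrapping
 * around (i.e. one that has underflowed), false for any sane positive count.
 */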
1146
1147static inline bool req_ref_inc_not_zero(struct io_kiocb *req)
1148{
1149	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1150	return atomic_inc_not_zero(&req->refs);
1151}
1152
1153static inline bool req_ref_put_and_test(struct io_kiocb *req)
1154{
1155	if (likely(!(req->flags & REQ_F_REFCOUNT)))
1156		return true;
1157
1158	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1159	return atomic_dec_and_test(&req->refs);
1160}
1161
1162static inline void req_ref_get(struct io_kiocb *req)
1163{
1164	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));
1165	WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
1166	atomic_inc(&req->refs);
1167}
1168
1169static inline void __io_req_set_refcount(struct io_kiocb *req, int nr)
1170{
1171	if (!(req->flags & REQ_F_REFCOUNT)) {
1172		req->flags |= REQ_F_REFCOUNT;
1173		atomic_set(&req->refs, nr);
1174	}
1175}
1176
1177static inline void io_req_set_refcount(struct io_kiocb *req)
1178{
1179	__io_req_set_refcount(req, 1);
1180}
1181
1182static inline void io_req_set_rsrc_node(struct io_kiocb *req)
1183{
1184	struct io_ring_ctx *ctx = req->ctx;
1185
1186	if (!req->fixed_rsrc_refs) {
1187		req->fixed_rsrc_refs = &ctx->rsrc_node->refs;
1188		percpu_ref_get(req->fixed_rsrc_refs);
1189	}
1190}
1191
1192static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
1193{
1194	bool got = percpu_ref_tryget(ref);
1195
1196	/* already at zero, wait for ->release() */
1197	if (!got)
1198		wait_for_completion(compl);
1199	percpu_ref_resurrect(ref);
1200	if (got)
1201		percpu_ref_put(ref);
1202}
1203
1204static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
1205			  bool cancel_all)
1206	__must_hold(&req->ctx->timeout_lock)
1207{
1208	struct io_kiocb *req;
1209
1210	if (task && head->task != task)
1211		return false;
1212	if (cancel_all)
1213		return true;
1214
1215	io_for_each_link(req, head) {
1216		if (req->flags & REQ_F_INFLIGHT)
1217			return true;
1218	}
1219	return false;
1220}
1221
1222static bool io_match_linked(struct io_kiocb *head)
1223{
1224	struct io_kiocb *req;
1225
1226	io_for_each_link(req, head) {
1227		if (req->flags & REQ_F_INFLIGHT)
1228			return true;
1229	}
1230	return false;
1231}
1232
1233/*
1234 * As io_match_task() but protected against racing with linked timeouts.
1235 * User must not hold timeout_lock.
1236 */
1237static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
1238			       bool cancel_all)
1239{
1240	bool matched;
1241
1242	if (task && head->task != task)
1243		return false;
1244	if (cancel_all)
1245		return true;
1246
1247	if (head->flags & REQ_F_LINK_TIMEOUT) {
1248		struct io_ring_ctx *ctx = head->ctx;
1249
1250		/* protect against races with linked timeouts */
1251		spin_lock_irq(&ctx->timeout_lock);
1252		matched = io_match_linked(head);
1253		spin_unlock_irq(&ctx->timeout_lock);
1254	} else {
1255		matched = io_match_linked(head);
1256	}
1257	return matched;
1258}
1259
1260static inline void req_set_fail(struct io_kiocb *req)
1261{
1262	req->flags |= REQ_F_FAIL;
1263}
1264
1265static inline void req_fail_link_node(struct io_kiocb *req, int res)
1266{
1267	req_set_fail(req);
1268	req->result = res;
1269}
1270
1271static void io_ring_ctx_ref_free(struct percpu_ref *ref)
1272{
1273	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
1274
1275	complete(&ctx->ref_comp);
1276}
1277
1278static inline bool io_is_timeout_noseq(struct io_kiocb *req)
1279{
1280	return !req->timeout.off;
1281}
1282
1283static void io_fallback_req_func(struct work_struct *work)
1284{
1285	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
1286						fallback_work.work);
1287	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
1288	struct io_kiocb *req, *tmp;
1289	bool locked = false;
1290
1291	percpu_ref_get(&ctx->refs);
1292	llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node)
1293		req->io_task_work.func(req, &locked);
1294
1295	if (locked) {
1296		if (ctx->submit_state.compl_nr)
1297			io_submit_flush_completions(ctx);
1298		mutex_unlock(&ctx->uring_lock);
1299	}
1300	percpu_ref_put(&ctx->refs);
1301
1302}
1303
1304static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
1305{
1306	struct io_ring_ctx *ctx;
1307	int hash_bits;
1308
1309	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1310	if (!ctx)
1311		return NULL;
1312
1313	/*
1314	 * Use 5 bits less than ilog2 of the max cq entries; that should give us
1315	 * around 32 entries per hash list if totally full and uniformly spread.
1316	 */
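	/* e.g. cq_entries == 4096 -> 7 hash bits -> 128 buckets of ~32 entries */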
1317	hash_bits = ilog2(p->cq_entries);
1318	hash_bits -= 5;
1319	if (hash_bits <= 0)
1320		hash_bits = 1;
1321	ctx->cancel_hash_bits = hash_bits;
1322	ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
1323					GFP_KERNEL);
1324	if (!ctx->cancel_hash)
1325		goto err;
1326	__hash_init(ctx->cancel_hash, 1U << hash_bits);
1327
1328	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
1329	if (!ctx->dummy_ubuf)
1330		goto err;
1331	/* set an invalid range, so io_import_fixed() fails when it hits it */
1332	ctx->dummy_ubuf->ubuf = -1UL;
1333
1334	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
1335			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
1336		goto err;
1337
1338	ctx->flags = p->flags;
1339	init_waitqueue_head(&ctx->sqo_sq_wait);
1340	INIT_LIST_HEAD(&ctx->sqd_list);
1341	init_waitqueue_head(&ctx->poll_wait);
1342	INIT_LIST_HEAD(&ctx->cq_overflow_list);
1343	init_completion(&ctx->ref_comp);
1344	xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
1345	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
1346	mutex_init(&ctx->uring_lock);
1347	init_waitqueue_head(&ctx->cq_wait);
1348	spin_lock_init(&ctx->completion_lock);
1349	spin_lock_init(&ctx->timeout_lock);
1350	INIT_LIST_HEAD(&ctx->iopoll_list);
1351	INIT_LIST_HEAD(&ctx->defer_list);
1352	INIT_LIST_HEAD(&ctx->timeout_list);
1353	INIT_LIST_HEAD(&ctx->ltimeout_list);
1354	spin_lock_init(&ctx->rsrc_ref_lock);
1355	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
1356	INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work);
1357	init_llist_head(&ctx->rsrc_put_llist);
1358	INIT_LIST_HEAD(&ctx->tctx_list);
1359	INIT_LIST_HEAD(&ctx->submit_state.free_list);
1360	INIT_LIST_HEAD(&ctx->locked_free_list);
1361	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
1362	return ctx;
1363err:
1364	kfree(ctx->dummy_ubuf);
1365	kfree(ctx->cancel_hash);
1366	kfree(ctx);
1367	return NULL;
1368}
1369
1370static void io_account_cq_overflow(struct io_ring_ctx *ctx)
1371{
1372	struct io_rings *r = ctx->rings;
1373
1374	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
1375	ctx->cq_extra--;
1376}
1377
1378static bool req_need_defer(struct io_kiocb *req, u32 seq)
1379{
1380	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
1381		struct io_ring_ctx *ctx = req->ctx;
1382
1383		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
1384	}
1385
1386	return false;
1387}
1388
1389#define FFS_ASYNC_READ		0x1UL
1390#define FFS_ASYNC_WRITE		0x2UL
1391#ifdef CONFIG_64BIT
1392#define FFS_ISREG		0x4UL
1393#else
1394#define FFS_ISREG		0x0UL
1395#endif
1396#define FFS_MASK		~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG)
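/*
 * Sketch of how these bits are packed into io_fixed_file.file_ptr: struct
 * file pointers are at least 8-byte aligned on 64-bit, so the low bits are
 * free for flags. The helper below is hypothetical, for illustration only:
 *
 *	static inline struct file *ffs_file(unsigned long file_ptr)
 *	{
 *		return (struct file *)(file_ptr & FFS_MASK);
 *	}
 */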
1397
1398static inline bool io_req_ffs_set(struct io_kiocb *req)
1399{
1400	return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE);
1401}
1402
1403static void io_req_track_inflight(struct io_kiocb *req)
1404{
1405	if (!(req->flags & REQ_F_INFLIGHT)) {
1406		req->flags |= REQ_F_INFLIGHT;
1407		atomic_inc(&req->task->io_uring->inflight_tracked);
1408	}
1409}
1410
1411static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
1412{
1413	if (WARN_ON_ONCE(!req->link))
1414		return NULL;
1415
1416	req->flags &= ~REQ_F_ARM_LTIMEOUT;
1417	req->flags |= REQ_F_LINK_TIMEOUT;
1418
1419	/* linked timeouts should have two refs once prep'ed */
1420	io_req_set_refcount(req);
1421	__io_req_set_refcount(req->link, 2);
1422	return req->link;
1423}
1424
1425static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
1426{
1427	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
1428		return NULL;
1429	return __io_prep_linked_timeout(req);
1430}
1431
1432static void io_prep_async_work(struct io_kiocb *req)
1433{
1434	const struct io_op_def *def = &io_op_defs[req->opcode];
1435	struct io_ring_ctx *ctx = req->ctx;
1436
1437	if (!(req->flags & REQ_F_CREDS)) {
1438		req->flags |= REQ_F_CREDS;
1439		req->creds = get_current_cred();
1440	}
1441
1442	req->work.list.next = NULL;
1443	req->work.flags = 0;
1444	if (req->flags & REQ_F_FORCE_ASYNC)
1445		req->work.flags |= IO_WQ_WORK_CONCURRENT;
1446
1447	if (req->flags & REQ_F_ISREG) {
1448		if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL))
1449			io_wq_hash_work(&req->work, file_inode(req->file));
1450	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
1451		if (def->unbound_nonreg_file)
1452			req->work.flags |= IO_WQ_WORK_UNBOUND;
1453	}
1454}
1455
1456static void io_prep_async_link(struct io_kiocb *req)
1457{
1458	struct io_kiocb *cur;
1459
1460	if (req->flags & REQ_F_LINK_TIMEOUT) {
1461		struct io_ring_ctx *ctx = req->ctx;
1462
1463		spin_lock_irq(&ctx->timeout_lock);
1464		io_for_each_link(cur, req)
1465			io_prep_async_work(cur);
1466		spin_unlock_irq(&ctx->timeout_lock);
1467	} else {
1468		io_for_each_link(cur, req)
1469			io_prep_async_work(cur);
1470	}
1471}
1472
1473static void io_queue_async_work(struct io_kiocb *req, bool *locked)
1474{
1475	struct io_ring_ctx *ctx = req->ctx;
1476	struct io_kiocb *link = io_prep_linked_timeout(req);
1477	struct io_uring_task *tctx = req->task->io_uring;
1478
1479	/* must not take the lock, NULL it as a precaution */
1480	locked = NULL;
1481
1482	BUG_ON(!tctx);
1483	BUG_ON(!tctx->io_wq);
1484
1485	/* init ->work of the whole link before punting */
1486	io_prep_async_link(req);
1487
1488	/*
1489	 * Not expected to happen, but if we do have a bug where this _can_
1490	 * happen, catch it here and ensure the request is marked as
1491	 * canceled. That will make io-wq go through the usual work cancel
1492	 * procedure rather than attempt to run this request (or create a new
1493	 * worker for it).
1494	 */
1495	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
1496		req->work.flags |= IO_WQ_WORK_CANCEL;
1497
1498	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
1499					&req->work, req->flags);
1500	io_wq_enqueue(tctx->io_wq, &req->work);
1501	if (link)
1502		io_queue_linked_timeout(link);
1503}
1504
1505static void io_kill_timeout(struct io_kiocb *req, int status)
1506	__must_hold(&req->ctx->completion_lock)
1507	__must_hold(&req->ctx->timeout_lock)
1508{
1509	struct io_timeout_data *io = req->async_data;
1510
1511	if (hrtimer_try_to_cancel(&io->timer) != -1) {
1512		if (status)
1513			req_set_fail(req);
1514		atomic_set(&req->ctx->cq_timeouts,
1515			atomic_read(&req->ctx->cq_timeouts) + 1);
1516		list_del_init(&req->timeout.list);
1517		io_fill_cqe_req(req, status, 0);
1518		io_put_req_deferred(req);
1519	}
1520}
1521
1522static void io_queue_deferred(struct io_ring_ctx *ctx)
1523{
1524	lockdep_assert_held(&ctx->completion_lock);
1525
1526	while (!list_empty(&ctx->defer_list)) {
1527		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
1528						struct io_defer_entry, list);
1529
1530		if (req_need_defer(de->req, de->seq))
1531			break;
1532		list_del_init(&de->list);
1533		io_req_task_queue(de->req);
1534		kfree(de);
1535	}
1536}
1537
1538static void io_flush_timeouts(struct io_ring_ctx *ctx)
1539	__must_hold(&ctx->completion_lock)
1540{
1541	u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
1542	struct io_kiocb *req, *tmp;
1543
1544	spin_lock_irq(&ctx->timeout_lock);
1545	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
1546		u32 events_needed, events_got;
1547
1548		if (io_is_timeout_noseq(req))
1549			break;
1550
1551		/*
1552		 * Since seq can easily wrap around over time, subtract
1553		 * the last seq at which timeouts were flushed before comparing.
1554		 * Assuming not more than 2^31-1 events have happened since,
1555		 * these subtractions won't have wrapped, so we can check if
1556		 * target is in [last_seq, current_seq] by comparing the two.
1557		 */
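		/*
		 * Worked example (hypothetical numbers): with
		 * cq_last_tm_flush == 0xfffffff0, target_seq == 0x10 and
		 * seq == 0x20, events_needed == 0x20 and events_got == 0x30,
		 * so this timeout is flushed even though both counters wrapped.
		 */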
1558		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
1559		events_got = seq - ctx->cq_last_tm_flush;
1560		if (events_got < events_needed)
1561			break;
1562
1563		io_kill_timeout(req, 0);
1564	}
1565	ctx->cq_last_tm_flush = seq;
1566	spin_unlock_irq(&ctx->timeout_lock);
1567}
1568
1569static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
1570{
1571	if (ctx->off_timeout_used)
1572		io_flush_timeouts(ctx);
1573	if (ctx->drain_active)
1574		io_queue_deferred(ctx);
1575}
1576
1577static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
1578{
1579	return ctx->off_timeout_used || ctx->drain_active;
1580}
1581
1582static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
1583{
1584	/* order cqe stores with ring update */
1585	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
1586}
1587
1588static inline void io_commit_cqring(struct io_ring_ctx *ctx)
1589{
1590	if (unlikely(io_commit_needs_flush(ctx)))
1591		__io_commit_cqring_flush(ctx);
1592	__io_commit_cqring(ctx);
1593}
1594
1595static inline bool io_sqring_full(struct io_ring_ctx *ctx)
1596{
1597	struct io_rings *r = ctx->rings;
1598
1599	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
1600}
1601
1602static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
1603{
1604	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
1605}
1606
1607static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
1608{
1609	struct io_rings *rings = ctx->rings;
1610	unsigned tail, mask = ctx->cq_entries - 1;
1611
1612	/*
1613	 * writes to the cq entry need to come after reading head; the
1614	 * control dependency is enough as we're using WRITE_ONCE to
1615	 * fill the cq entry
1616	 */
1617	if (__io_cqring_events(ctx) == ctx->cq_entries)
1618		return NULL;
1619
1620	tail = ctx->cached_cq_tail++;
1621	return &rings->cqes[tail & mask];
1622}
1623
1624static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
1625{
1626	if (likely(!ctx->cq_ev_fd))
1627		return false;
1628	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
1629		return false;
1630	return !ctx->eventfd_async || io_wq_current_is_worker();
1631}
1632
1633/*
1634 * This should only get called when at least one event has been posted.
1635 * Some applications rely on the eventfd notification count only changing
1636 * IFF a new CQE has been added to the CQ ring. There's no dependency on
1637 * a 1:1 relationship between how many times this function is called (and
1638 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
1639 */
1640static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
1641{
1642	/*
1643	 * wake_up_all() may seem excessive, but io_wake_function() and
1644	 * io_should_wake() handle the termination of the loop and only
1645	 * wake as many waiters as we need to.
1646	 */
1647	if (wq_has_sleeper(&ctx->cq_wait))
1648		__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
1649				poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
1650	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
1651		wake_up(&ctx->sq_data->wait);
1652	if (io_should_trigger_evfd(ctx))
1653		eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
1654	if (waitqueue_active(&ctx->poll_wait))
1655		__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
1656				poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
1657}
1658
1659static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
1660{
1661	/* see waitqueue_active() comment */
1662	smp_mb();
1663
1664	if (ctx->flags & IORING_SETUP_SQPOLL) {
1665		if (waitqueue_active(&ctx->cq_wait))
1666			__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
1667				  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
1668	}
1669	if (io_should_trigger_evfd(ctx))
1670		eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
1671	if (waitqueue_active(&ctx->poll_wait))
1672		__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
1673				poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
1674}
1675
1676/* Returns true if there are no backlogged entries after the flush */
1677static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
1678{
1679	bool all_flushed, posted;
1680
1681	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
1682		return false;
1683
1684	posted = false;
1685	spin_lock(&ctx->completion_lock);
1686	while (!list_empty(&ctx->cq_overflow_list)) {
1687		struct io_uring_cqe *cqe = io_get_cqe(ctx);
1688		struct io_overflow_cqe *ocqe;
1689
1690		if (!cqe && !force)
1691			break;
1692		ocqe = list_first_entry(&ctx->cq_overflow_list,
1693					struct io_overflow_cqe, list);
1694		if (cqe)
1695			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
1696		else
1697			io_account_cq_overflow(ctx);
1698
1699		posted = true;
1700		list_del(&ocqe->list);
1701		kfree(ocqe);
1702	}
1703
1704	all_flushed = list_empty(&ctx->cq_overflow_list);
1705	if (all_flushed) {
1706		clear_bit(0, &ctx->check_cq_overflow);
1707		WRITE_ONCE(ctx->rings->sq_flags,
1708			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
1709	}
1710
1711	if (posted)
1712		io_commit_cqring(ctx);
1713	spin_unlock(&ctx->completion_lock);
1714	if (posted)
1715		io_cqring_ev_posted(ctx);
1716	return all_flushed;
1717}
1718
1719static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
1720{
1721	bool ret = true;
1722
1723	if (test_bit(0, &ctx->check_cq_overflow)) {
1724		/* iopoll syncs against uring_lock, not completion_lock */
1725		if (ctx->flags & IORING_SETUP_IOPOLL)
1726			mutex_lock(&ctx->uring_lock);
1727		ret = __io_cqring_overflow_flush(ctx, false);
1728		if (ctx->flags & IORING_SETUP_IOPOLL)
1729			mutex_unlock(&ctx->uring_lock);
1730	}
1731
1732	return ret;
1733}
1734
1735/* must be called shortly after putting a request */
1736static inline void io_put_task(struct task_struct *task, int nr)
1737{
1738	struct io_uring_task *tctx = task->io_uring;
1739
1740	if (likely(task == current)) {
1741		tctx->cached_refs += nr;
1742	} else {
1743		percpu_counter_sub(&tctx->inflight, nr);
1744		if (unlikely(atomic_read(&tctx->in_idle)))
1745			wake_up(&tctx->wait);
1746		put_task_struct_many(task, nr);
1747	}
1748}
1749
1750static void io_task_refs_refill(struct io_uring_task *tctx)
1751{
1752	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
1753
1754	percpu_counter_add(&tctx->inflight, refill);
1755	refcount_add(refill, &current->usage);
1756	tctx->cached_refs += refill;
1757}
1758
1759static inline void io_get_task_refs(int nr)
1760{
1761	struct io_uring_task *tctx = current->io_uring;
1762
1763	tctx->cached_refs -= nr;
1764	if (unlikely(tctx->cached_refs < 0))
1765		io_task_refs_refill(tctx);
1766}
1767
1768static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
1769{
1770	struct io_uring_task *tctx = task->io_uring;
1771	unsigned int refs = tctx->cached_refs;
1772
1773	if (refs) {
1774		tctx->cached_refs = 0;
1775		percpu_counter_sub(&tctx->inflight, refs);
1776		put_task_struct_many(task, refs);
1777	}
1778}
1779
1780static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
1781				     s32 res, u32 cflags)
1782{
1783	struct io_overflow_cqe *ocqe;
1784
1785	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
1786	if (!ocqe) {
1787		/*
1788		 * If we're in ring overflow flush mode, or in task cancel mode,
1789		 * or cannot allocate an overflow entry, then we need to drop it
1790		 * on the floor.
1791		 */
1792		io_account_cq_overflow(ctx);
1793		return false;
1794	}
1795	if (list_empty(&ctx->cq_overflow_list)) {
1796		set_bit(0, &ctx->check_cq_overflow);
1797		WRITE_ONCE(ctx->rings->sq_flags,
1798			   ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
1799
1800	}
1801	ocqe->cqe.user_data = user_data;
1802	ocqe->cqe.res = res;
1803	ocqe->cqe.flags = cflags;
1804	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
1805	return true;
1806}
1807
1808static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
1809				 s32 res, u32 cflags)
1810{
1811	struct io_uring_cqe *cqe;
1812
1813	trace_io_uring_complete(ctx, user_data, res, cflags);
1814
1815	/*
1816	 * If we can't get a cq entry, userspace overflowed the
1817	 * submission (by quite a lot). Increment the overflow count in
1818	 * the ring.
1819	 */
1820	cqe = io_get_cqe(ctx);
1821	if (likely(cqe)) {
1822		WRITE_ONCE(cqe->user_data, user_data);
1823		WRITE_ONCE(cqe->res, res);
1824		WRITE_ONCE(cqe->flags, cflags);
1825		return true;
1826	}
1827	return io_cqring_event_overflow(ctx, user_data, res, cflags);
1828}
1829
1830static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
1831{
1832	__io_fill_cqe(req->ctx, req->user_data, res, cflags);
1833}
1834
1835static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
1836				     s32 res, u32 cflags)
1837{
1838	ctx->cq_extra++;
1839	return __io_fill_cqe(ctx, user_data, res, cflags);
1840}
1841
1842static void io_req_complete_post(struct io_kiocb *req, s32 res,
1843				 u32 cflags)
1844{
1845	struct io_ring_ctx *ctx = req->ctx;
1846
1847	spin_lock(&ctx->completion_lock);
1848	__io_fill_cqe(ctx, req->user_data, res, cflags);
1849	/*
1850	 * If we're the last reference to this request, add to our locked
1851	 * free_list cache.
1852	 */
1853	if (req_ref_put_and_test(req)) {
1854		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
1855			if (req->flags & IO_DISARM_MASK)
1856				io_disarm_next(req);
1857			if (req->link) {
1858				io_req_task_queue(req->link);
1859				req->link = NULL;
1860			}
1861		}
1862		io_dismantle_req(req);
1863		io_put_task(req->task, 1);
1864		list_add(&req->inflight_entry, &ctx->locked_free_list);
1865		ctx->locked_free_nr++;
1866	} else {
1867		if (!percpu_ref_tryget(&ctx->refs))
1868			req = NULL;
1869	}
1870	io_commit_cqring(ctx);
1871	spin_unlock(&ctx->completion_lock);
1872
1873	if (req) {
1874		io_cqring_ev_posted(ctx);
1875		percpu_ref_put(&ctx->refs);
1876	}
1877}
1878
1879static inline bool io_req_needs_clean(struct io_kiocb *req)
1880{
1881	return req->flags & IO_REQ_CLEAN_FLAGS;
1882}
1883
1884static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
1885					 u32 cflags)
1886{
1887	if (io_req_needs_clean(req))
1888		io_clean_op(req);
1889	req->result = res;
1890	req->compl.cflags = cflags;
1891	req->flags |= REQ_F_COMPLETE_INLINE;
1892}
1893
1894static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
1895				     s32 res, u32 cflags)
1896{
1897	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
1898		io_req_complete_state(req, res, cflags);
1899	else
1900		io_req_complete_post(req, res, cflags);
1901}
1902
1903static inline void io_req_complete(struct io_kiocb *req, s32 res)
1904{
1905	__io_req_complete(req, 0, res, 0);
1906}
1907
1908static void io_req_complete_failed(struct io_kiocb *req, s32 res)
1909{
1910	req_set_fail(req);
1911	io_req_complete_post(req, res, 0);
1912}
1913
1914static void io_req_complete_fail_submit(struct io_kiocb *req)
1915{
1916	/*
1917	 * We're not going to submit; fail them all. For that, replace hardlinks
1918	 * with normal links. An extra REQ_F_LINK is tolerated.
1919	 */
1920	req->flags &= ~REQ_F_HARDLINK;
1921	req->flags |= REQ_F_LINK;
1922	io_req_complete_failed(req, req->result);
1923}
1924
1925/*
1926 * Don't initialise the fields below on every allocation, but do that in
1927 * advance and keep them valid across allocations.
1928 */
1929static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
1930{
1931	req->ctx = ctx;
1932	req->link = NULL;
1933	req->async_data = NULL;
1934	/* not necessary, but safer to zero */
1935	req->result = 0;
1936}
1937
1938static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
1939					struct io_submit_state *state)
1940{
1941	spin_lock(&ctx->completion_lock);
1942	list_splice_init(&ctx->locked_free_list, &state->free_list);
1943	ctx->locked_free_nr = 0;
1944	spin_unlock(&ctx->completion_lock);
1945}
1946
1947/* Returns true IFF there are requests in the cache */
1948static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
1949{
1950	struct io_submit_state *state = &ctx->submit_state;
1951	int nr;
1952
1953	/*
1954	 * If we have more than a batch's worth of requests in our IRQ side
1955	 * locked cache, grab the lock and move them over to our submission
1956	 * side cache.
1957	 */
1958	if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH)
1959		io_flush_cached_locked_reqs(ctx, state);
1960
1961	nr = state->free_reqs;
1962	while (!list_empty(&state->free_list)) {
1963		struct io_kiocb *req = list_first_entry(&state->free_list,
1964					struct io_kiocb, inflight_entry);
1965
1966		list_del(&req->inflight_entry);
1967		state->reqs[nr++] = req;
1968		if (nr == ARRAY_SIZE(state->reqs))
1969			break;
1970	}
1971
1972	state->free_reqs = nr;
1973	return nr != 0;
1974}
1975
1976/*
1977 * A request might get retired back into the request caches even before opcode
1978 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
1979 * Because of that, io_alloc_req() should be called only under ->uring_lock
1980 * and with extra caution not to get a request that is still being worked on.
1981 */
1982static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
1983	__must_hold(&ctx->uring_lock)
1984{
1985	struct io_submit_state *state = &ctx->submit_state;
1986	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
1987	int ret, i;
1988
1989	BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH);
1990
1991	if (likely(state->free_reqs || io_flush_cached_reqs(ctx)))
1992		goto got_req;
1993
1994	ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH,
1995				    state->reqs);
1996
1997	/*
1998	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
1999	 * retry single alloc to be on the safe side.
2000	 */
2001	if (unlikely(ret <= 0)) {
2002		state->reqs[0] = kmem_cache_alloc(req_cachep, gfp);
2003		if (!state->reqs[0])
2004			return NULL;
2005		ret = 1;
2006	}
2007
2008	for (i = 0; i < ret; i++)
2009		io_preinit_req(state->reqs[i], ctx);
2010	state->free_reqs = ret;
2011got_req:
2012	state->free_reqs--;
2013	return state->reqs[state->free_reqs];
2014}
2015
2016static inline void io_put_file(struct file *file)
2017{
2018	if (file)
2019		fput(file);
2020}
2021
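/*
 * Release the resources a request holds: per-opcode cleanup, the file
 * reference (unless it's a fixed file), the fixed rsrc node reference, and
 * any allocated async data.
 */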
2022static void io_dismantle_req(struct io_kiocb *req)
2023{
2024	unsigned int flags = req->flags;
2025
2026	if (io_req_needs_clean(req))
2027		io_clean_op(req);
2028	if (!(flags & REQ_F_FIXED_FILE))
2029		io_put_file(req->file);
2030	if (req->fixed_rsrc_refs)
2031		percpu_ref_put(req->fixed_rsrc_refs);
2032	if (req->async_data) {
2033		kfree(req->async_data);
2034		req->async_data = NULL;
2035	}
2036}
2037
2038static void __io_free_req(struct io_kiocb *req)
2039{
2040	struct io_ring_ctx *ctx = req->ctx;
2041
2042	io_dismantle_req(req);
2043	io_put_task(req->task, 1);
2044
2045	spin_lock(&ctx->completion_lock);
2046	list_add(&req->inflight_entry, &ctx->locked_free_list);
2047	ctx->locked_free_nr++;
2048	spin_unlock(&ctx->completion_lock);
2049
2050	percpu_ref_put(&ctx->refs);
2051}
2052
2053static inline void io_remove_next_linked(struct io_kiocb *req)
2054{
2055	struct io_kiocb *nxt = req->link;
2056
2057	req->link = nxt->link;
2058	nxt->link = NULL;
2059}
2060
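/*
 * If the request has a linked timeout that hasn't expired yet, try to cancel
 * it; on success post -ECANCELED for the timeout and return true.
 */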
2061static bool io_kill_linked_timeout(struct io_kiocb *req)
2062	__must_hold(&req->ctx->completion_lock)
2063	__must_hold(&req->ctx->timeout_lock)
2064{
2065	struct io_kiocb *link = req->link;
2066
2067	if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2068		struct io_timeout_data *io = link->async_data;
2069
2070		io_remove_next_linked(req);
2071		link->timeout.head = NULL;
2072		if (hrtimer_try_to_cancel(&io->timer) != -1) {
2073			list_del(&link->timeout.list);
2074			io_fill_cqe_req(link, -ECANCELED, 0);
2075			io_put_req_deferred(link);
2076			return true;
2077		}
2078	}
2079	return false;
2080}
2081
2082static void io_fail_links(struct io_kiocb *req)
2083	__must_hold(&req->ctx->completion_lock)
2084{
2085	struct io_kiocb *nxt, *link = req->link;
2086
2087	req->link = NULL;
2088	while (link) {
2089		long res = -ECANCELED;
2090
2091		if (link->flags & REQ_F_FAIL)
2092			res = link->result;
2093
2094		nxt = link->link;
2095		link->link = NULL;
2096
2097		trace_io_uring_fail_link(req, link);
2098		io_fill_cqe_req(link, res, 0);
2099		io_put_req_deferred(link);
2100		link = nxt;
2101	}
2102}
2103
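/*
 * Disarm any armed or linked timeout on @req, and fail its link chain if the
 * request itself failed and isn't a hardlink. Returns true if any CQEs were
 * posted and the caller needs to commit and signal the CQ ring.
 */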
2104static bool io_disarm_next(struct io_kiocb *req)
2105	__must_hold(&req->ctx->completion_lock)
2106{
2107	bool posted = false;
2108
2109	if (req->flags & REQ_F_ARM_LTIMEOUT) {
2110		struct io_kiocb *link = req->link;
2111
2112		req->flags &= ~REQ_F_ARM_LTIMEOUT;
2113		if (link && link->opcode == IORING_OP_LINK_TIMEOUT) {
2114			io_remove_next_linked(req);
2115			io_fill_cqe_req(link, -ECANCELED, 0);
2116			io_put_req_deferred(link);
2117			posted = true;
2118		}
2119	} else if (req->flags & REQ_F_LINK_TIMEOUT) {
2120		struct io_ring_ctx *ctx = req->ctx;
2121
2122		spin_lock_irq(&ctx->timeout_lock);
2123		posted = io_kill_linked_timeout(req);
2124		spin_unlock_irq(&ctx->timeout_lock);
2125	}
2126	if (unlikely((req->flags & REQ_F_FAIL) &&
2127		     !(req->flags & REQ_F_HARDLINK))) {
2128		posted |= (req->link != NULL);
2129		io_fail_links(req);
2130	}
2131	return posted;
2132}
2133
2134static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
2135{
2136	struct io_kiocb *nxt;
2137
2138	/*
2139	 * If LINK is set, we have dependent requests in this chain. If we
2140	 * didn't fail this request, queue the first one up, moving any other
2141	 * dependencies to the next request. In case of failure, fail the rest
2142	 * of the chain.
2143	 */
2144	if (req->flags & IO_DISARM_MASK) {
2145		struct io_ring_ctx *ctx = req->ctx;
2146		bool posted;
2147
2148		spin_lock(&ctx->completion_lock);
2149		posted = io_disarm_next(req);
2150		if (posted)
2151			io_commit_cqring(req->ctx);
2152		spin_unlock(&ctx->completion_lock);
2153		if (posted)
2154			io_cqring_ev_posted(ctx);
2155	}
2156	nxt = req->link;
2157	req->link = NULL;
2158	return nxt;
2159}
2160
2161static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
2162{
2163	if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK))))
2164		return NULL;
2165	return __io_req_find_next(req);
2166}
2167
2168static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
2169{
2170	if (!ctx)
2171		return;
2172	if (*locked) {
2173		if (ctx->submit_state.compl_nr)
2174			io_submit_flush_completions(ctx);
2175		mutex_unlock(&ctx->uring_lock);
2176		*locked = false;
2177	}
2178	percpu_ref_put(&ctx->refs);
2179}
2180
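/*
 * task_work handler for io_uring: drain the per-task work list and run each
 * request's callback. The ring lock is taken opportunistically so completions
 * can be batched per ctx, and is dropped and reacquired when the ctx changes
 * or we need to reschedule.
 */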
2181static void tctx_task_work(struct callback_head *cb)
2182{
2183	bool locked = false;
2184	struct io_ring_ctx *ctx = NULL;
2185	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
2186						  task_work);
2187
2188	while (1) {
2189		struct io_wq_work_node *node;
2190
2191		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
2192			io_submit_flush_completions(ctx);
2193
2194		spin_lock_irq(&tctx->task_lock);
2195		node = tctx->task_list.first;
2196		INIT_WQ_LIST(&tctx->task_list);
2197		if (!node)
2198			tctx->task_running = false;
2199		spin_unlock_irq(&tctx->task_lock);
2200		if (!node)
2201			break;
2202
2203		do {
2204			struct io_wq_work_node *next = node->next;
2205			struct io_kiocb *req = container_of(node, struct io_kiocb,
2206							    io_task_work.node);
2207
2208			if (req->ctx != ctx) {
2209				ctx_flush_and_put(ctx, &locked);
2210				ctx = req->ctx;
2211				/* if not contended, grab and improve batching */
2212				locked = mutex_trylock(&ctx->uring_lock);
2213				percpu_ref_get(&ctx->refs);
2214			}
2215			req->io_task_work.func(req, &locked);
2216			node = next;
2217			if (unlikely(need_resched())) {
2218				ctx_flush_and_put(ctx, &locked);
2219				ctx = NULL;
2220				cond_resched();
2221			}
2222		} while (node);
2223	}
2224
2225	ctx_flush_and_put(ctx, &locked);
2226
2227	/* relaxed read is enough as only the task itself sets ->in_idle */
2228	if (unlikely(atomic_read(&tctx->in_idle)))
2229		io_uring_drop_tctx_refs(current);
2230}
2231
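/*
 * Queue a request for execution via task_work on the task that submitted it.
 * If task_work_add() fails (the task is exiting), punt the queued entries to
 * the ctx fallback work instead.
 */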
2232static void io_req_task_work_add(struct io_kiocb *req)
2233{
2234	struct task_struct *tsk = req->task;
2235	struct io_uring_task *tctx = tsk->io_uring;
2236	enum task_work_notify_mode notify;
2237	struct io_wq_work_node *node;
2238	unsigned long flags;
2239	bool running;
2240
2241	WARN_ON_ONCE(!tctx);
2242
2243	spin_lock_irqsave(&tctx->task_lock, flags);
2244	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
2245	running = tctx->task_running;
2246	if (!running)
2247		tctx->task_running = true;
2248	spin_unlock_irqrestore(&tctx->task_lock, flags);
2249
2250	/* task_work already pending, we're done */
2251	if (running)
2252		return;
2253
2254	/*
2255	 * SQPOLL kernel thread doesn't need notification, just a wakeup. For
2256	 * all other cases, use TWA_SIGNAL unconditionally to ensure we're
2257	 * processing task_work. There's no reliable way to tell if TWA_RESUME
2258	 * will do the job.
2259	 */
2260	notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
2261	if (!task_work_add(tsk, &tctx->task_work, notify)) {
2262		wake_up_process(tsk);
2263		return;
2264	}
2265
2266	spin_lock_irqsave(&tctx->task_lock, flags);
2267	tctx->task_running = false;
2268	node = tctx->task_list.first;
2269	INIT_WQ_LIST(&tctx->task_list);
2270	spin_unlock_irqrestore(&tctx->task_lock, flags);
2271
2272	while (node) {
2273		req = container_of(node, struct io_kiocb, io_task_work.node);
2274		node = node->next;
2275		if (llist_add(&req->io_task_work.fallback_node,
2276			      &req->ctx->fallback_llist))
2277			schedule_delayed_work(&req->ctx->fallback_work, 1);
2278	}
2279}
2280
2281static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
2282{
2283	struct io_ring_ctx *ctx = req->ctx;
2284
2285	/* not needed for normal modes, but SQPOLL depends on it */
2286	io_tw_lock(ctx, locked);
2287	io_req_complete_failed(req, req->result);
2288}
2289
2290static void io_req_task_submit(struct io_kiocb *req, bool *locked)
2291{
2292	struct io_ring_ctx *ctx = req->ctx;
2293
2294	io_tw_lock(ctx, locked);
2295	/* req->task == current here, checking PF_EXITING is safe */
2296	if (likely(!(req->task->flags & PF_EXITING)))
2297		__io_queue_sqe(req);
2298	else
2299		io_req_complete_failed(req, -EFAULT);
2300}
2301
2302static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
2303{
2304	req->result = ret;
2305	req->io_task_work.func = io_req_task_cancel;
2306	io_req_task_work_add(req);
2307}
2308
2309static void io_req_task_queue(struct io_kiocb *req)
2310{
2311	req->io_task_work.func = io_req_task_submit;
2312	io_req_task_work_add(req);
2313}
2314
2315static void io_req_task_queue_reissue(struct io_kiocb *req)
2316{
2317	req->io_task_work.func = io_queue_async_work;
2318	io_req_task_work_add(req);
2319}
2320
2321static inline void io_queue_next(struct io_kiocb *req)
2322{
2323	struct io_kiocb *nxt = io_req_find_next(req);
2324
2325	if (nxt)
2326		io_req_task_queue(nxt);
2327}
2328
2329static void io_free_req(struct io_kiocb *req)
2330{
2331	io_queue_next(req);
2332	__io_free_req(req);
2333}
2334
2335static void io_free_req_work(struct io_kiocb *req, bool *locked)
2336{
2337	io_free_req(req);
2338}
2339
2340struct req_batch {
2341	struct task_struct	*task;
2342	int			task_refs;
2343	int			ctx_refs;
2344};
2345
2346static inline void io_init_req_batch(struct req_batch *rb)
2347{
2348	rb->task_refs = 0;
2349	rb->ctx_refs = 0;
2350	rb->task = NULL;
2351}
2352
2353static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
2354				     struct req_batch *rb)
2355{
2356	if (rb->ctx_refs)
2357		percpu_ref_put_many(&ctx->refs, rb->ctx_refs);
2358	if (rb->task)
2359		io_put_task(rb->task, rb->task_refs);
2360}
2361
2362static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req,
2363			      struct io_submit_state *state)
2364{
2365	io_queue_next(req);
2366	io_dismantle_req(req);
2367
2368	if (req->task != rb->task) {
2369		if (rb->task)
2370			io_put_task(rb->task, rb->task_refs);
2371		rb->task = req->task;
2372		rb->task_refs = 0;
2373	}
2374	rb->task_refs++;
2375	rb->ctx_refs++;
2376
2377	if (state->free_reqs != ARRAY_SIZE(state->reqs))
2378		state->reqs[state->free_reqs++] = req;
2379	else
2380		list_add(&req->inflight_entry, &state->free_list);
2381}
2382
2383static void io_submit_flush_completions(struct io_ring_ctx *ctx)
2384	__must_hold(&ctx->uring_lock)
2385{
2386	struct io_submit_state *state = &ctx->submit_state;
2387	int i, nr = state->compl_nr;
2388	struct req_batch rb;
2389
2390	spin_lock(&ctx->completion_lock);
2391	for (i = 0; i < nr; i++) {
2392		struct io_kiocb *req = state->compl_reqs[i];
2393
2394		__io_fill_cqe(ctx, req->user_data, req->result,
2395			      req->compl.cflags);
2396	}
2397	io_commit_cqring(ctx);
2398	spin_unlock(&ctx->completion_lock);
2399	io_cqring_ev_posted(ctx);
2400
2401	io_init_req_batch(&rb);
2402	for (i = 0; i < nr; i++) {
2403		struct io_kiocb *req = state->compl_reqs[i];
2404
2405		if (req_ref_put_and_test(req))
2406			io_req_free_batch(&rb, req, &ctx->submit_state);
2407	}
2408
2409	io_req_free_batch_finish(ctx, &rb);
2410	state->compl_nr = 0;
2411}
2412
2413/*
2414 * Drop reference to request, return next in chain (if there is one) if this
2415 * was the last reference to this request.
2416 */
2417static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
2418{
2419	struct io_kiocb *nxt = NULL;
2420
2421	if (req_ref_put_and_test(req)) {
2422		nxt = io_req_find_next(req);
2423		__io_free_req(req);
2424	}
2425	return nxt;
2426}
2427
2428static inline void io_put_req(struct io_kiocb *req)
2429{
2430	if (req_ref_put_and_test(req))
2431		io_free_req(req);
2432}
2433
2434static inline void io_put_req_deferred(struct io_kiocb *req)
2435{
2436	if (req_ref_put_and_test(req)) {
2437		req->io_task_work.func = io_free_req_work;
2438		io_req_task_work_add(req);
2439	}
2440}
2441
2442static unsigned io_cqring_events(struct io_ring_ctx *ctx)
2443{
2444	/* See comment at the top of this file */
2445	smp_rmb();
2446	return __io_cqring_events(ctx);
2447}
2448
2449static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
2450{
2451	struct io_rings *rings = ctx->rings;
2452
2453	/* make sure SQ entry isn't read before tail */
2454	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
2455}
2456
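/*
 * Release a selected buffer and return the CQE flags that tell userspace
 * which buffer ID was consumed.
 */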
2457static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
2458{
2459	unsigned int cflags;
2460
2461	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
2462	cflags |= IORING_CQE_F_BUFFER;
2463	req->flags &= ~REQ_F_BUFFER_SELECTED;
2464	kfree(kbuf);
2465	return cflags;
2466}
2467
2468static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
2469{
2470	struct io_buffer *kbuf;
2471
2472	if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
2473		return 0;
2474	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
2475	return io_put_kbuf(req, kbuf);
2476}
2477
2478static inline bool io_run_task_work(void)
2479{
2480	/*
2481	 * PF_IO_WORKER never returns to userspace, so check here if we have
2482	 * notify work that needs processing.
2483	 */
2484	if (current->flags & PF_IO_WORKER &&
2485	    test_thread_flag(TIF_NOTIFY_RESUME)) {
2486		__set_current_state(TASK_RUNNING);
2487		tracehook_notify_resume(NULL);
2488	}
2489	if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) {
2490		__set_current_state(TASK_RUNNING);
2491		tracehook_notify_signal();
2492		return true;
2493	}
2494
2495	return false;
2496}
2497
2498/*
2499 * Find and free completed poll iocbs
2500 */
2501static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
2502			       struct list_head *done)
2503{
2504	struct req_batch rb;
2505	struct io_kiocb *req;
2506
2507	/* order with ->result store in io_complete_rw_iopoll() */
2508	smp_rmb();
2509
2510	io_init_req_batch(&rb);
2511	while (!list_empty(done)) {
2512		struct io_uring_cqe *cqe;
2513		unsigned cflags;
2514
2515		req = list_first_entry(done, struct io_kiocb, inflight_entry);
2516		list_del(&req->inflight_entry);
2517		cflags = io_put_rw_kbuf(req);
2518		(*nr_events)++;
2519
2520		cqe = io_get_cqe(ctx);
2521		if (cqe) {
2522			WRITE_ONCE(cqe->user_data, req->user_data);
2523			WRITE_ONCE(cqe->res, req->result);
2524			WRITE_ONCE(cqe->flags, cflags);
2525		} else {
2526			spin_lock(&ctx->completion_lock);
2527			io_cqring_event_overflow(ctx, req->user_data,
2528							req->result, cflags);
2529			spin_unlock(&ctx->completion_lock);
2530		}
2531
2532		if (req_ref_put_and_test(req))
2533			io_req_free_batch(&rb, req, &ctx->submit_state);
2534	}
2535
2536	if (io_commit_needs_flush(ctx)) {
2537		spin_lock(&ctx->completion_lock);
2538		__io_commit_cqring_flush(ctx);
2539		spin_unlock(&ctx->completion_lock);
2540	}
2541	__io_commit_cqring(ctx);
2542	io_cqring_ev_posted_iopoll(ctx);
2543	io_req_free_batch_finish(ctx, &rb);
2544}
2545
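/*
 * Poll for completions on the iopoll list. Completed requests are moved to a
 * local 'done' list and reaped via io_iopoll_complete(); *nr_events is
 * updated with the number of completions found.
 */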
2546static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
2547			long min)
2548{
2549	struct io_kiocb *req, *tmp;
2550	LIST_HEAD(done);
2551	bool spin;
2552
2553	/*
2554	 * Only spin for completions if we don't have multiple devices hanging
2555	 * off our complete list, and we're under the requested amount.
2556	 */
2557	spin = !ctx->poll_multi_queue && *nr_events < min;
2558
2559	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
2560		struct kiocb *kiocb = &req->rw.kiocb;
2561		int ret;
2562
2563		/*
2564		 * Move completed and retryable entries to our local lists.
2565		 * If we find a request that requires polling, break out
2566		 * and complete those lists first, if we have entries there.
2567		 */
2568		if (READ_ONCE(req->iopoll_completed)) {
2569			list_move_tail(&req->inflight_entry, &done);
2570			continue;
2571		}
2572		if (!list_empty(&done))
2573			break;
2574
2575		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
2576		if (unlikely(ret < 0))
2577			return ret;
2578		else if (ret)
2579			spin = false;
2580
2581		/* iopoll may have completed current req */
2582		if (READ_ONCE(req->iopoll_completed))
2583			list_move_tail(&req->inflight_entry, &done);
2584	}
2585
2586	if (!list_empty(&done))
2587		io_iopoll_complete(ctx, nr_events, &done);
2588
2589	return 0;
2590}
2591
2592/*
2593 * We can't just wait for polled events to come to us, we have to actively
2594 * find and complete them.
2595 */
2596static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
2597{
2598	if (!(ctx->flags & IORING_SETUP_IOPOLL))
2599		return;
2600
2601	mutex_lock(&ctx->uring_lock);
2602	while (!list_empty(&ctx->iopoll_list)) {
2603		unsigned int nr_events = 0;
2604
2605		io_do_iopoll(ctx, &nr_events, 0);
2606
2607		/* let it sleep and repeat later if can't complete a request */
2608		/* let it sleep and repeat later if we can't complete a request */
2609			break;
2610		/*
2611		 * Ensure we allow local-to-the-cpu processing to take place;
2612		 * in this case we need to ensure that we reap all events.
2613		 * Also let task_work, etc. progress by releasing the mutex.
2614		 */
2615		if (need_resched()) {
2616			mutex_unlock(&ctx->uring_lock);
2617			cond_resched();
2618			mutex_lock(&ctx->uring_lock);
2619		}
2620	}
2621	mutex_unlock(&ctx->uring_lock);
2622}
2623
2624static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
2625{
2626	unsigned int nr_events = 0;
2627	int ret = 0;
2628
2629	/*
2630	 * We disallow the app entering submit/complete with polling, but we
2631	 * still need to lock the ring to prevent racing with polled issue
2632	 * that got punted to a workqueue.
2633	 */
2634	mutex_lock(&ctx->uring_lock);
2635	/*
2636	 * Don't enter poll loop if we already have events pending.
2637	 * If we do, we can potentially be spinning for commands that
2638	 * already triggered a CQE (eg in error).
2639	 */
2640	if (test_bit(0, &ctx->check_cq_overflow))
2641		__io_cqring_overflow_flush(ctx, false);
2642	if (io_cqring_events(ctx))
2643		goto out;
2644	do {
2645		/*
2646		 * If a submit got punted to a workqueue, we can have the
2647		 * application entering polling for a command before it gets
2648		 * issued. That app will hold the uring_lock for the duration
2649		 * of the poll right here, so we need to take a breather every
2650		 * now and then to ensure that the issue has a chance to add
2651		 * the poll to the issued list. Otherwise we can spin here
2652		 * forever, while the workqueue is stuck trying to acquire the
2653		 * very same mutex.
2654		 */
2655		if (list_empty(&ctx->iopoll_list)) {
2656			u32 tail = ctx->cached_cq_tail;
2657
2658			mutex_unlock(&ctx->uring_lock);
2659			io_run_task_work();
2660			mutex_lock(&ctx->uring_lock);
2661
2662			/* some requests don't go through iopoll_list */
2663			if (tail != ctx->cached_cq_tail ||
2664			    list_empty(&ctx->iopoll_list))
2665				break;
2666		}
2667		ret = io_do_iopoll(ctx, &nr_events, min);
2668
2669		if (task_sigpending(current)) {
2670			ret = -EINTR;
2671			goto out;
2672		}
2673	} while (!ret && nr_events < min && !need_resched());
2674out:
2675	mutex_unlock(&ctx->uring_lock);
2676	return ret;
2677}
2678
2679static void kiocb_end_write(struct io_kiocb *req)
2680{
2681	/*
2682	 * Tell lockdep we inherited freeze protection from submission
2683	 * thread.
2684	 */
2685	if (req->flags & REQ_F_ISREG) {
2686		struct super_block *sb = file_inode(req->file)->i_sb;
2687
2688		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
2689		sb_end_write(sb);
2690	}
2691}
2692
2693#ifdef CONFIG_BLOCK
2694static bool io_resubmit_prep(struct io_kiocb *req)
2695{
2696	struct io_async_rw *rw = req->async_data;
2697
2698	if (!rw)
2699		return !io_req_prep_async(req);
2700	iov_iter_restore(&rw->iter, &rw->iter_state);
2701	return true;
2702}
2703
2704static bool io_rw_should_reissue(struct io_kiocb *req)
2705{
2706	umode_t mode = file_inode(req->file)->i_mode;
2707	struct io_ring_ctx *ctx = req->ctx;
2708
2709	if (!S_ISBLK(mode) && !S_ISREG(mode))
2710		return false;
2711	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
2712	    !(ctx->flags & IORING_SETUP_IOPOLL)))
2713		return false;
2714	/*
2715	 * If ref is dying, we might be running poll reap from the exit work.
2716	 * Don't attempt to reissue from that path, just let it fail with
2717	 * -EAGAIN.
2718	 */
2719	if (percpu_ref_is_dying(&ctx->refs))
2720		return false;
2721	/*
2722	 * Play it safe and assume it's not safe to re-import and reissue if we're
2723	 * not in the original thread group (or not in task context).
2724	 */
2725	if (!same_thread_group(req->task, current) || !in_task())
2726		return false;
2727	return true;
2728}
2729#else
2730static bool io_resubmit_prep(struct io_kiocb *req)
2731{
2732	return false;
2733}
2734static bool io_rw_should_reissue(struct io_kiocb *req)
2735{
2736	return false;
2737}
2738#endif
2739
2740/*
2741 * Trigger the notifications after having done some IO, and finish the write
2742 * accounting, if any.
2743 */
2744static void io_req_io_end(struct io_kiocb *req)
2745{
2746	struct io_rw *rw = &req->rw;
2747
2748	if (rw->kiocb.ki_flags & IOCB_WRITE) {
2749		kiocb_end_write(req);
2750		fsnotify_modify(req->file);
2751	} else {
2752		fsnotify_access(req->file);
2753	}
2754}
2755
2756static bool __io_complete_rw_common(struct io_kiocb *req, long res)
2757{
2758	if (res != req->result) {
2759		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
2760		    io_rw_should_reissue(req)) {
2761			/*
2762			 * Reissue will start accounting again, finish the
2763			 * current cycle.
2764			 */
2765			io_req_io_end(req);
2766			req->flags |= REQ_F_REISSUE;
2767			return true;
2768		}
2769		req_set_fail(req);
2770		req->result = res;
2771	}
2772	return false;
2773}
2774
2775static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
2776{
2777	struct io_async_rw *io = req->async_data;
2778
2779	/* add previously done IO, if any */
2780	if (io && io->bytes_done > 0) {
2781		if (res < 0)
2782			res = io->bytes_done;
2783		else
2784			res += io->bytes_done;
2785	}
2786	return res;
2787}
2788
2789static void io_req_task_complete(struct io_kiocb *req, bool *locked)
2790{
2791	unsigned int cflags = io_put_rw_kbuf(req);
2792	int res = req->result;
2793
2794	if (*locked) {
2795		struct io_ring_ctx *ctx = req->ctx;
2796		struct io_submit_state *state = &ctx->submit_state;
2797
2798		io_req_complete_state(req, res, cflags);
2799		state->compl_reqs[state->compl_nr++] = req;
2800		if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
2801			io_submit_flush_completions(ctx);
2802	} else {
2803		io_req_complete_post(req, res, cflags);
2804	}
2805}
2806
2807static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
2808{
2809	io_req_io_end(req);
2810	io_req_task_complete(req, locked);
2811}
2812
2813static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
2814{
2815	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2816
2817	if (__io_complete_rw_common(req, res))
2818		return;
2819	req->result = io_fixup_rw_res(req, res);
2820	req->io_task_work.func = io_req_rw_complete;
2821	io_req_task_work_add(req);
2822}
2823
2824static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
2825{
2826	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
2827
2828	if (kiocb->ki_flags & IOCB_WRITE)
2829		kiocb_end_write(req);
2830	if (unlikely(res != req->result)) {
2831		if (res == -EAGAIN && io_rw_should_reissue(req)) {
2832			req->flags |= REQ_F_REISSUE;
2833			return;
2834		}
2835	}
2836
2837	WRITE_ONCE(req->result, res);
2838	/* order with io_iopoll_complete() checking ->result */
2839	smp_wmb();
2840	WRITE_ONCE(req->iopoll_completed, 1);
2841}
2842
2843/*
2844 * After the iocb has been issued, it's safe to be found on the poll list.
2845 * Adding the kiocb to the list AFTER submission ensures that we don't
2846 * find it from an io_do_iopoll() thread before the issuer is done
2847 * accessing the kiocb cookie.
2848 */
2849static void io_iopoll_req_issued(struct io_kiocb *req)
2850{
2851	struct io_ring_ctx *ctx = req->ctx;
2852	const bool in_async = io_wq_current_is_worker();
2853
2854	/* workqueue context doesn't hold uring_lock, grab it now */
2855	if (unlikely(in_async))
2856		mutex_lock(&ctx->uring_lock);
2857
2858	/*
2859	 * Track whether we have multiple files in our lists. This will impact
2860	 * how we do polling eventually, not spinning if we're on potentially
2861	 * different devices.
2862	 */
2863	if (list_empty(&ctx->iopoll_list)) {
2864		ctx->poll_multi_queue = false;
2865	} else if (!ctx->poll_multi_queue) {
2866		struct io_kiocb *list_req;
2867		unsigned int queue_num0, queue_num1;
2868
2869		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
2870						inflight_entry);
2871
2872		if (list_req->file != req->file) {
2873			ctx->poll_multi_queue = true;
2874		} else {
2875			queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie);
2876			queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie);
2877			if (queue_num0 != queue_num1)
2878				ctx->poll_multi_queue = true;
2879		}
2880	}
2881
2882	/*
2883	 * For fast devices, IO may have already completed. If it has, add
2884	 * it to the front so we find it first.
2885	 */
2886	if (READ_ONCE(req->iopoll_completed))
2887		list_add(&req->inflight_entry, &ctx->iopoll_list);
2888	else
2889		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
2890
2891	if (unlikely(in_async)) {
2892		/*
2893		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
2894		 * in sq thread task context or in io worker task context. If
2895		 * the current task context is the sq thread, we don't need to
2896		 * check whether we should wake up the sq thread.
2897		 */
2898		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
2899		    wq_has_sleeper(&ctx->sq_data->wait))
2900			wake_up(&ctx->sq_data->wait);
2901
2902		mutex_unlock(&ctx->uring_lock);
2903	}
2904}
2905
2906static bool io_bdev_nowait(struct block_device *bdev)
2907{
2908	return !bdev || blk_queue_nowait(bdev_get_queue(bdev));
2909}
2910
2911/*
2912 * If we tracked the file through the SCM inflight mechanism, we could support
2913 * any file. For now, just ensure that anything potentially problematic is done
2914 * inline.
2915 */
2916static bool __io_file_supports_nowait(struct file *file, int rw)
2917{
2918	umode_t mode = file_inode(file)->i_mode;
2919
2920	if (S_ISBLK(mode)) {
2921		if (IS_ENABLED(CONFIG_BLOCK) &&
2922		    io_bdev_nowait(I_BDEV(file->f_mapping->host)))
2923			return true;
2924		return false;
2925	}
2926	if (S_ISSOCK(mode))
2927		return true;
2928	if (S_ISREG(mode)) {
2929		if (IS_ENABLED(CONFIG_BLOCK) &&
2930		    io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
2931		    file->f_op != &io_uring_fops)
2932			return true;
2933		return false;
2934	}
2935
2936	/* any ->read/write should understand O_NONBLOCK */
2937	if (file->f_flags & O_NONBLOCK)
2938		return true;
2939
2940	if (!(file->f_mode & FMODE_NOWAIT))
2941		return false;
2942
2943	if (rw == READ)
2944		return file->f_op->read_iter != NULL;
2945
2946	return file->f_op->write_iter != NULL;
2947}
2948
2949static bool io_file_supports_nowait(struct io_kiocb *req, int rw)
2950{
2951	if (rw == READ && (req->flags & REQ_F_NOWAIT_READ))
2952		return true;
2953	else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE))
2954		return true;
2955
2956	return __io_file_supports_nowait(req->file, rw);
2957}
2958
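/*
 * Prepare a read/write request from the SQE: set up the kiocb (position,
 * flags, ioprio, completion handler), resolve the registered buffer for
 * READ/WRITE_FIXED, and record the user address and length.
 */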
2959static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
2960		      int rw)
2961{
2962	struct io_ring_ctx *ctx = req->ctx;
2963	struct kiocb *kiocb = &req->rw.kiocb;
2964	struct file *file = req->file;
2965	unsigned ioprio;
2966	int ret;
2967
2968	if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode))
2969		req->flags |= REQ_F_ISREG;
2970
2971	kiocb->ki_pos = READ_ONCE(sqe->off);
2972	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
2973	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
2974	ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
2975	if (unlikely(ret))
2976		return ret;
2977
2978	/*
2979	 * If the file is marked O_NONBLOCK, still allow retry for it if it
2980	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
2981	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
2982	 */
2983	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
2984	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw)))
2985		req->flags |= REQ_F_NOWAIT;
2986
2987	ioprio = READ_ONCE(sqe->ioprio);
2988	if (ioprio) {
2989		ret = ioprio_check_cap(ioprio);
2990		if (ret)
2991			return ret;
2992
2993		kiocb->ki_ioprio = ioprio;
2994	} else
2995		kiocb->ki_ioprio = get_current_ioprio();
2996
2997	if (ctx->flags & IORING_SETUP_IOPOLL) {
2998		if (!(kiocb->ki_flags & IOCB_DIRECT) ||
2999		    !kiocb->ki_filp->f_op->iopoll)
3000			return -EOPNOTSUPP;
3001
3002		kiocb->ki_flags |= IOCB_HIPRI;
3003		kiocb->ki_complete = io_complete_rw_iopoll;
3004		req->iopoll_completed = 0;
3005	} else {
3006		if (kiocb->ki_flags & IOCB_HIPRI)
3007			return -EINVAL;
3008		kiocb->ki_complete = io_complete_rw;
3009	}
3010
3011	/* used for fixed read/write too - just read unconditionally */
3012	req->buf_index = READ_ONCE(sqe->buf_index);
3013	req->imu = NULL;
3014
3015	if (req->opcode == IORING_OP_READ_FIXED ||
3016	    req->opcode == IORING_OP_WRITE_FIXED) {
3017		struct io_ring_ctx *ctx = req->ctx;
3018		u16 index;
3019
3020		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
3021			return -EFAULT;
3022		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
3023		req->imu = ctx->user_bufs[index];
3024		io_req_set_rsrc_node(req);
3025	}
3026
3027	req->rw.addr = READ_ONCE(sqe->addr);
3028	req->rw.len = READ_ONCE(sqe->len);
3029	return 0;
3030}
3031
3032static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
3033{
3034	switch (ret) {
3035	case -EIOCBQUEUED:
3036		break;
3037	case -ERESTARTSYS:
3038	case -ERESTARTNOINTR:
3039	case -ERESTARTNOHAND:
3040	case -ERESTART_RESTARTBLOCK:
3041		/*
3042		 * We can't just restart the syscall, since previously
3043		 * submitted sqes may already be in progress. Just fail this
3044		 * IO with EINTR.
3045		 */
3046		ret = -EINTR;
3047		fallthrough;
3048	default:
3049		kiocb->ki_complete(kiocb, ret, 0);
3050	}
3051}
3052
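/*
 * Return the file position to use for this request. An explicit offset in
 * the kiocb wins; otherwise, for non-stream files, use (and later update)
 * the file's current position. Stream files get a NULL position pointer.
 */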
3053static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
3054{
3055	struct kiocb *kiocb = &req->rw.kiocb;
3056
3057	if (kiocb->ki_pos != -1)
3058		return &kiocb->ki_pos;
3059
3060	if (!(req->file->f_mode & FMODE_STREAM)) {
3061		req->flags |= REQ_F_CUR_POS;
3062		kiocb->ki_pos = req->file->f_pos;
3063		return &kiocb->ki_pos;
3064	}
3065
3066	kiocb->ki_pos = 0;
3067	return NULL;
3068}
3069
3070static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
3071		       unsigned int issue_flags)
3072{
3073	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
3074
3075	if (req->flags & REQ_F_CUR_POS)
3076		req->file->f_pos = kiocb->ki_pos;
3077	if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) {
3078		if (!__io_complete_rw_common(req, ret)) {
3079			/*
3080			 * Safe to call io_req_io_end() from here as we're inline
3081			 * from the submission path.
3082			 */
3083			io_req_io_end(req);
3084			__io_req_complete(req, issue_flags,
3085					  io_fixup_rw_res(req, ret),
3086					  io_put_rw_kbuf(req));
3087		}
3088	} else {
3089		io_rw_done(kiocb, ret);
3090	}
3091
3092	if (req->flags & REQ_F_REISSUE) {
3093		req->flags &= ~REQ_F_REISSUE;
3094		if (io_resubmit_prep(req)) {
3095			io_req_task_queue_reissue(req);
3096		} else {
3097			unsigned int cflags = io_put_rw_kbuf(req);
3098			struct io_ring_ctx *ctx = req->ctx;
3099
3100			ret = io_fixup_rw_res(req, ret);
3101			req_set_fail(req);
3102			if (!(issue_flags & IO_URING_F_NONBLOCK)) {
3103				mutex_lock(&ctx->uring_lock);
3104				__io_req_complete(req, issue_flags, ret, cflags);
3105				mutex_unlock(&ctx->uring_lock);
3106			} else {
3107				__io_req_complete(req, issue_flags, ret, cflags);
3108			}
3109		}
3110	}
3111}
3112
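/*
 * Set up a bvec iterator over a registered (fixed) buffer, validating that
 * the requested range lies entirely within the mapped region.
 */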
3113static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
3114			     struct io_mapped_ubuf *imu)
3115{
3116	size_t len = req->rw.len;
3117	u64 buf_end, buf_addr = req->rw.addr;
3118	size_t offset;
3119
3120	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
3121		return -EFAULT;
3122	/* not inside the mapped region */
3123	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
3124		return -EFAULT;
3125
3126	/*
3127	 * May not be the start of the buffer; set the size appropriately
3128	 * and advance us to the beginning.
3129	 */
3130	offset = buf_addr - imu->ubuf;
3131	iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len);
3132
3133	if (offset) {
3134		/*
3135		 * Don't use iov_iter_advance() here, as it's really slow for
3136		 * using the latter parts of a big fixed buffer - it iterates
3137		 * over each segment manually. We can cheat a bit here, because
3138		 * we know that:
3139		 *
3140		 * 1) it's a BVEC iter, we set it up
3141		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
3142		 *    first and last bvec
3143		 *
3144		 * So just find our index, and adjust the iterator afterwards.
3145		 * If the offset is within the first bvec (or the whole first
3146		 * bvec), just use iov_iter_advance(). This makes it easier
3147		 * since we can just skip the first segment, which may not
3148		 * be PAGE_SIZE aligned.
3149		 */
3150		const struct bio_vec *bvec = imu->bvec;
3151
3152		if (offset < bvec->bv_len) {
3153			iov_iter_advance(iter, offset);
3154		} else {
3155			unsigned long seg_skip;
3156
3157			/* skip first vec */
3158			offset -= bvec->bv_len;
3159			seg_skip = 1 + (offset >> PAGE_SHIFT);
3160
3161			iter->bvec = bvec + seg_skip;
3162			iter->nr_segs -= seg_skip;
3163			iter->count -= bvec->bv_len + offset;
3164			iter->iov_offset = offset & ~PAGE_MASK;
3165		}
3166	}
3167
3168	return 0;
3169}
3170
3171static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter)
3172{
3173	if (WARN_ON_ONCE(!req->imu))
3174		return -EFAULT;
3175	return __io_import_fixed(req, rw, iter, req->imu);
3176}
3177
3178static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock)
3179{
3180	if (needs_lock)
3181		mutex_unlock(&ctx->uring_lock);
3182}
3183
3184static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
3185{
3186	/*
3187	 * "Normal" inline submissions always hold the uring_lock, since we
3188	 * grab it from the system call. Same is true for the SQPOLL offload.
3189	 * The only exception is when we've detached the request and issue it
3190 * from an async worker thread; grab the lock for that case.
3191	 */
3192	if (needs_lock)
3193		mutex_lock(&ctx->uring_lock);
3194}
3195
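/*
 * Pick a buffer from the provided-buffer group @bgid, removing it from the
 * group list and clamping *len to the buffer's size. Returns
 * ERR_PTR(-ENOBUFS) if the group is empty or doesn't exist.
 */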
3196static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
3197					  int bgid, struct io_buffer *kbuf,
3198					  bool needs_lock)
3199{
3200	struct io_buffer *head;
3201
3202	if (req->flags & REQ_F_BUFFER_SELECTED)
3203		return kbuf;
3204
3205	io_ring_submit_lock(req->ctx, needs_lock);
3206
3207	lockdep_assert_held(&req->ctx->uring_lock);
3208
3209	head = xa_load(&req->ctx->io_buffers, bgid);
3210	if (head) {
3211		if (!list_empty(&head->list)) {
3212			kbuf = list_last_entry(&head->list, struct io_buffer,
3213							list);
3214			list_del(&kbuf->list);
3215		} else {
3216			kbuf = head;
3217			xa_erase(&req->ctx->io_buffers, bgid);
3218		}
3219		if (*len > kbuf->len)
3220			*len = kbuf->len;
3221	} else {
3222		kbuf = ERR_PTR(-ENOBUFS);
3223	}
3224
3225	io_ring_submit_unlock(req->ctx, needs_lock);
3226
3227	return kbuf;
3228}
3229
3230static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
3231					bool needs_lock)
3232{
3233	struct io_buffer *kbuf;
3234	u16 bgid;
3235
3236	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3237	bgid = req->buf_index;
3238	kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock);
3239	if (IS_ERR(kbuf))
3240		return kbuf;
3241	req->rw.addr = (u64) (unsigned long) kbuf;
3242	req->flags |= REQ_F_BUFFER_SELECTED;
3243	return u64_to_user_ptr(kbuf->addr);
3244}
3245
3246#ifdef CONFIG_COMPAT
3247static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
3248				bool needs_lock)
3249{
3250	struct compat_iovec __user *uiov;
3251	compat_ssize_t clen;
3252	void __user *buf;
3253	ssize_t len;
3254
3255	uiov = u64_to_user_ptr(req->rw.addr);
3256	if (!access_ok(uiov, sizeof(*uiov)))
3257		return -EFAULT;
3258	if (__get_user(clen, &uiov->iov_len))
3259		return -EFAULT;
3260	if (clen < 0)
3261		return -EINVAL;
3262
3263	len = clen;
3264	buf = io_rw_buffer_select(req, &len, needs_lock);
3265	if (IS_ERR(buf))
3266		return PTR_ERR(buf);
3267	iov[0].iov_base = buf;
3268	iov[0].iov_len = (compat_size_t) len;
3269	return 0;
3270}
3271#endif
3272
3273static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3274				      bool needs_lock)
3275{
3276	struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr);
3277	void __user *buf;
3278	ssize_t len;
3279
3280	if (copy_from_user(iov, uiov, sizeof(*uiov)))
3281		return -EFAULT;
3282
3283	len = iov[0].iov_len;
3284	if (len < 0)
3285		return -EINVAL;
3286	buf = io_rw_buffer_select(req, &len, needs_lock);
3287	if (IS_ERR(buf))
3288		return PTR_ERR(buf);
3289	iov[0].iov_base = buf;
3290	iov[0].iov_len = len;
3291	return 0;
3292}
3293
3294static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
3295				    bool needs_lock)
3296{
3297	if (req->flags & REQ_F_BUFFER_SELECTED) {
3298		struct io_buffer *kbuf;
3299
3300		kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
3301		iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
3302		iov[0].iov_len = kbuf->len;
3303		return 0;
3304	}
3305	if (req->rw.len != 1)
3306		return -EINVAL;
3307
3308#ifdef CONFIG_COMPAT
3309	if (req->ctx->compat)
3310		return io_compat_import(req, iov, needs_lock);
3311#endif
3312
3313	return __io_iov_buffer_select(req, iov, needs_lock);
3314}
3315
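/*
 * Build the iov_iter for a read/write request: fixed buffers, provided
 * buffers (IOSQE_BUFFER_SELECT), plain READ/WRITE with a single range, or a
 * full iovec imported from userspace.
 */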
3316static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec,
3317			   struct iov_iter *iter, bool needs_lock)
3318{
3319	void __user *buf = u64_to_user_ptr(req->rw.addr);
3320	size_t sqe_len = req->rw.len;
3321	u8 opcode = req->opcode;
3322	ssize_t ret;
3323
3324	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
3325		*iovec = NULL;
3326		return io_import_fixed(req, rw, iter);
3327	}
3328
3329	/* buffer index only valid with fixed read/write, or buffer select  */
3330	if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))
3331		return -EINVAL;
3332
3333	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
3334		if (req->flags & REQ_F_BUFFER_SELECT) {
3335			buf = io_rw_buffer_select(req, &sqe_len, needs_lock);
3336			if (IS_ERR(buf))
3337				return PTR_ERR(buf);
3338			req->rw.len = sqe_len;
3339		}
3340
3341		ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
3342		*iovec = NULL;
3343		return ret;
3344	}
3345
3346	if (req->flags & REQ_F_BUFFER_SELECT) {
3347		ret = io_iov_buffer_select(req, *iovec, needs_lock);
3348		if (!ret)
3349			iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len);
3350		*iovec = NULL;
3351		return ret;
3352	}
3353
3354	return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
3355			      req->ctx->compat);
3356}
3357
3358static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
3359{
3360	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
3361}
3362
3363/*
3364 * For files that don't have ->read_iter() and ->write_iter(), handle them
3365 * by looping over ->read() or ->write() manually.
3366 */
3367static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
3368{
3369	struct kiocb *kiocb = &req->rw.kiocb;
3370	struct file *file = req->file;
3371	ssize_t ret = 0;
3372	loff_t *ppos;
3373
3374	/*
3375	 * Don't support polled IO through this interface, and we can't
3376	 * support non-blocking either. For the latter, this just causes
3377	 * the kiocb to be handled from an async context.
3378	 */
3379	if (kiocb->ki_flags & IOCB_HIPRI)
3380		return -EOPNOTSUPP;
3381	if (kiocb->ki_flags & IOCB_NOWAIT)
3382		return -EAGAIN;
3383
3384	ppos = io_kiocb_ppos(kiocb);
3385
3386	while (iov_iter_count(iter)) {
3387		struct iovec iovec;
3388		ssize_t nr;
3389
3390		if (!iov_iter_is_bvec(iter)) {
3391			iovec = iov_iter_iovec(iter);
3392		} else {
3393			iovec.iov_base = u64_to_user_ptr(req->rw.addr);
3394			iovec.iov_len = req->rw.len;
3395		}
3396
3397		if (rw == READ) {
3398			nr = file->f_op->read(file, iovec.iov_base,
3399					      iovec.iov_len, ppos);
3400		} else {
3401			nr = file->f_op->write(file, iovec.iov_base,
3402					       iovec.iov_len, ppos);
3403		}
3404
3405		if (nr < 0) {
3406			if (!ret)
3407				ret = nr;
3408			break;
3409		}
3410		ret += nr;
3411		if (!iov_iter_is_bvec(iter)) {
3412			iov_iter_advance(iter, nr);
3413		} else {
3414			req->rw.addr += nr;
3415			req->rw.len -= nr;
3416			if (!req->rw.len)
3417				break;
3418		}
3419		if (nr != iovec.iov_len)
3420			break;
3421	}
3422
3423	return ret;
3424}
3425
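/*
 * Copy the iterator (and any inline iovecs) into the request's async data so
 * the IO can be retried later from a different context.
 */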
3426static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
3427			  const struct iovec *fast_iov, struct iov_iter *iter)
3428{
3429	struct io_async_rw *rw = req->async_data;
3430
3431	memcpy(&rw->iter, iter, sizeof(*iter));
3432	rw->free_iovec = iovec;
3433	rw->bytes_done = 0;
3434	/* can only be fixed buffers, no need to do anything */
3435	if (iov_iter_is_bvec(iter))
3436		return;
3437	if (!iovec) {
3438		unsigned iov_off = 0;
3439
3440		rw->iter.iov = rw->fast_iov;
3441		if (iter->iov != fast_iov) {
3442			iov_off = iter->iov - fast_iov;
3443			rw->iter.iov += iov_off;
3444		}
3445		if (rw->fast_iov != fast_iov)
3446			memcpy(rw->fast_iov + iov_off, fast_iov + iov_off,
3447			       sizeof(struct iovec) * iter->nr_segs);
3448	} else {
3449		req->flags |= REQ_F_NEED_CLEANUP;
3450	}
3451}
3452
3453static inline int io_alloc_async_data(struct io_kiocb *req)
3454{
3455	WARN_ON_ONCE(!io_op_defs[req->opcode].async_size);
3456	req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL);
3457	return req->async_data == NULL;
3458}
3459
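/*
 * Allocate async data for a read/write request (if the opcode needs async
 * setup or @force is set) and save the current iterator state for retry.
 */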
3460static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
3461			     const struct iovec *fast_iov,
3462			     struct iov_iter *iter, bool force)
3463{
3464	if (!force && !io_op_defs[req->opcode].needs_async_setup)
3465		return 0;
3466	if (!req->async_data) {
3467		struct io_async_rw *iorw;
3468
3469		if (io_alloc_async_data(req)) {
3470			kfree(iovec);
3471			return -ENOMEM;
3472		}
3473
3474		io_req_map_rw(req, iovec, fast_iov, iter);
3475		iorw = req->async_data;
3476		/* we've copied and mapped the iter, ensure state is saved */
3477		iov_iter_save_state(&iorw->iter, &iorw->iter_state);
3478	}
3479	return 0;
3480}
3481
3482static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
3483{
3484	struct io_async_rw *iorw = req->async_data;
3485	struct iovec *iov = iorw->fast_iov;
3486	int ret;
3487
3488	iorw->bytes_done = 0;
3489	iorw->free_iovec = NULL;
3490
3491	ret = io_import_iovec(rw, req, &iov, &iorw->iter, false);
3492	if (unlikely(ret < 0))
3493		return ret;
3494
3495	if (iov) {
3496		iorw->free_iovec = iov;
3497		req->flags |= REQ_F_NEED_CLEANUP;
3498	}
3499	iov_iter_save_state(&iorw->iter, &iorw->iter_state);
3500	return 0;
3501}
3502
3503static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3504{
3505	if (unlikely(!(req->file->f_mode & FMODE_READ)))
3506		return -EBADF;
3507	return io_prep_rw(req, sqe, READ);
3508}
3509
3510/*
3511 * This is our waitqueue callback handler, registered through lock_page_async()
3512 * when we initially tried to do the IO with the iocb and armed our waitqueue.
3513 * This gets called when the page is unlocked, and we generally expect that to
3514 * happen when the page IO is completed and the page is now uptodate. This will
3515 * queue a task_work based retry of the operation, attempting to copy the data
3516 * again. If the latter fails because the page was NOT uptodate, then we will
3517 * do a thread based blocking retry of the operation. That's the unexpected
3518 * slow path.
3519 */
3520static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
3521			     int sync, void *arg)
3522{
3523	struct wait_page_queue *wpq;
3524	struct io_kiocb *req = wait->private;
3525	struct wait_page_key *key = arg;
3526
3527	wpq = container_of(wait, struct wait_page_queue, wait);
3528
3529	if (!wake_page_match(wpq, key))
3530		return 0;
3531
3532	req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
3533	list_del_init(&wait->entry);
3534	io_req_task_queue(req);
3535	return 1;
3536}
3537
3538/*
3539 * This controls whether a given IO request should be armed for async page
3540 * based retry. If we return false here, the request is handed to the async
3541 * worker threads for retry. If we're doing buffered reads on a regular file,
3542 * we prepare a private wait_page_queue entry and retry the operation. This
3543 * will either succeed because the page is now uptodate and unlocked, or it
3544 * will register a callback when the page is unlocked at IO completion. Through
3545 * that callback, io_uring uses task_work to setup a retry of the operation.
3546 * That retry will attempt the buffered read again. The retry will generally
3547 * succeed, or in rare cases where it fails, we then fall back to using the
3548 * async worker threads for a blocking retry.
3549 */
3550static bool io_rw_should_retry(struct io_kiocb *req)
3551{
3552	struct io_async_rw *rw = req->async_data;
3553	struct wait_page_queue *wait = &rw->wpq;
3554	struct kiocb *kiocb = &req->rw.kiocb;
3555
3556	/* never retry for NOWAIT, we just complete with -EAGAIN */
3557	if (req->flags & REQ_F_NOWAIT)
3558		return false;
3559
3560	/* Only for buffered IO */
3561	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
3562		return false;
3563
3564	/*
3565	 * just use poll if we can, and don't attempt if the fs doesn't
3566	 * support callback based unlocks
3567	 */
3568	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
3569		return false;
3570
3571	wait->wait.func = io_async_buf_func;
3572	wait->wait.private = req;
3573	wait->wait.flags = 0;
3574	INIT_LIST_HEAD(&wait->wait.entry);
3575	kiocb->ki_flags |= IOCB_WAITQ;
3576	kiocb->ki_flags &= ~IOCB_NOWAIT;
3577	kiocb->ki_waitq = wait;
3578	return true;
3579}
3580
3581static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
3582{
3583	if (req->file->f_op->read_iter)
3584		return call_read_iter(req->file, &req->rw.kiocb, iter);
3585	else if (req->file->f_op->read)
3586		return loop_rw_iter(READ, req, iter);
3587	else
3588		return -EINVAL;
3589}
3590
3591static bool need_read_all(struct io_kiocb *req)
3592{
3593	return req->flags & REQ_F_ISREG ||
3594		S_ISBLK(file_inode(req->file)->i_mode);
3595}
3596
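/*
 * Issue a read. If a nonblocking attempt can't make progress, the iterator
 * state is saved in async data and the request is punted with -EAGAIN; short
 * buffered reads of regular/block files are retried, using the async
 * buffered waitqueue when the file supports it.
 */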
3597static int io_read(struct io_kiocb *req, unsigned int issue_flags)
3598{
3599	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3600	struct kiocb *kiocb = &req->rw.kiocb;
3601	struct iov_iter __iter, *iter = &__iter;
3602	struct io_async_rw *rw = req->async_data;
3603	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3604	struct iov_iter_state __state, *state;
3605	ssize_t ret, ret2;
3606	loff_t *ppos;
3607
3608	if (rw) {
3609		iter = &rw->iter;
3610		state = &rw->iter_state;
3611		/*
3612		 * We come here from an earlier attempt, restore our state to
3613		 * match in case it doesn't. It's cheap enough that we don't
3614		 * need to make this conditional.
3615		 */
3616		iov_iter_restore(iter, state);
3617		iovec = NULL;
3618	} else {
3619		ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
3620		if (ret < 0)
3621			return ret;
3622		state = &__state;
3623		iov_iter_save_state(iter, state);
3624	}
3625	req->result = iov_iter_count(iter);
3626
3627	/* Ensure we clear previously set non-block flag */
3628	if (!force_nonblock)
3629		kiocb->ki_flags &= ~IOCB_NOWAIT;
3630	else
3631		kiocb->ki_flags |= IOCB_NOWAIT;
3632
3633	/* If the file doesn't support async, just async punt */
3634	if (force_nonblock && !io_file_supports_nowait(req, READ)) {
3635		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3636		return ret ?: -EAGAIN;
3637	}
3638
3639	ppos = io_kiocb_update_pos(req);
3640
3641	ret = rw_verify_area(READ, req->file, ppos, req->result);
3642	if (unlikely(ret)) {
3643		kfree(iovec);
3644		return ret;
3645	}
3646
3647	ret = io_iter_do_read(req, iter);
3648
3649	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
3650		req->flags &= ~REQ_F_REISSUE;
3651		/* IOPOLL retry should happen for io-wq threads */
3652		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
3653			goto done;
3654		/* no retry on NONBLOCK nor RWF_NOWAIT */
3655		if (req->flags & REQ_F_NOWAIT)
3656			goto done;
3657		ret = 0;
3658	} else if (ret == -EIOCBQUEUED) {
3659		goto out_free;
3660	} else if (ret <= 0 || ret == req->result || !force_nonblock ||
3661		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
3662		/* read all, failed, already did sync or don't want to retry */
3663		goto done;
3664	}
3665
3666	/*
3667	 * Don't depend on the iter state matching what was consumed, or being
3668	 * untouched in case of error. Restore it and we'll advance it
3669	 * manually if we need to.
3670	 */
3671	iov_iter_restore(iter, state);
3672
3673	ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
3674	if (ret2)
3675		return ret2;
3676
3677	iovec = NULL;
3678	rw = req->async_data;
3679	/*
3680	 * Now use our persistent iterator and state, if we aren't already.
3681	 * We've restored and mapped the iter to match.
3682	 */
3683	if (iter != &rw->iter) {
3684		iter = &rw->iter;
3685		state = &rw->iter_state;
3686	}
3687
3688	do {
3689		/*
3690		 * We end up here because of a partial read, either from
3691		 * above or inside this loop. Advance the iter by the bytes
3692		 * that were consumed.
3693		 */
3694		iov_iter_advance(iter, ret);
3695		if (!iov_iter_count(iter))
3696			break;
3697		rw->bytes_done += ret;
3698		iov_iter_save_state(iter, state);
3699
3700		/* if we can retry, do so with the callbacks armed */
3701		if (!io_rw_should_retry(req)) {
3702			kiocb->ki_flags &= ~IOCB_WAITQ;
3703			return -EAGAIN;
3704		}
3705
3706		req->result = iov_iter_count(iter);
3707		/*
3708		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
3709		 * we get -EIOCBQUEUED, then we'll get a notification when the
3710		 * desired page gets unlocked. We can also get a partial read
3711		 * here, and if we do, then just retry at the new offset.
3712		 */
3713		ret = io_iter_do_read(req, iter);
3714		if (ret == -EIOCBQUEUED)
3715			return 0;
3716		/* we got some bytes, but not all. retry. */
3717		kiocb->ki_flags &= ~IOCB_WAITQ;
3718		iov_iter_restore(iter, state);
3719	} while (ret > 0);
3720done:
3721	kiocb_done(kiocb, ret, issue_flags);
3722out_free:
3723	/* it's faster to check for NULL here than to delegate to kfree */
3724	if (iovec)
3725		kfree(iovec);
3726	return 0;
3727}
3728
3729static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
3730{
3731	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
3732		return -EBADF;
3733	return io_prep_rw(req, sqe, WRITE);
3734}
3735
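/*
 * Issue a write. Grabs freeze protection for regular files (released at
 * completion), punts to async context when a nonblocking issue can't make
 * progress, and retries -EAGAIN by saving the iterator state in async data.
 */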
3736static int io_write(struct io_kiocb *req, unsigned int issue_flags)
3737{
3738	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
3739	struct kiocb *kiocb = &req->rw.kiocb;
3740	struct iov_iter __iter, *iter = &__iter;
3741	struct io_async_rw *rw = req->async_data;
3742	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
3743	struct iov_iter_state __state, *state;
3744	ssize_t ret, ret2;
3745	loff_t *ppos;
3746
3747	if (rw) {
3748		iter = &rw->iter;
3749		state = &rw->iter_state;
3750		iov_iter_restore(iter, state);
3751		iovec = NULL;
3752	} else {
3753		ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
3754		if (ret < 0)
3755			return ret;
3756		state = &__state;
3757		iov_iter_save_state(iter, state);
3758	}
3759	req->result = iov_iter_count(iter);
3760
3761	/* Ensure we clear previously set non-block flag */
3762	if (!force_nonblock)
3763		kiocb->ki_flags &= ~IOCB_NOWAIT;
3764	else
3765		kiocb->ki_flags |= IOCB_NOWAIT;
3766
3767	/* If the file doesn't support async, just async punt */
3768	if (force_nonblock && !io_file_supports_nowait(req, WRITE))
3769		goto copy_iov;
3770
3771	/* file path doesn't support NOWAIT for non-direct_IO */
3772	if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) &&
3773	    (req->flags & REQ_F_ISREG))
3774		goto copy_iov;
3775
3776	ppos = io_kiocb_update_pos(req);
3777
3778	ret = rw_verify_area(WRITE, req->file, ppos, req->result);
3779	if (unlikely(ret))
3780		goto out_free;
3781
3782	/*
3783	 * Open-code file_start_write here to grab freeze protection,
3784	 * which will be released by another thread in
3785	 * io_complete_rw().  Fool lockdep by telling it the lock got
3786	 * released so that it doesn't complain about the held lock when
3787	 * we return to userspace.
3788	 */
3789	if (req->flags & REQ_F_ISREG) {
3790		sb_start_write(file_inode(req->file)->i_sb);
3791		__sb_writers_release(file_inode(req->file)->i_sb,
3792					SB_FREEZE_WRITE);
3793	}
3794	kiocb->ki_flags |= IOCB_WRITE;
3795
3796	if (req->file->f_op->write_iter)
3797		ret2 = call_write_iter(req->file, kiocb, iter);
3798	else if (req->file->f_op->write)
3799		ret2 = loop_rw_iter(WRITE, req, iter);
3800	else
3801		ret2 = -EINVAL;
3802
3803	if (req->flags & REQ_F_REISSUE) {
3804		req->flags &= ~REQ_F_REISSUE;
3805		ret2 = -EAGAIN;
3806	}
3807
3808	/*
3809	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
3810	 * retry them without IOCB_NOWAIT.
3811	 */
3812	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
3813		ret2 = -EAGAIN;
3814	/* no retry on NONBLOCK nor RWF_NOWAIT */
3815	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
3816		goto done;
3817	if (!force_nonblock || ret2 != -EAGAIN) {
3818		/* IOPOLL retry should happen for io-wq threads */
3819		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
3820			goto copy_iov;
3821done:
3822		kiocb_done(kiocb, ret2, issue_flags);
3823	} else {
3824copy_iov:
3825		iov_iter_restore(iter, state);
3826		ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
3827		if (!ret) {
3828			if (kiocb->ki_flags & IOCB_WRITE)
3829				kiocb_end_write(req);
3830			return -EAGAIN;
3831		}
3832		return ret;
3833	}
3834out_free:
3835	/* it's reportedly faster than delegating the null check to kfree() */
3836	if (iovec)
3837		kfree(iovec);
3838	return ret;
3839}
3840
3841static int io_renameat_prep(struct io_kiocb *req,
3842			    const struct io_uring_sqe *sqe)
3843{
3844	struct io_rename *ren = &req->rename;
3845	const char __user *oldf, *newf;
3846
3847	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3848		return -EINVAL;
3849	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
3850		return -EINVAL;
3851	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3852		return -EBADF;
3853
3854	ren->old_dfd = READ_ONCE(sqe->fd);
3855	oldf = u64_to_user_ptr(READ_ONCE(sqe->addr));
3856	newf = u64_to_user_ptr(READ_ONCE(sqe->addr2));
3857	ren->new_dfd = READ_ONCE(sqe->len);
3858	ren->flags = READ_ONCE(sqe->rename_flags);
3859
3860	ren->oldpath = getname(oldf);
3861	if (IS_ERR(ren->oldpath))
3862		return PTR_ERR(ren->oldpath);
3863
3864	ren->newpath = getname(newf);
3865	if (IS_ERR(ren->newpath)) {
3866		putname(ren->oldpath);
3867		return PTR_ERR(ren->newpath);
3868	}
3869
3870	req->flags |= REQ_F_NEED_CLEANUP;
3871	return 0;
3872}
3873
3874static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
3875{
3876	struct io_rename *ren = &req->rename;
3877	int ret;
3878
3879	if (issue_flags & IO_URING_F_NONBLOCK)
3880		return -EAGAIN;
3881
3882	ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
3883				ren->newpath, ren->flags);
3884
3885	req->flags &= ~REQ_F_NEED_CLEANUP;
3886	if (ret < 0)
3887		req_set_fail(req);
3888	io_req_complete(req, ret);
3889	return 0;
3890}
3891
3892static int io_unlinkat_prep(struct io_kiocb *req,
3893			    const struct io_uring_sqe *sqe)
3894{
3895	struct io_unlink *un = &req->unlink;
3896	const char __user *fname;
3897
3898	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3899		return -EINVAL;
3900	if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
3901	    sqe->splice_fd_in)
3902		return -EINVAL;
3903	if (unlikely(req->flags & REQ_F_FIXED_FILE))
3904		return -EBADF;
3905
3906	un->dfd = READ_ONCE(sqe->fd);
3907
3908	un->flags = READ_ONCE(sqe->unlink_flags);
3909	if (un->flags & ~AT_REMOVEDIR)
3910		return -EINVAL;
3911
3912	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
3913	un->filename = getname(fname);
3914	if (IS_ERR(un->filename))
3915		return PTR_ERR(un->filename);
3916
3917	req->flags |= REQ_F_NEED_CLEANUP;
3918	return 0;
3919}
3920
3921static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
3922{
3923	struct io_unlink *un = &req->unlink;
3924	int ret;
3925
3926	if (issue_flags & IO_URING_F_NONBLOCK)
3927		return -EAGAIN;
3928
3929	if (un->flags & AT_REMOVEDIR)
3930		ret = do_rmdir(un->dfd, un->filename);
3931	else
3932		ret = do_unlinkat(un->dfd, un->filename);
3933
3934	req->flags &= ~REQ_F_NEED_CLEANUP;
3935	if (ret < 0)
3936		req_set_fail(req);
3937	io_req_complete(req, ret);
3938	return 0;
3939}
3940
3941static int io_shutdown_prep(struct io_kiocb *req,
3942			    const struct io_uring_sqe *sqe)
3943{
3944#if defined(CONFIG_NET)
3945	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3946		return -EINVAL;
3947	if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
3948		     sqe->buf_index || sqe->splice_fd_in))
3949		return -EINVAL;
3950
3951	req->shutdown.how = READ_ONCE(sqe->len);
3952	return 0;
3953#else
3954	return -EOPNOTSUPP;
3955#endif
3956}
3957
3958static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
3959{
3960#if defined(CONFIG_NET)
3961	struct socket *sock;
3962	int ret;
3963
3964	if (issue_flags & IO_URING_F_NONBLOCK)
3965		return -EAGAIN;
3966
3967	sock = sock_from_file(req->file, &ret);
3968	if (unlikely(!sock))
3969		return ret;
3970
3971	ret = __sys_shutdown_sock(sock, req->shutdown.how);
3972	if (ret < 0)
3973		req_set_fail(req);
3974	io_req_complete(req, ret);
3975	return 0;
3976#else
3977	return -EOPNOTSUPP;
3978#endif
3979}
3980
3981static int __io_splice_prep(struct io_kiocb *req,
3982			    const struct io_uring_sqe *sqe)
3983{
3984	struct io_splice *sp = &req->splice;
3985	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
3986
3987	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
3988		return -EINVAL;
3989
3990	sp->len = READ_ONCE(sqe->len);
3991	sp->flags = READ_ONCE(sqe->splice_flags);
3992	if (unlikely(sp->flags & ~valid_flags))
3993		return -EINVAL;
3994	sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
3995	return 0;
3996}
3997
3998static int io_tee_prep(struct io_kiocb *req,
3999		       const struct io_uring_sqe *sqe)
4000{
4001	if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off))
4002		return -EINVAL;
4003	return __io_splice_prep(req, sqe);
4004}
4005
4006static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
4007{
4008	struct io_splice *sp = &req->splice;
4009	struct file *out = sp->file_out;
4010	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4011	struct file *in;
4012	long ret = 0;
4013
4014	if (issue_flags & IO_URING_F_NONBLOCK)
4015		return -EAGAIN;
4016
4017	in = io_file_get(req->ctx, req, sp->splice_fd_in,
4018			 (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
4019	if (!in) {
4020		ret = -EBADF;
4021		goto done;
4022	}
4023
4024	if (sp->len)
4025		ret = do_tee(in, out, sp->len, flags);
4026
4027	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4028		io_put_file(in);
4029done:
4030	if (ret != sp->len)
4031		req_set_fail(req);
4032	io_req_complete(req, ret);
4033	return 0;
4034}
4035
4036static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4037{
4038	struct io_splice *sp = &req->splice;
4039
4040	sp->off_in = READ_ONCE(sqe->splice_off_in);
4041	sp->off_out = READ_ONCE(sqe->off);
4042	return __io_splice_prep(req, sqe);
4043}
4044
4045static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
4046{
4047	struct io_splice *sp = &req->splice;
4048	struct file *out = sp->file_out;
4049	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
4050	loff_t *poff_in, *poff_out;
4051	struct file *in;
4052	long ret = 0;
4053
4054	if (issue_flags & IO_URING_F_NONBLOCK)
4055		return -EAGAIN;
4056
4057	in = io_file_get(req->ctx, req, sp->splice_fd_in,
4058			 (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
4059	if (!in) {
4060		ret = -EBADF;
4061		goto done;
4062	}
4063
4064	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
4065	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
4066
4067	if (sp->len)
4068		ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
4069
4070	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
4071		io_put_file(in);
4072done:
4073	if (ret != sp->len)
4074		req_set_fail(req);
4075	io_req_complete(req, ret);
4076	return 0;
4077}
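/*
 * Illustrative sketch, not part of the build: the SQE layout consumed by
 * io_splice_prep()/__io_splice_prep() above.  The output file is taken
 * from sqe->fd (req->file), the input from sqe->splice_fd_in, and an
 * offset of -1 means "use the current file position", mirroring the
 * poff_in/poff_out handling in io_splice().
 *
 *	sqe->opcode	   = IORING_OP_SPLICE;
 *	sqe->splice_fd_in  = fd_in;		// sp->splice_fd_in
 *	sqe->fd		   = fd_out;		// sp->file_out via req->file
 *	sqe->splice_off_in = off_in;		// sp->off_in, -1 for current pos
 *	sqe->off	   = off_out;		// sp->off_out, -1 for current pos
 *	sqe->len	   = nbytes;		// sp->len
 *	sqe->splice_flags  = 0;			// may include SPLICE_F_FD_IN_FIXED
 */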
4078
4079/*
4080 * IORING_OP_NOP just posts a completion event, nothing else.
4081 */
4082static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
4083{
4084	struct io_ring_ctx *ctx = req->ctx;
4085
4086	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4087		return -EINVAL;
4088
4089	__io_req_complete(req, issue_flags, 0, 0);
4090	return 0;
4091}
4092
4093static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4094{
4095	struct io_ring_ctx *ctx = req->ctx;
4096
4097	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4098		return -EINVAL;
4099	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4100		     sqe->splice_fd_in))
4101		return -EINVAL;
4102
4103	req->sync.flags = READ_ONCE(sqe->fsync_flags);
4104	if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC))
4105		return -EINVAL;
4106
4107	req->sync.off = READ_ONCE(sqe->off);
4108	req->sync.len = READ_ONCE(sqe->len);
4109	return 0;
4110}
4111
4112static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
4113{
4114	loff_t end = req->sync.off + req->sync.len;
4115	int ret;
4116
4117	/* fsync always requires a blocking context */
4118	if (issue_flags & IO_URING_F_NONBLOCK)
4119		return -EAGAIN;
4120
4121	ret = vfs_fsync_range(req->file, req->sync.off,
4122				end > 0 ? end : LLONG_MAX,
4123				req->sync.flags & IORING_FSYNC_DATASYNC);
4124	if (ret < 0)
4125		req_set_fail(req);
4126	io_req_complete(req, ret);
4127	return 0;
4128}
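/*
 * Illustrative sketch, not part of the build: IORING_OP_FSYNC as read by
 * io_fsync_prep() above.  off/len select a byte range; leaving both at 0
 * makes the end LLONG_MAX in io_fsync(), i.e. the whole file is synced.
 *
 *	sqe->opcode	 = IORING_OP_FSYNC;
 *	sqe->fd		 = fd;
 *	sqe->off	 = 0;				// req->sync.off
 *	sqe->len	 = 0;				// req->sync.len
 *	sqe->fsync_flags = IORING_FSYNC_DATASYNC;	// or 0 for a full fsync
 */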
4129
4130static int io_fallocate_prep(struct io_kiocb *req,
4131			     const struct io_uring_sqe *sqe)
4132{
4133	if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
4134	    sqe->splice_fd_in)
4135		return -EINVAL;
4136	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4137		return -EINVAL;
4138
4139	req->sync.off = READ_ONCE(sqe->off);
4140	req->sync.len = READ_ONCE(sqe->addr);
4141	req->sync.mode = READ_ONCE(sqe->len);
4142	return 0;
4143}
4144
4145static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
4146{
4147	int ret;
4148
4149	/* fallocate always requires a blocking context */
4150	if (issue_flags & IO_URING_F_NONBLOCK)
4151		return -EAGAIN;
4152	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
4153				req->sync.len);
4154	if (ret < 0)
4155		req_set_fail(req);
4156	else
4157		fsnotify_modify(req->file);
4158	io_req_complete(req, ret);
4159	return 0;
4160}
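/*
 * Illustrative sketch, not part of the build: the IORING_OP_FALLOCATE
 * mapping is easy to get wrong because the length travels in sqe->addr
 * and the mode in sqe->len, as io_fallocate_prep() above shows.
 *
 *	sqe->opcode = IORING_OP_FALLOCATE;
 *	sqe->fd	    = fd;
 *	sqe->off    = offset;			// req->sync.off
 *	sqe->addr   = nbytes;			// req->sync.len
 *	sqe->len    = FALLOC_FL_KEEP_SIZE;	// req->sync.mode
 */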
4161
4162static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4163{
4164	const char __user *fname;
4165	int ret;
4166
4167	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4168		return -EINVAL;
4169	if (unlikely(sqe->ioprio || sqe->buf_index))
4170		return -EINVAL;
4171	if (unlikely(req->flags & REQ_F_FIXED_FILE))
4172		return -EBADF;
4173
4174	/* open.how should be already initialised */
4175	if (!(req->open.how.flags & O_PATH) && force_o_largefile())
4176		req->open.how.flags |= O_LARGEFILE;
4177
4178	req->open.dfd = READ_ONCE(sqe->fd);
4179	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
4180	req->open.filename = getname(fname);
4181	if (IS_ERR(req->open.filename)) {
4182		ret = PTR_ERR(req->open.filename);
4183		req->open.filename = NULL;
4184		return ret;
4185	}
4186
4187	req->open.file_slot = READ_ONCE(sqe->file_index);
4188	if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC))
4189		return -EINVAL;
4190
4191	req->open.nofile = rlimit(RLIMIT_NOFILE);
4192	req->flags |= REQ_F_NEED_CLEANUP;
4193	return 0;
4194}
4195
4196static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4197{
4198	u64 mode = READ_ONCE(sqe->len);
4199	u64 flags = READ_ONCE(sqe->open_flags);
4200
4201	req->open.how = build_open_how(flags, mode);
4202	return __io_openat_prep(req, sqe);
4203}
4204
4205static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4206{
4207	struct open_how __user *how;
4208	size_t len;
4209	int ret;
4210
4211	how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4212	len = READ_ONCE(sqe->len);
4213	if (len < OPEN_HOW_SIZE_VER0)
4214		return -EINVAL;
4215
4216	ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
4217					len);
4218	if (ret)
4219		return ret;
4220
4221	return __io_openat_prep(req, sqe);
4222}
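/*
 * Illustrative sketch, not part of the build: IORING_OP_OPENAT2 passes a
 * struct open_how by pointer and size rather than packing flags/mode into
 * the SQE, matching io_openat2_prep() above.
 *
 *	struct open_how how = { .flags = O_RDONLY, .resolve = RESOLVE_CACHED };
 *
 *	sqe->opcode = IORING_OP_OPENAT2;
 *	sqe->fd	    = dfd;			// req->open.dfd
 *	sqe->addr   = (u64)(uintptr_t)pathname;	// req->open.filename
 *	sqe->addr2  = (u64)(uintptr_t)&how;	// copy_struct_from_user() source
 *	sqe->len    = sizeof(how);		// must be >= OPEN_HOW_SIZE_VER0
 */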
4223
4224static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
4225{
4226	struct open_flags op;
4227	struct file *file;
4228	bool resolve_nonblock, nonblock_set;
4229	bool fixed = !!req->open.file_slot;
4230	int ret;
4231
4232	ret = build_open_flags(&req->open.how, &op);
4233	if (ret)
4234		goto err;
4235	nonblock_set = op.open_flag & O_NONBLOCK;
4236	resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED;
4237	if (issue_flags & IO_URING_F_NONBLOCK) {
4238		/*
4239		 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
4240		 * it'll always return -EAGAIN. Note that we test for __O_TMPFILE
4241		 * because O_TMPFILE includes O_DIRECTORY, which isn't a flag
4242		 * we need to force async for.
4243		 */
4244		if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
4245			return -EAGAIN;
4246		op.lookup_flags |= LOOKUP_CACHED;
4247		op.open_flag |= O_NONBLOCK;
4248	}
4249
4250	if (!fixed) {
4251		ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile);
4252		if (ret < 0)
4253			goto err;
4254	}
4255
4256	file = do_filp_open(req->open.dfd, req->open.filename, &op);
4257	if (IS_ERR(file)) {
4258		/*
4259		 * We could hang on to this 'fd' on retrying, but seems like
4260		 * marginal gain for something that is now known to be a slower
4261		 * path. So just put it, and we'll get a new one when we retry.
4262		 */
4263		if (!fixed)
4264			put_unused_fd(ret);
4265
4266		ret = PTR_ERR(file);
4267		/* only retry if RESOLVE_CACHED wasn't already set by application */
4268		if (ret == -EAGAIN &&
4269		    (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK)))
4270			return -EAGAIN;
4271		goto err;
4272	}
4273
4274	if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
4275		file->f_flags &= ~O_NONBLOCK;
4276	fsnotify_open(file);
4277
4278	if (!fixed)
4279		fd_install(ret, file);
4280	else
4281		ret = io_install_fixed_file(req, file, issue_flags,
4282					    req->open.file_slot - 1);
4283err:
4284	putname(req->open.filename);
4285	req->flags &= ~REQ_F_NEED_CLEANUP;
4286	if (ret < 0)
4287		req_set_fail(req);
4288	__io_req_complete(req, issue_flags, ret, 0);
4289	return 0;
4290}
4291
4292static int io_openat(struct io_kiocb *req, unsigned int issue_flags)
4293{
4294	return io_openat2(req, issue_flags);
4295}
4296
4297static int io_remove_buffers_prep(struct io_kiocb *req,
4298				  const struct io_uring_sqe *sqe)
4299{
4300	struct io_provide_buf *p = &req->pbuf;
4301	u64 tmp;
4302
4303	if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
4304	    sqe->splice_fd_in)
4305		return -EINVAL;
4306
4307	tmp = READ_ONCE(sqe->fd);
4308	if (!tmp || tmp > USHRT_MAX)
4309		return -EINVAL;
4310
4311	memset(p, 0, sizeof(*p));
4312	p->nbufs = tmp;
4313	p->bgid = READ_ONCE(sqe->buf_group);
4314	return 0;
4315}
4316
4317static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
4318			       int bgid, unsigned nbufs)
4319{
4320	unsigned i = 0;
4321
4322	/* shouldn't happen */
4323	if (!nbufs)
4324		return 0;
4325
4326	/* the head kbuf is the list itself */
4327	while (!list_empty(&buf->list)) {
4328		struct io_buffer *nxt;
4329
4330		nxt = list_first_entry(&buf->list, struct io_buffer, list);
4331		list_del(&nxt->list);
4332		kfree(nxt);
4333		if (++i == nbufs)
4334			return i;
4335		cond_resched();
4336	}
4337	i++;
4338	kfree(buf);
4339	xa_erase(&ctx->io_buffers, bgid);
4340
4341	return i;
4342}
4343
4344static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
4345{
4346	struct io_provide_buf *p = &req->pbuf;
4347	struct io_ring_ctx *ctx = req->ctx;
4348	struct io_buffer *head;
4349	int ret = 0;
4350	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4351
4352	io_ring_submit_lock(ctx, !force_nonblock);
4353
4354	lockdep_assert_held(&ctx->uring_lock);
4355
4356	ret = -ENOENT;
4357	head = xa_load(&ctx->io_buffers, p->bgid);
4358	if (head)
4359		ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
4360	if (ret < 0)
4361		req_set_fail(req);
4362
4363	/* complete before unlock, IOPOLL may need the lock */
4364	__io_req_complete(req, issue_flags, ret, 0);
4365	io_ring_submit_unlock(ctx, !force_nonblock);
4366	return 0;
4367}
4368
4369static int io_provide_buffers_prep(struct io_kiocb *req,
4370				   const struct io_uring_sqe *sqe)
4371{
4372	unsigned long size, tmp_check;
4373	struct io_provide_buf *p = &req->pbuf;
4374	u64 tmp;
4375
4376	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
4377		return -EINVAL;
4378
4379	tmp = READ_ONCE(sqe->fd);
4380	if (!tmp || tmp > USHRT_MAX)
4381		return -E2BIG;
4382	p->nbufs = tmp;
4383	p->addr = READ_ONCE(sqe->addr);
4384	p->len = READ_ONCE(sqe->len);
4385
4386	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
4387				&size))
4388		return -EOVERFLOW;
4389	if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
4390		return -EOVERFLOW;
4391
4392	size = (unsigned long)p->len * p->nbufs;
4393	if (!access_ok(u64_to_user_ptr(p->addr), size))
4394		return -EFAULT;
4395
4396	p->bgid = READ_ONCE(sqe->buf_group);
4397	tmp = READ_ONCE(sqe->off);
4398	if (tmp > USHRT_MAX)
4399		return -E2BIG;
4400	p->bid = tmp;
4401	return 0;
4402}
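/*
 * Illustrative sketch, not part of the build: IORING_OP_PROVIDE_BUFFERS
 * registers nbufs buffers of len bytes each, carved out of one contiguous
 * user allocation, under buffer group bgid with buffer IDs starting at
 * bid (see the mapping in io_provide_buffers_prep() above and the carving
 * loop in io_add_buffers() below).
 *
 *	sqe->opcode    = IORING_OP_PROVIDE_BUFFERS;
 *	sqe->fd	       = nbufs;			// p->nbufs, 1..USHRT_MAX
 *	sqe->addr      = (u64)(uintptr_t)base;	// start of the region
 *	sqe->len       = buf_len;		// size of each buffer
 *	sqe->buf_group = bgid;			// p->bgid
 *	sqe->off       = first_bid;		// p->bid, <= USHRT_MAX
 */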
4403
4404static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
4405{
4406	struct io_buffer *buf;
4407	u64 addr = pbuf->addr;
4408	int i, bid = pbuf->bid;
4409
4410	for (i = 0; i < pbuf->nbufs; i++) {
4411		buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
4412		if (!buf)
4413			break;
4414
4415		buf->addr = addr;
4416		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
4417		buf->bid = bid;
4418		addr += pbuf->len;
4419		bid++;
4420		if (!*head) {
4421			INIT_LIST_HEAD(&buf->list);
4422			*head = buf;
4423		} else {
4424			list_add_tail(&buf->list, &(*head)->list);
4425		}
4426		cond_resched();
4427	}
4428
4429	return i ? i : -ENOMEM;
4430}
4431
4432static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
4433{
4434	struct io_provide_buf *p = &req->pbuf;
4435	struct io_ring_ctx *ctx = req->ctx;
4436	struct io_buffer *head, *list;
4437	int ret = 0;
4438	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4439
4440	io_ring_submit_lock(ctx, !force_nonblock);
4441
4442	lockdep_assert_held(&ctx->uring_lock);
4443
4444	list = head = xa_load(&ctx->io_buffers, p->bgid);
4445
4446	ret = io_add_buffers(p, &head);
4447	if (ret >= 0 && !list) {
4448		ret = xa_insert(&ctx->io_buffers, p->bgid, head,
4449				GFP_KERNEL_ACCOUNT);
4450		if (ret < 0)
4451			__io_remove_buffers(ctx, head, p->bgid, -1U);
4452	}
4453	if (ret < 0)
4454		req_set_fail(req);
4455	/* complete before unlock, IOPOLL may need the lock */
4456	__io_req_complete(req, issue_flags, ret, 0);
4457	io_ring_submit_unlock(ctx, !force_nonblock);
4458	return 0;
4459}
4460
4461static int io_epoll_ctl_prep(struct io_kiocb *req,
4462			     const struct io_uring_sqe *sqe)
4463{
4464#if defined(CONFIG_EPOLL)
4465	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4466		return -EINVAL;
4467	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4468		return -EINVAL;
4469
4470	req->epoll.epfd = READ_ONCE(sqe->fd);
4471	req->epoll.op = READ_ONCE(sqe->len);
4472	req->epoll.fd = READ_ONCE(sqe->off);
4473
4474	if (ep_op_has_event(req->epoll.op)) {
4475		struct epoll_event __user *ev;
4476
4477		ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
4478		if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
4479			return -EFAULT;
4480	}
4481
4482	return 0;
4483#else
4484	return -EOPNOTSUPP;
4485#endif
4486}
4487
4488static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
4489{
4490#if defined(CONFIG_EPOLL)
4491	struct io_epoll *ie = &req->epoll;
4492	int ret;
4493	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
4494
4495	ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
4496	if (force_nonblock && ret == -EAGAIN)
4497		return -EAGAIN;
4498
4499	if (ret < 0)
4500		req_set_fail(req);
4501	__io_req_complete(req, issue_flags, ret, 0);
4502	return 0;
4503#else
4504	return -EOPNOTSUPP;
4505#endif
4506}
4507
4508static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4509{
4510#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4511	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
4512		return -EINVAL;
4513	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4514		return -EINVAL;
4515
4516	req->madvise.addr = READ_ONCE(sqe->addr);
4517	req->madvise.len = READ_ONCE(sqe->len);
4518	req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
4519	return 0;
4520#else
4521	return -EOPNOTSUPP;
4522#endif
4523}
4524
4525static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
4526{
4527#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
4528	struct io_madvise *ma = &req->madvise;
4529	int ret;
4530
4531	if (issue_flags & IO_URING_F_NONBLOCK)
4532		return -EAGAIN;
4533
4534	ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
4535	if (ret < 0)
4536		req_set_fail(req);
4537	io_req_complete(req, ret);
4538	return 0;
4539#else
4540	return -EOPNOTSUPP;
4541#endif
4542}
4543
4544static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4545{
4546	if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
4547		return -EINVAL;
4548	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4549		return -EINVAL;
4550
4551	req->fadvise.offset = READ_ONCE(sqe->off);
4552	req->fadvise.len = READ_ONCE(sqe->len);
4553	req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
4554	return 0;
4555}
4556
4557static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
4558{
4559	struct io_fadvise *fa = &req->fadvise;
4560	int ret;
4561
4562	if (issue_flags & IO_URING_F_NONBLOCK) {
4563		switch (fa->advice) {
4564		case POSIX_FADV_NORMAL:
4565		case POSIX_FADV_RANDOM:
4566		case POSIX_FADV_SEQUENTIAL:
4567			break;
4568		default:
4569			return -EAGAIN;
4570		}
4571	}
4572
4573	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
4574	if (ret < 0)
4575		req_set_fail(req);
4576	__io_req_complete(req, issue_flags, ret, 0);
4577	return 0;
4578}
4579
4580static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4581{
4582	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4583		return -EINVAL;
4584	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
4585		return -EINVAL;
4586	if (req->flags & REQ_F_FIXED_FILE)
4587		return -EBADF;
4588
4589	req->statx.dfd = READ_ONCE(sqe->fd);
4590	req->statx.mask = READ_ONCE(sqe->len);
4591	req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
4592	req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
4593	req->statx.flags = READ_ONCE(sqe->statx_flags);
4594
4595	return 0;
4596}
4597
4598static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
4599{
4600	struct io_statx *ctx = &req->statx;
4601	int ret;
4602
4603	if (issue_flags & IO_URING_F_NONBLOCK)
4604		return -EAGAIN;
4605
4606	ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
4607		       ctx->buffer);
4608
4609	if (ret < 0)
4610		req_set_fail(req);
4611	io_req_complete(req, ret);
4612	return 0;
4613}
4614
4615static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4616{
4617	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4618		return -EINVAL;
4619	if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
4620	    sqe->rw_flags || sqe->buf_index)
4621		return -EINVAL;
4622	if (req->flags & REQ_F_FIXED_FILE)
4623		return -EBADF;
4624
4625	req->close.fd = READ_ONCE(sqe->fd);
4626	req->close.file_slot = READ_ONCE(sqe->file_index);
4627	if (req->close.file_slot && req->close.fd)
4628		return -EINVAL;
4629
4630	return 0;
4631}
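/*
 * Illustrative sketch, not part of the build: IORING_OP_CLOSE closes
 * either a regular fd (sqe->fd) or, when sqe->file_index is non-zero, a
 * fixed-file slot via io_close_fixed(); io_close_prep() above rejects
 * setting both at once.
 *
 *	sqe->opcode	= IORING_OP_CLOSE;
 *	sqe->fd		= fd;	// regular close
 *	sqe->file_index	= 0;	// non-zero selects a fixed-file slot instead
 */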
4632
4633static int io_close(struct io_kiocb *req, unsigned int issue_flags)
4634{
4635	struct files_struct *files = current->files;
4636	struct io_close *close = &req->close;
4637	struct fdtable *fdt;
4638	struct file *file = NULL;
4639	int ret = -EBADF;
4640
4641	if (req->close.file_slot) {
4642		ret = io_close_fixed(req, issue_flags);
4643		goto err;
4644	}
4645
4646	spin_lock(&files->file_lock);
4647	fdt = files_fdtable(files);
4648	if (close->fd >= fdt->max_fds) {
4649		spin_unlock(&files->file_lock);
4650		goto err;
4651	}
4652	file = fdt->fd[close->fd];
4653	if (!file || file->f_op == &io_uring_fops) {
4654		spin_unlock(&files->file_lock);
4655		file = NULL;
4656		goto err;
4657	}
4658
4659	/* if the file has a flush method, be safe and punt to async */
4660	if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) {
4661		spin_unlock(&files->file_lock);
4662		return -EAGAIN;
4663	}
4664
4665	ret = __close_fd_get_file(close->fd, &file);
4666	spin_unlock(&files->file_lock);
4667	if (ret < 0) {
4668		if (ret == -ENOENT)
4669			ret = -EBADF;
4670		goto err;
4671	}
4672
4673	/* No ->flush() or already async, safely close from here */
4674	ret = filp_close(file, current->files);
4675err:
4676	if (ret < 0)
4677		req_set_fail(req);
4678	if (file)
4679		fput(file);
4680	__io_req_complete(req, issue_flags, ret, 0);
4681	return 0;
4682}
4683
4684static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4685{
4686	struct io_ring_ctx *ctx = req->ctx;
4687
4688	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
4689		return -EINVAL;
4690	if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
4691		     sqe->splice_fd_in))
4692		return -EINVAL;
4693
4694	req->sync.off = READ_ONCE(sqe->off);
4695	req->sync.len = READ_ONCE(sqe->len);
4696	req->sync.flags = READ_ONCE(sqe->sync_range_flags);
4697	return 0;
4698}
4699
4700static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
4701{
4702	int ret;
4703
4704	/* sync_file_range always requires a blocking context */
4705	if (issue_flags & IO_URING_F_NONBLOCK)
4706		return -EAGAIN;
4707
4708	ret = sync_file_range(req->file, req->sync.off, req->sync.len,
4709				req->sync.flags);
4710	if (ret < 0)
4711		req_set_fail(req);
4712	io_req_complete(req, ret);
4713	return 0;
4714}
4715
4716#if defined(CONFIG_NET)
4717static bool io_net_retry(struct socket *sock, int flags)
4718{
4719	if (!(flags & MSG_WAITALL))
4720		return false;
4721	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
4722}
4723
4724static int io_setup_async_msg(struct io_kiocb *req,
4725			      struct io_async_msghdr *kmsg)
4726{
4727	struct io_async_msghdr *async_msg = req->async_data;
4728
4729	if (async_msg)
4730		return -EAGAIN;
4731	if (io_alloc_async_data(req)) {
4732		kfree(kmsg->free_iov);
4733		return -ENOMEM;
4734	}
4735	async_msg = req->async_data;
4736	req->flags |= REQ_F_NEED_CLEANUP;
4737	memcpy(async_msg, kmsg, sizeof(*kmsg));
4738	if (async_msg->msg.msg_name)
4739		async_msg->msg.msg_name = &async_msg->addr;
4740	/* if we're using fast_iov, set it to the new one */
4741	if (!kmsg->free_iov) {
4742		size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
4743		async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
4744	}
4745
4746	return -EAGAIN;
4747}
4748
4749static int io_sendmsg_copy_hdr(struct io_kiocb *req,
4750			       struct io_async_msghdr *iomsg)
4751{
4752	struct io_sr_msg *sr = &req->sr_msg;
4753	int ret;
4754
4755	iomsg->msg.msg_name = &iomsg->addr;
4756	iomsg->free_iov = iomsg->fast_iov;
4757	ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
4758				   req->sr_msg.msg_flags, &iomsg->free_iov);
4759	/* save msg_control as sys_sendmsg() overwrites it */
4760	sr->msg_control = iomsg->msg.msg_control;
4761	return ret;
4762}
4763
4764static int io_sendmsg_prep_async(struct io_kiocb *req)
4765{
4766	int ret;
4767
4768	ret = io_sendmsg_copy_hdr(req, req->async_data);
4769	if (!ret)
4770		req->flags |= REQ_F_NEED_CLEANUP;
4771	return ret;
4772}
4773
4774static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
4775{
4776	struct io_sr_msg *sr = &req->sr_msg;
4777
4778	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
4779		return -EINVAL;
4782	if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio))
4783		return -EINVAL;
4784
4785	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
4786	sr->len = READ_ONCE(sqe->len);
4787	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
4788	if (sr->msg_flags & MSG_DONTWAIT)
4789		req->flags |= REQ_F_NOWAIT;
4790
4791#ifdef CONFIG_COMPAT
4792	if (req->ctx->compat)
4793		sr->msg_flags |= MSG_CMSG_COMPAT;
4794#endif
4795	sr->done_io = 0;
4796	return 0;
4797}
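/*
 * Illustrative sketch, not part of the build: IORING_OP_SENDMSG takes a
 * user struct msghdr via sqe->addr, as read by io_sendmsg_prep() above.
 * MSG_DONTWAIT in sqe->msg_flags additionally marks the request
 * REQ_F_NOWAIT, and MSG_NOSIGNAL is OR'ed in unconditionally.
 *
 *	sqe->opcode    = IORING_OP_SENDMSG;
 *	sqe->fd	       = sockfd;
 *	sqe->addr      = (u64)(uintptr_t)&msg;	// sr->umsg
 *	sqe->msg_flags = 0;			// e.g. MSG_WAITALL
 */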
4798
4799static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
4800{
4801	struct io_async_msghdr iomsg, *kmsg;
4802	struct io_sr_msg *sr = &req->sr_msg;
4803	struct socket *sock;
4804	unsigned flags;
4805	int min_ret = 0;
4806	int ret;
4807
4808	sock = sock_from_file(req->file, &ret);
4809	if (unlikely(!sock))
4810		return ret;
4811
4812	kmsg = req->async_data;
4813	if (!kmsg) {
4814		ret = io_sendmsg_copy_hdr(req, &iomsg);
4815		if (ret)
4816			return ret;
4817		kmsg = &iomsg;
4818	} else {
4819		kmsg->msg.msg_control = sr->msg_control;
4820	}
4821
4822	flags = req->sr_msg.msg_flags;
4823	if (issue_flags & IO_URING_F_NONBLOCK)
4824		flags |= MSG_DONTWAIT;
4825	if (flags & MSG_WAITALL)
4826		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
4827
4828	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
4829
4830	if (ret < min_ret) {
4831		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
4832			return io_setup_async_msg(req, kmsg);
4833		if (ret == -ERESTARTSYS)
4834			ret = -EINTR;
4835		if (ret > 0 && io_net_retry(sock, flags)) {
4836			sr->done_io += ret;
4837			req->flags |= REQ_F_PARTIAL_IO;
4838			return io_setup_async_msg(req, kmsg);
4839		}
4840		req_set_fail(req);
4841	}
4842	/* fast path, check for non-NULL to avoid function call */
4843	if (kmsg->free_iov)
4844		kfree(kmsg->free_iov);
4845	req->flags &= ~REQ_F_NEED_CLEANUP;
4846	if (ret >= 0)
4847		ret += sr->done_io;
4848	else if (sr->done_io)
4849		ret = sr->done_io;
4850	__io_req_complete(req, issue_flags, ret, 0);
4851	return 0;
4852}
4853
4854static int io_send(struct io_kiocb *req, unsigned int issue_flags)
4855{
4856	struct io_sr_msg *sr = &req->sr_msg;
4857	struct msghdr msg;
4858	struct iovec iov;
4859	struct socket *sock;
4860	unsigned flags;
4861	int min_ret = 0;
4862	int ret;
4863
4864	sock = sock_from_file(req->file, &ret);
4865	if (unlikely(!sock))
4866		return ret;
4867
4868	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
4869	if (unlikely(ret))
4870		return ret;
4871
4872	msg.msg_name = NULL;
4873	msg.msg_control = NULL;
4874	msg.msg_controllen = 0;
4875	msg.msg_namelen = 0;
4876
4877	flags = req->sr_msg.msg_flags;
4878	if (issue_flags & IO_URING_F_NONBLOCK)
4879		flags |= MSG_DONTWAIT;
4880	if (flags & MSG_WAITALL)
4881		min_ret = iov_iter_count(&msg.msg_iter);
4882
4883	msg.msg_flags = flags;
4884	ret = sock_sendmsg(sock, &msg);
4885	if (ret < min_ret) {
4886		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
4887			return -EAGAIN;
4888		if (ret == -ERESTARTSYS)
4889			ret = -EINTR;
4890		if (ret > 0 && io_net_retry(sock, flags)) {
4891			sr->len -= ret;
4892			sr->buf += ret;
4893			sr->done_io += ret;
4894			req->flags |= REQ_F_PARTIAL_IO;
4895			return -EAGAIN;
4896		}
4897		req_set_fail(req);
4898	}
4899	if (ret >= 0)
4900		ret += sr->done_io;
4901	else if (sr->done_io)
4902		ret = sr->done_io;
4903	__io_req_complete(req, issue_flags, ret, 0);
4904	return 0;
4905}
4906
4907static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
4908				 struct io_async_msghdr *iomsg)
4909{
4910	struct io_sr_msg *sr = &req->sr_msg;
4911	struct iovec __user *uiov;
4912	size_t iov_len;
4913	int ret;
4914
4915	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
4916					&iomsg->uaddr, &uiov, &iov_len);
4917	if (ret)
4918		return ret;
4919
4920	if (req->flags & REQ_F_BUFFER_SELECT) {
4921		if (iov_len > 1)
4922			return -EINVAL;
4923		if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov)))
4924			return -EFAULT;
4925		sr->len = iomsg->fast_iov[0].iov_len;
4926		iomsg->free_iov = NULL;
4927	} else {
4928		iomsg->free_iov = iomsg->fast_iov;
4929		ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
4930				     &iomsg->free_iov, &iomsg->msg.msg_iter,
4931				     false);
4932		if (ret > 0)
4933			ret = 0;
4934	}
4935
4936	return ret;
4937}
4938
4939#ifdef CONFIG_COMPAT
4940static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
4941					struct io_async_msghdr *iomsg)
4942{
4943	struct io_sr_msg *sr = &req->sr_msg;
4944	struct compat_iovec __user *uiov;
4945	compat_uptr_t ptr;
4946	compat_size_t len;
4947	int ret;
4948
4949	ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr,
4950				  &ptr, &len);
4951	if (ret)
4952		return ret;
4953
4954	uiov = compat_ptr(ptr);
4955	if (req->flags & REQ_F_BUFFER_SELECT) {
4956		compat_ssize_t clen;
4957
4958		if (len > 1)
4959			return -EINVAL;
4960		if (!access_ok(uiov, sizeof(*uiov)))
4961			return -EFAULT;
4962		if (__get_user(clen, &uiov->iov_len))
4963			return -EFAULT;
4964		if (clen < 0)
4965			return -EINVAL;
4966		sr->len = clen;
4967		iomsg->free_iov = NULL;
4968	} else {
4969		iomsg->free_iov = iomsg->fast_iov;
4970		ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
4971				   UIO_FASTIOV, &iomsg->free_iov,
4972				   &iomsg->msg.msg_iter, true);
4973		if (ret < 0)
4974			return ret;
4975	}
4976
4977	return 0;
4978}
4979#endif
4980
4981static int io_recvmsg_copy_hdr(struct io_kiocb *req,
4982			       struct io_async_msghdr *iomsg)
4983{
4984	iomsg->msg.msg_name = &iomsg->addr;
4985
4986#ifdef CONFIG_COMPAT
4987	if (req->ctx->compat)
4988		return __io_compat_recvmsg_copy_hdr(req, iomsg);
4989#endif
4990
4991	return __io_recvmsg_copy_hdr(req, iomsg);
4992}
4993
4994static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
4995					       bool needs_lock)
4996{
4997	struct io_sr_msg *sr = &req->sr_msg;
4998	struct io_buffer *kbuf;
4999
5000	kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
5001	if (IS_ERR(kbuf))
5002		return kbuf;
5003
5004	sr->kbuf = kbuf;
5005	req->flags |= REQ_F_BUFFER_SELECTED;
5006	return kbuf;
5007}
5008
5009static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
5010{
5011	return io_put_kbuf(req, req->sr_msg.kbuf);
5012}
5013
5014static int io_recvmsg_prep_async(struct io_kiocb *req)
5015{
5016	int ret;
5017
5018	ret = io_recvmsg_copy_hdr(req, req->async_data);
5019	if (!ret)
5020		req->flags |= REQ_F_NEED_CLEANUP;
5021	return ret;
5022}
5023
5024static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5025{
5026	struct io_sr_msg *sr = &req->sr_msg;
5027
5028	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5029		return -EINVAL;
5032	if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio))
5033		return -EINVAL;
5034
5035	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
5036	sr->len = READ_ONCE(sqe->len);
5037	sr->bgid = READ_ONCE(sqe->buf_group);
5038	sr->msg_flags = READ_ONCE(sqe->msg_flags);
5039	if (sr->msg_flags & MSG_DONTWAIT)
5040		req->flags |= REQ_F_NOWAIT;
5041
5042#ifdef CONFIG_COMPAT
5043	if (req->ctx->compat)
5044		sr->msg_flags |= MSG_CMSG_COMPAT;
5045#endif
5046	sr->done_io = 0;
5047	return 0;
5048}
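/*
 * Illustrative sketch, not part of the build: IORING_OP_RECVMSG with
 * provided buffers.  Setting IOSQE_BUFFER_SELECT together with
 * sqe->buf_group makes io_recvmsg() below pick a buffer from that group
 * at issue time (via io_recv_buffer_select() above) instead of using a
 * caller-supplied iovec; the chosen buffer ID is reported back through
 * the CQE flags (the cflags passed to __io_req_complete()).
 *
 *	sqe->opcode	= IORING_OP_RECVMSG;
 *	sqe->fd		= sockfd;
 *	sqe->addr	= (u64)(uintptr_t)&msg;	// sr->umsg
 *	sqe->buf_group	= bgid;			// sr->bgid
 *	sqe->flags     |= IOSQE_BUFFER_SELECT;	// sets REQ_F_BUFFER_SELECT
 */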
5049
5050static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
5051{
5052	struct io_async_msghdr iomsg, *kmsg;
5053	struct io_sr_msg *sr = &req->sr_msg;
5054	struct socket *sock;
5055	struct io_buffer *kbuf;
5056	unsigned flags;
5057	int min_ret = 0;
5058	int ret, cflags = 0;
5059	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5060
5061	sock = sock_from_file(req->file, &ret);
5062	if (unlikely(!sock))
5063		return ret;
5064
5065	kmsg = req->async_data;
5066	if (!kmsg) {
5067		ret = io_recvmsg_copy_hdr(req, &iomsg);
5068		if (ret)
5069			return ret;
5070		kmsg = &iomsg;
5071	}
5072
5073	if (req->flags & REQ_F_BUFFER_SELECT) {
5074		kbuf = io_recv_buffer_select(req, !force_nonblock);
5075		if (IS_ERR(kbuf))
5076			return PTR_ERR(kbuf);
5077		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
5078		kmsg->fast_iov[0].iov_len = req->sr_msg.len;
5079		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
5080				1, req->sr_msg.len);
5081	}
5082
5083	flags = req->sr_msg.msg_flags;
5084	if (force_nonblock)
5085		flags |= MSG_DONTWAIT;
5086	if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
5087		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
5088
5089	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
5090					kmsg->uaddr, flags);
5091	if (ret < min_ret) {
5092		if (ret == -EAGAIN && force_nonblock)
5093			return io_setup_async_msg(req, kmsg);
5094		if (ret == -ERESTARTSYS)
5095			ret = -EINTR;
5096		if (ret > 0 && io_net_retry(sock, flags)) {
5097			kmsg->msg.msg_controllen = 0;
5098			kmsg->msg.msg_control = NULL;
5099			sr->done_io += ret;
5100			req->flags |= REQ_F_PARTIAL_IO;
5101			return io_setup_async_msg(req, kmsg);
5102		}
5103		req_set_fail(req);
5104	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5105		req_set_fail(req);
5106	}
5107
5108	if (req->flags & REQ_F_BUFFER_SELECTED)
5109		cflags = io_put_recv_kbuf(req);
5110	/* fast path, check for non-NULL to avoid function call */
5111	if (kmsg->free_iov)
5112		kfree(kmsg->free_iov);
5113	req->flags &= ~REQ_F_NEED_CLEANUP;
5114	if (ret >= 0)
5115		ret += sr->done_io;
5116	else if (sr->done_io)
5117		ret = sr->done_io;
5118	__io_req_complete(req, issue_flags, ret, cflags);
5119	return 0;
5120}
5121
5122static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
5123{
5124	struct io_buffer *kbuf;
5125	struct io_sr_msg *sr = &req->sr_msg;
5126	struct msghdr msg;
5127	void __user *buf = sr->buf;
5128	struct socket *sock;
5129	struct iovec iov;
5130	unsigned flags;
5131	int min_ret = 0;
5132	int ret, cflags = 0;
5133	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5134
5135	sock = sock_from_file(req->file, &ret);
5136	if (unlikely(!sock))
5137		return ret;
5138
5139	if (req->flags & REQ_F_BUFFER_SELECT) {
5140		kbuf = io_recv_buffer_select(req, !force_nonblock);
5141		if (IS_ERR(kbuf))
5142			return PTR_ERR(kbuf);
5143		buf = u64_to_user_ptr(kbuf->addr);
5144	}
5145
5146	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
5147	if (unlikely(ret))
5148		goto out_free;
5149
5150	msg.msg_name = NULL;
5151	msg.msg_control = NULL;
5152	msg.msg_controllen = 0;
5153	msg.msg_namelen = 0;
5154	msg.msg_iocb = NULL;
5155	msg.msg_flags = 0;
5156
5157	flags = req->sr_msg.msg_flags;
5158	if (force_nonblock)
5159		flags |= MSG_DONTWAIT;
5160	if (flags & MSG_WAITALL)
5161		min_ret = iov_iter_count(&msg.msg_iter);
5162
5163	ret = sock_recvmsg(sock, &msg, flags);
5164	if (ret < min_ret) {
5165		if (ret == -EAGAIN && force_nonblock)
5166			return -EAGAIN;
5167		if (ret == -ERESTARTSYS)
5168			ret = -EINTR;
5169		if (ret > 0 && io_net_retry(sock, flags)) {
5170			sr->len -= ret;
5171			sr->buf += ret;
5172			sr->done_io += ret;
5173			req->flags |= REQ_F_PARTIAL_IO;
5174			return -EAGAIN;
5175		}
5176		req_set_fail(req);
5177	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
5178out_free:
5179		req_set_fail(req);
5180	}
5181	if (req->flags & REQ_F_BUFFER_SELECTED)
5182		cflags = io_put_recv_kbuf(req);
5183	if (ret >= 0)
5184		ret += sr->done_io;
5185	else if (sr->done_io)
5186		ret = sr->done_io;
5187	__io_req_complete(req, issue_flags, ret, cflags);
5188	return 0;
5189}
5190
5191static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5192{
5193	struct io_accept *accept = &req->accept;
5194
5195	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5196		return -EINVAL;
5197	if (sqe->ioprio || sqe->len || sqe->buf_index)
5198		return -EINVAL;
5199
5200	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5201	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
5202	accept->flags = READ_ONCE(sqe->accept_flags);
5203	accept->nofile = rlimit(RLIMIT_NOFILE);
5204
5205	accept->file_slot = READ_ONCE(sqe->file_index);
5206	if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
5207		return -EINVAL;
5208	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
5209		return -EINVAL;
5210	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
5211		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
5212	return 0;
5213}
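/*
 * Illustrative sketch, not part of the build: IORING_OP_ACCEPT, including
 * the direct-descriptor form.  A non-zero sqe->file_index installs the
 * accepted file into fixed-file slot (file_index - 1) instead of
 * allocating a normal fd, which is why io_accept_prep() above rejects
 * SOCK_CLOEXEC in that case.
 *
 *	sqe->opcode	  = IORING_OP_ACCEPT;
 *	sqe->fd		  = listen_fd;
 *	sqe->addr	  = (u64)(uintptr_t)&addr;	// may be 0 (NULL)
 *	sqe->addr2	  = (u64)(uintptr_t)&addrlen;	// may be 0 (NULL)
 *	sqe->accept_flags = SOCK_NONBLOCK;		// and/or SOCK_CLOEXEC
 *	sqe->file_index	  = 0;				// or slot + 1 for direct accept
 */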
5214
5215static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
5216{
5217	struct io_accept *accept = &req->accept;
5218	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5219	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
5220	bool fixed = !!accept->file_slot;
5221	struct file *file;
5222	int ret, fd;
5223
5224	if (!fixed) {
5225		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
5226		if (unlikely(fd < 0))
5227			return fd;
5228	}
5229	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
5230			 accept->flags);
5231
5232	if (IS_ERR(file)) {
5233		if (!fixed)
5234			put_unused_fd(fd);
5235		ret = PTR_ERR(file);
5236		/* safe to retry */
5237		req->flags |= REQ_F_PARTIAL_IO;
5238		if (ret == -EAGAIN && force_nonblock)
5239			return -EAGAIN;
5240		if (ret == -ERESTARTSYS)
5241			ret = -EINTR;
5242		req_set_fail(req);
5243	} else if (!fixed) {
5244		fd_install(fd, file);
5245		ret = fd;
5246	} else {
5247		ret = io_install_fixed_file(req, file, issue_flags,
5248					    accept->file_slot - 1);
5249	}
5250	__io_req_complete(req, issue_flags, ret, 0);
5251	return 0;
5252}
5253
5254static int io_connect_prep_async(struct io_kiocb *req)
5255{
5256	struct io_async_connect *io = req->async_data;
5257	struct io_connect *conn = &req->connect;
5258
5259	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
5260}
5261
5262static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5263{
5264	struct io_connect *conn = &req->connect;
5265
5266	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5267		return -EINVAL;
5268	if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
5269	    sqe->splice_fd_in)
5270		return -EINVAL;
5271
5272	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
5273	conn->addr_len = READ_ONCE(sqe->addr2);
5274	return 0;
5275}
5276
5277static int io_connect(struct io_kiocb *req, unsigned int issue_flags)
5278{
5279	struct io_async_connect __io, *io;
5280	unsigned file_flags;
5281	int ret;
5282	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
5283
5284	if (req->async_data) {
5285		io = req->async_data;
5286	} else {
5287		ret = move_addr_to_kernel(req->connect.addr,
5288						req->connect.addr_len,
5289						&__io.address);
5290		if (ret)
5291			goto out;
5292		io = &__io;
5293	}
5294
5295	file_flags = force_nonblock ? O_NONBLOCK : 0;
5296
5297	ret = __sys_connect_file(req->file, &io->address,
5298					req->connect.addr_len, file_flags);
5299	if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
5300		if (req->async_data)
5301			return -EAGAIN;
5302		if (io_alloc_async_data(req)) {
5303			ret = -ENOMEM;
5304			goto out;
5305		}
5306		memcpy(req->async_data, &__io, sizeof(__io));
5307		return -EAGAIN;
5308	}
5309	if (ret == -ERESTARTSYS)
5310		ret = -EINTR;
5311out:
5312	if (ret < 0)
5313		req_set_fail(req);
5314	__io_req_complete(req, issue_flags, ret, 0);
5315	return 0;
5316}
5317#else /* !CONFIG_NET */
5318#define IO_NETOP_FN(op)							\
5319static int io_##op(struct io_kiocb *req, unsigned int issue_flags)	\
5320{									\
5321	return -EOPNOTSUPP;						\
5322}
5323
5324#define IO_NETOP_PREP(op)						\
5325IO_NETOP_FN(op)								\
5326static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \
5327{									\
5328	return -EOPNOTSUPP;						\
5329}									\
5330
5331#define IO_NETOP_PREP_ASYNC(op)						\
5332IO_NETOP_PREP(op)							\
5333static int io_##op##_prep_async(struct io_kiocb *req)			\
5334{									\
5335	return -EOPNOTSUPP;						\
5336}
5337
5338IO_NETOP_PREP_ASYNC(sendmsg);
5339IO_NETOP_PREP_ASYNC(recvmsg);
5340IO_NETOP_PREP_ASYNC(connect);
5341IO_NETOP_PREP(accept);
5342IO_NETOP_FN(send);
5343IO_NETOP_FN(recv);
5344#endif /* CONFIG_NET */
5345
5346struct io_poll_table {
5347	struct poll_table_struct pt;
5348	struct io_kiocb *req;
5349	int nr_entries;
5350	int error;
5351};
5352
5353#define IO_POLL_CANCEL_FLAG	BIT(31)
5354#define IO_POLL_RETRY_FLAG	BIT(30)
5355#define IO_POLL_REF_MASK	GENMASK(29, 0)
5356
5357/*
5358 * We usually have 1-2 refs taken; 128 is more than enough, and we want to
5359 * maximise the margin between this amount and the point where it overflows.
5360 */
5361#define IO_POLL_REF_BIAS       128
5362
5363static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
5364{
5365	int v;
5366
5367	/*
5368	 * poll_refs are already elevated and we don't have much hope for
5369	 * grabbing ownership. Instead of incrementing, set a retry flag
5370	 * to notify the loop that there might have been some change.
5371	 */
5372	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
5373	if (v & IO_POLL_REF_MASK)
5374		return false;
5375	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
5376}
5377
5378/*
5379 * If the refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, the request is
5380 * free and we can bump the count to acquire ownership. Modifying a request
5381 * while not owning it is disallowed; that prevents races when enqueueing
5382 * task_work and between arming poll and the wakeup side.
5383 */
5384static inline bool io_poll_get_ownership(struct io_kiocb *req)
5385{
5386	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
5387		return io_poll_get_ownership_slowpath(req);
5388	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
5389}
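/*
 * Illustrative sketch, not part of the build, of the ->poll_refs layout
 * the ownership helpers above operate on:
 *
 *	bit  31     IO_POLL_CANCEL_FLAG - request is being cancelled
 *	bit  30     IO_POLL_RETRY_FLAG  - set by a contended waker that could
 *					  not take ownership; makes the owner
 *					  redo vfs_poll() in io_poll_check_events()
 *	bits 29..0  reference count; whoever bumps it from 0 owns the request
 *		    until the count drops back to 0
 *
 *	// taking ownership, as io_poll_get_ownership() does on the fast path:
 *	if (!(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK))
 *		// we own the request and may queue task_work for it
 */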
5390
5391static void io_poll_mark_cancelled(struct io_kiocb *req)
5392{
5393	atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs);
5394}
5395
5396static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
5397{
5398	/* pure poll stashes this in ->async_data, poll driven retry elsewhere */
5399	if (req->opcode == IORING_OP_POLL_ADD)
5400		return req->async_data;
5401	return req->apoll->double_poll;
5402}
5403
5404static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
5405{
5406	if (req->opcode == IORING_OP_POLL_ADD)
5407		return &req->poll;
5408	return &req->apoll->poll;
5409}
5410
5411static void io_poll_req_insert(struct io_kiocb *req)
5412{
5413	struct io_ring_ctx *ctx = req->ctx;
5414	struct hlist_head *list;
5415
5416	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
5417	hlist_add_head(&req->hash_node, list);
5418}
5419
5420static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events,
5421			      wait_queue_func_t wake_func)
5422{
5423	poll->head = NULL;
5424#define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
5425	/* mask in events that we always want/need */
5426	poll->events = events | IO_POLL_UNMASK;
5427	INIT_LIST_HEAD(&poll->wait.entry);
5428	init_waitqueue_func_entry(&poll->wait, wake_func);
5429}
5430
5431static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
5432{
5433	struct wait_queue_head *head = smp_load_acquire(&poll->head);
5434
5435	if (head) {
5436		spin_lock_irq(&head->lock);
5437		list_del_init(&poll->wait.entry);
5438		poll->head = NULL;
5439		spin_unlock_irq(&head->lock);
5440	}
5441}
5442
5443static void io_poll_remove_entries(struct io_kiocb *req)
5444{
5445	struct io_poll_iocb *poll = io_poll_get_single(req);
5446	struct io_poll_iocb *poll_double = io_poll_get_double(req);
5447
5448	/*
5449	 * While we hold the waitqueue lock and the waitqueue is nonempty,
5450	 * wake_up_pollfree() will wait for us.  However, taking the waitqueue
5451	 * lock in the first place can race with the waitqueue being freed.
5452	 *
5453	 * We solve this as eventpoll does: by taking advantage of the fact that
5454	 * all users of wake_up_pollfree() will RCU-delay the actual free.  If
5455	 * we enter rcu_read_lock() and see that the pointer to the queue is
5456	 * non-NULL, we can then lock it without the memory being freed out from
5457	 * under us.
5458	 *
5459	 * Keep holding rcu_read_lock() as long as we hold the queue lock, in
5460	 * case the caller deletes the entry from the queue, leaving it empty.
5461	 * In that case, only RCU prevents the queue memory from being freed.
5462	 */
5463	rcu_read_lock();
5464	io_poll_remove_entry(poll);
5465	if (poll_double)
5466		io_poll_remove_entry(poll_double);
5467	rcu_read_unlock();
5468}
5469
5470/*
5471 * All poll tw should go through this. Checks for poll events, manages
5472 * references, does rewait, etc.
5473 *
5474 * Returns a negative error on failure; >0 when no further action is required
5475 * (either a spurious wakeup or a multishot CQE was served); 0 when we're done
5476 * with the request, in which case the mask is stored in req->result.
5477 */
5478static int io_poll_check_events(struct io_kiocb *req)
5479{
5480	struct io_ring_ctx *ctx = req->ctx;
5481	struct io_poll_iocb *poll = io_poll_get_single(req);
5482	int v;
5483
5484	/* req->task == current here, checking PF_EXITING is safe */
5485	if (unlikely(req->task->flags & PF_EXITING))
5486		io_poll_mark_cancelled(req);
5487
5488	do {
5489		v = atomic_read(&req->poll_refs);
5490
5491		/* tw handler should be the owner, and so have some references */
5492		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
5493			return 0;
5494		if (v & IO_POLL_CANCEL_FLAG)
5495			return -ECANCELED;
5496		/*
5497		 * cqe.res contains only events of the first wake up
5498		 * and all others are lost. Redo vfs_poll() to get
5499		 * up-to-date state.
5500		 */
5501		if ((v & IO_POLL_REF_MASK) != 1)
5502			req->result = 0;
5503		if (v & IO_POLL_RETRY_FLAG) {
5504			req->result = 0;
5505			/*
5506			 * We won't find new events that came in between
5507			 * vfs_poll and the ref put unless we clear the
5508			 * flag in advance.
5509			 */
5510			atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
5511			v &= ~IO_POLL_RETRY_FLAG;
5512		}
5513
5514		if (!req->result) {
5515			struct poll_table_struct pt = { ._key = poll->events };
5516
5517			req->result = vfs_poll(req->file, &pt) & poll->events;
5518		}
5519
5520		/* multishot, just fill a CQE and proceed */
5521		if (req->result && !(poll->events & EPOLLONESHOT)) {
5522			__poll_t mask = mangle_poll(req->result & poll->events);
5523			bool filled;
5524
5525			spin_lock(&ctx->completion_lock);
5526			filled = io_fill_cqe_aux(ctx, req->user_data, mask,
5527						 IORING_CQE_F_MORE);
5528			io_commit_cqring(ctx);
5529			spin_unlock(&ctx->completion_lock);
5530			if (unlikely(!filled))
5531				return -ECANCELED;
5532			io_cqring_ev_posted(ctx);
5533		} else if (req->result) {
5534			return 0;
5535		}
5536
5537		/* force the next iteration to vfs_poll() */
5538		req->result = 0;
5539
5540		/*
5541		 * Release all references, retry if someone tried to restart
5542		 * task_work while we were executing it.
5543		 */
5544	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
5545					IO_POLL_REF_MASK);
5546
5547	return 1;
5548}
5549
5550static void io_poll_task_func(struct io_kiocb *req, bool *locked)
5551{
5552	struct io_ring_ctx *ctx = req->ctx;
5553	int ret;
5554
5555	ret = io_poll_check_events(req);
5556	if (ret > 0)
5557		return;
5558
5559	if (!ret) {
5560		req->result = mangle_poll(req->result & req->poll.events);
5561	} else {
5562		req->result = ret;
5563		req_set_fail(req);
5564	}
5565
5566	io_poll_remove_entries(req);
5567	spin_lock(&ctx->completion_lock);
5568	hash_del(&req->hash_node);
5569	spin_unlock(&ctx->completion_lock);
5570	io_req_complete_post(req, req->result, 0);
5571}
5572
5573static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
5574{
5575	struct io_ring_ctx *ctx = req->ctx;
5576	int ret;
5577
5578	ret = io_poll_check_events(req);
5579	if (ret > 0)
5580		return;
5581
5582	io_tw_lock(req->ctx, locked);
5583	io_poll_remove_entries(req);
5584	spin_lock(&ctx->completion_lock);
5585	hash_del(&req->hash_node);
5586	spin_unlock(&ctx->completion_lock);
5587
5588	if (!ret)
5589		io_req_task_submit(req, locked);
5590	else
5591		io_req_complete_failed(req, ret);
5592}
5593
5594static void __io_poll_execute(struct io_kiocb *req, int mask)
5595{
5596	req->result = mask;
5597	if (req->opcode == IORING_OP_POLL_ADD)
5598		req->io_task_work.func = io_poll_task_func;
5599	else
5600		req->io_task_work.func = io_apoll_task_func;
5601
5602	trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
5603	io_req_task_work_add(req);
5604}
5605
5606static inline void io_poll_execute(struct io_kiocb *req, int res)
5607{
5608	if (io_poll_get_ownership(req))
5609		__io_poll_execute(req, res);
5610}
5611
5612static void io_poll_cancel_req(struct io_kiocb *req)
5613{
5614	io_poll_mark_cancelled(req);
5615	/* kick tw, which should complete the request */
5616	io_poll_execute(req, 0);
5617}
5618
5619static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
5620			void *key)
5621{
5622	struct io_kiocb *req = wait->private;
5623	struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb,
5624						 wait);
5625	__poll_t mask = key_to_poll(key);
5626
5627	if (unlikely(mask & POLLFREE)) {
5628		io_poll_mark_cancelled(req);
5629		/* we have to kick tw in case it's not already */
5630		io_poll_execute(req, 0);
5631
5632		/*
5633		 * If the waitqueue is being freed early but someone already
5634		 * holds ownership over it, we have to tear down the request as
5635		 * best we can. That means immediately removing the request from
5636		 * its waitqueue and preventing all further accesses to the
5637		 * waitqueue via the request.
5638		 */
5639		list_del_init(&poll->wait.entry);
5640
5641		/*
5642		 * Careful: this *must* be the last step, since as soon
5643		 * as req->head is NULL'ed out, the request can be
5644		 * completed and freed, since aio_poll_complete_work()
5645		 * will no longer need to take the waitqueue lock.
5646		 */
5647		smp_store_release(&poll->head, NULL);
5648		return 1;
5649	}
5650
5651	/* for instances that support it check for an event match first */
5652	if (mask && !(mask & poll->events))
5653		return 0;
5654
5655	if (io_poll_get_ownership(req)) {
5656		/*
5657		 * If we trigger a multishot poll off our own wakeup path,
5658		 * disable multishot as there is a circular dependency between
5659		 * CQ posting and triggering the event.
5660		 */
5661		if (mask & EPOLL_URING_WAKE)
5662			poll->events |= EPOLLONESHOT;
5663
5664		__io_poll_execute(req, mask);
5665	}
5666	return 1;
5667}
5668
5669static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
5670			    struct wait_queue_head *head,
5671			    struct io_poll_iocb **poll_ptr)
5672{
5673	struct io_kiocb *req = pt->req;
5674
5675	/*
5676	 * The file being polled uses multiple waitqueues for poll handling
5677	 * (e.g. one for read, one for write). Setup a separate io_poll_iocb
5678	 * if this happens.
5679	 */
5680	if (unlikely(pt->nr_entries)) {
5681		struct io_poll_iocb *first = poll;
5682
5683		/* double add on the same waitqueue head, ignore */
5684		if (first->head == head)
5685			return;
5686		/* already have a 2nd entry, fail a third attempt */
5687		if (*poll_ptr) {
5688			if ((*poll_ptr)->head == head)
5689				return;
5690			pt->error = -EINVAL;
5691			return;
5692		}
5693
5694		poll = kmalloc(sizeof(*poll), GFP_ATOMIC);
5695		if (!poll) {
5696			pt->error = -ENOMEM;
5697			return;
5698		}
5699		io_init_poll_iocb(poll, first->events, first->wait.func);
5700		*poll_ptr = poll;
5701	}
5702
5703	pt->nr_entries++;
5704	poll->head = head;
5705	poll->wait.private = req;
5706
5707	if (poll->events & EPOLLEXCLUSIVE)
5708		add_wait_queue_exclusive(head, &poll->wait);
5709	else
5710		add_wait_queue(head, &poll->wait);
5711}
5712
5713static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
5714			       struct poll_table_struct *p)
5715{
5716	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5717
5718	__io_queue_proc(&pt->req->poll, pt, head,
5719			(struct io_poll_iocb **) &pt->req->async_data);
5720}
5721
5722static int __io_arm_poll_handler(struct io_kiocb *req,
5723				 struct io_poll_iocb *poll,
5724				 struct io_poll_table *ipt, __poll_t mask)
5725{
5726	struct io_ring_ctx *ctx = req->ctx;
5727
5728	INIT_HLIST_NODE(&req->hash_node);
5729	io_init_poll_iocb(poll, mask, io_poll_wake);
5730	poll->file = req->file;
5731	poll->wait.private = req;
5732
5733	ipt->pt._key = mask;
5734	ipt->req = req;
5735	ipt->error = 0;
5736	ipt->nr_entries = 0;
5737
5738	/*
5739	 * Take ownership to delay any tw execution until we're done
5740	 * with poll arming; see io_poll_get_ownership().
5741	 */
5742	atomic_set(&req->poll_refs, 1);
5743	mask = vfs_poll(req->file, &ipt->pt) & poll->events;
5744
5745	if (mask && (poll->events & EPOLLONESHOT)) {
5746		io_poll_remove_entries(req);
5747		/* no one else has access to the req, forget about the ref */
5748		return mask;
5749	}
5750	if (!mask && unlikely(ipt->error || !ipt->nr_entries)) {
5751		io_poll_remove_entries(req);
5752		if (!ipt->error)
5753			ipt->error = -EINVAL;
5754		return 0;
5755	}
5756
5757	spin_lock(&ctx->completion_lock);
5758	io_poll_req_insert(req);
5759	spin_unlock(&ctx->completion_lock);
5760
5761	if (mask) {
5762		/* can't multishot if failed, just queue the event we've got */
5763		if (unlikely(ipt->error || !ipt->nr_entries)) {
5764			poll->events |= EPOLLONESHOT;
5765			ipt->error = 0;
5766		}
5767		__io_poll_execute(req, mask);
5768		return 0;
5769	}
5770
5771	/*
5772	 * Try to release ownership. If we see a change of state, e.g.
5773	 * poll was woken up, queue up a tw and it'll deal with it.
5774	 */
5775	if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
5776		__io_poll_execute(req, 0);
5777	return 0;
5778}
5779
5780static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
5781			       struct poll_table_struct *p)
5782{
5783	struct io_poll_table *pt = container_of(p, struct io_poll_table, pt);
5784	struct async_poll *apoll = pt->req->apoll;
5785
5786	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
5787}
5788
5789enum {
5790	IO_APOLL_OK,
5791	IO_APOLL_ABORTED,
5792	IO_APOLL_READY
5793};
5794
5795/*
5796 * We can't reliably detect loops where repeated poll triggers cause the
5797 * subsequent issue to fail. Rather than failing these immediately, allow a
5798 * certain number of retries before we give up. Given that this condition
5799 * should _rarely_ trigger even once, we should be fine with a larger value.
5800 */
5801#define APOLL_MAX_RETRY		128
5802
5803static int io_arm_poll_handler(struct io_kiocb *req)
5804{
5805	const struct io_op_def *def = &io_op_defs[req->opcode];
5806	struct io_ring_ctx *ctx = req->ctx;
5807	struct async_poll *apoll;
5808	struct io_poll_table ipt;
5809	__poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
5810	int ret;
5811
5812	if (!req->file || !file_can_poll(req->file))
5813		return IO_APOLL_ABORTED;
5814	if (!def->pollin && !def->pollout)
5815		return IO_APOLL_ABORTED;
5816
5817	if (def->pollin) {
5818		mask |= POLLIN | POLLRDNORM;
5819
5820		/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
5821		if ((req->opcode == IORING_OP_RECVMSG) &&
5822		    (req->sr_msg.msg_flags & MSG_ERRQUEUE))
5823			mask &= ~POLLIN;
5824	} else {
5825		mask |= POLLOUT | POLLWRNORM;
5826	}
5827
5828	if (req->flags & REQ_F_POLLED) {
5829		apoll = req->apoll;
5830		kfree(apoll->double_poll);
5831		if (unlikely(!--apoll->poll.retries)) {
5832			apoll->double_poll = NULL;
5833			return IO_APOLL_ABORTED;
5834		}
5835	} else {
5836		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
5837		if (unlikely(!apoll))
5838			return IO_APOLL_ABORTED;
5839		apoll->poll.retries = APOLL_MAX_RETRY;
5840	}
5841	apoll->double_poll = NULL;
5842	req->apoll = apoll;
5843	req->flags |= REQ_F_POLLED;
5844	ipt.pt._qproc = io_async_queue_proc;
5845
5846	ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask);
5847	if (ret || ipt.error)
5848		return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
5849
5850	trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
5851				mask, apoll->poll.events);
5852	return IO_APOLL_OK;
5853}
5854
5855/*
5856 * Returns true if we found and killed one or more poll requests
5857 */
5858static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
5859			       bool cancel_all)
5860{
5861	struct hlist_node *tmp;
5862	struct io_kiocb *req;
5863	bool found = false;
5864	int i;
5865
5866	spin_lock(&ctx->completion_lock);
5867	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
5868		struct hlist_head *list;
5869
5870		list = &ctx->cancel_hash[i];
5871		hlist_for_each_entry_safe(req, tmp, list, hash_node) {
5872			if (io_match_task_safe(req, tsk, cancel_all)) {
5873				hlist_del_init(&req->hash_node);
5874				io_poll_cancel_req(req);
5875				found = true;
5876			}
5877		}
5878	}
5879	spin_unlock(&ctx->completion_lock);
5880	return found;
5881}
5882
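/*
 * Find a pending poll request in the cancel hash by user_data. If
 * @poll_only is set, only IORING_OP_POLL_ADD requests are considered.
 */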
5883static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr,
5884				     bool poll_only)
5885	__must_hold(&ctx->completion_lock)
5886{
5887	struct hlist_head *list;
5888	struct io_kiocb *req;
5889
5890	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
5891	hlist_for_each_entry(req, list, hash_node) {
5892		if (sqe_addr != req->user_data)
5893			continue;
5894		if (poll_only && req->opcode != IORING_OP_POLL_ADD)
5895			continue;
5896		return req;
5897	}
5898	return NULL;
5899}
5900
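/*
 * Grab ownership of a hashed poll request and remove its wait queue
 * entries and hash entry. Returns false if ownership couldn't be taken,
 * in which case the request is being handled elsewhere.
 */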
5901static bool io_poll_disarm(struct io_kiocb *req)
5902	__must_hold(&ctx->completion_lock)
5903{
5904	if (!io_poll_get_ownership(req))
5905		return false;
5906	io_poll_remove_entries(req);
5907	hash_del(&req->hash_node);
5908	return true;
5909}
5910
5911static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr,
5912			  bool poll_only)
5913	__must_hold(&ctx->completion_lock)
5914{
5915	struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only);
5916
5917	if (!req)
5918		return -ENOENT;
5919	io_poll_cancel_req(req);
5920	return 0;
5921}
5922
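/*
 * Translate the poll mask from the SQE: fix up poll32_events on
 * big-endian and force EPOLLONESHOT unless IORING_POLL_ADD_MULTI
 * (multishot) was requested.
 */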
5923static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe,
5924				     unsigned int flags)
5925{
5926	u32 events;
5927
5928	events = READ_ONCE(sqe->poll32_events);
5929#ifdef __BIG_ENDIAN
5930	events = swahw32(events);
5931#endif
5932	if (!(flags & IORING_POLL_ADD_MULTI))
5933		events |= EPOLLONESHOT;
5934	return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT));
5935}
5936
5937static int io_poll_update_prep(struct io_kiocb *req,
5938			       const struct io_uring_sqe *sqe)
5939{
5940	struct io_poll_update *upd = &req->poll_update;
5941	u32 flags;
5942
5943	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5944		return -EINVAL;
5945	if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
5946		return -EINVAL;
5947	flags = READ_ONCE(sqe->len);
5948	if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
5949		      IORING_POLL_ADD_MULTI))
5950		return -EINVAL;
5951	/* meaningless without update */
5952	if (flags == IORING_POLL_ADD_MULTI)
5953		return -EINVAL;
5954
5955	upd->old_user_data = READ_ONCE(sqe->addr);
5956	upd->update_events = flags & IORING_POLL_UPDATE_EVENTS;
5957	upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA;
5958
5959	upd->new_user_data = READ_ONCE(sqe->off);
5960	if (!upd->update_user_data && upd->new_user_data)
5961		return -EINVAL;
5962	if (upd->update_events)
5963		upd->events = io_poll_parse_events(sqe, flags);
5964	else if (sqe->poll32_events)
5965		return -EINVAL;
5966
5967	return 0;
5968}
5969
5970static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
5971{
5972	struct io_poll_iocb *poll = &req->poll;
5973	u32 flags;
5974
5975	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
5976		return -EINVAL;
5977	if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
5978		return -EINVAL;
5979	flags = READ_ONCE(sqe->len);
5980	if (flags & ~IORING_POLL_ADD_MULTI)
5981		return -EINVAL;
5982
5983	io_req_set_refcount(req);
5984	poll->events = io_poll_parse_events(sqe, flags);
5985	return 0;
5986}
5987
5988static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
5989{
5990	struct io_poll_iocb *poll = &req->poll;
5991	struct io_poll_table ipt;
5992	int ret;
5993
5994	ipt.pt._qproc = io_poll_queue_proc;
5995
5996	ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events);
5997	if (!ret && ipt.error)
5998		req_set_fail(req);
5999	ret = ret ?: ipt.error;
6000	if (ret)
6001		__io_req_complete(req, issue_flags, ret, 0);
6002	return 0;
6003}
6004
6005static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags)
6006{
6007	struct io_ring_ctx *ctx = req->ctx;
6008	struct io_kiocb *preq;
6009	int ret2, ret = 0;
6010
6011	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6012
6013	spin_lock(&ctx->completion_lock);
6014	preq = io_poll_find(ctx, req->poll_update.old_user_data, true);
6015	if (!preq || !io_poll_disarm(preq)) {
6016		spin_unlock(&ctx->completion_lock);
6017		ret = preq ? -EALREADY : -ENOENT;
6018		goto out;
6019	}
6020	spin_unlock(&ctx->completion_lock);
6021
6022	if (req->poll_update.update_events || req->poll_update.update_user_data) {
6023		/* only update the event mask bits, keep the behavior flags */
6024		if (req->poll_update.update_events) {
6025			preq->poll.events &= ~0xffff;
6026			preq->poll.events |= req->poll_update.events & 0xffff;
6027			preq->poll.events |= IO_POLL_UNMASK;
6028		}
6029		if (req->poll_update.update_user_data)
6030			preq->user_data = req->poll_update.new_user_data;
6031
6032		ret2 = io_poll_add(preq, issue_flags);
6033		/* successfully updated, don't complete poll request */
6034		if (!ret2)
6035			goto out;
6036	}
6037	req_set_fail(preq);
6038	io_req_complete(preq, -ECANCELED);
6039out:
6040	if (ret < 0)
6041		req_set_fail(req);
6042	/* complete update request, we're done with it */
6043	io_req_complete(req, ret);
6044	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6045	return 0;
6046}
6047
6048static void io_req_task_timeout(struct io_kiocb *req, bool *locked)
6049{
6050	req_set_fail(req);
6051	io_req_complete_post(req, -ETIME, 0);
6052}
6053
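/*
 * hrtimer callback for a normal timeout: take the request off the
 * timeout list, account the fired timeout in cq_timeouts and punt the
 * -ETIME completion to task work.
 */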
6054static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
6055{
6056	struct io_timeout_data *data = container_of(timer,
6057						struct io_timeout_data, timer);
6058	struct io_kiocb *req = data->req;
6059	struct io_ring_ctx *ctx = req->ctx;
6060	unsigned long flags;
6061
6062	spin_lock_irqsave(&ctx->timeout_lock, flags);
6063	list_del_init(&req->timeout.list);
6064	atomic_set(&req->ctx->cq_timeouts,
6065		atomic_read(&req->ctx->cq_timeouts) + 1);
6066	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
6067
6068	req->io_task_work.func = io_req_task_timeout;
6069	io_req_task_work_add(req);
6070	return HRTIMER_NORESTART;
6071}
6072
6073static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
6074					   __u64 user_data)
6075	__must_hold(&ctx->timeout_lock)
6076{
6077	struct io_timeout_data *io;
6078	struct io_kiocb *req;
6079	bool found = false;
6080
6081	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
6082		found = user_data == req->user_data;
6083		if (found)
6084			break;
6085	}
6086	if (!found)
6087		return ERR_PTR(-ENOENT);
6088
6089	io = req->async_data;
6090	if (hrtimer_try_to_cancel(&io->timer) == -1)
6091		return ERR_PTR(-EALREADY);
6092	list_del_init(&req->timeout.list);
6093	return req;
6094}
6095
6096static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
6097	__must_hold(&ctx->completion_lock)
6098	__must_hold(&ctx->timeout_lock)
6099{
6100	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6101
6102	if (IS_ERR(req))
6103		return PTR_ERR(req);
6104
6105	req_set_fail(req);
6106	io_fill_cqe_req(req, -ECANCELED, 0);
6107	io_put_req_deferred(req);
6108	return 0;
6109}
6110
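/* Map IORING_TIMEOUT_* clock flags to the clockid used for the hrtimer */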
6111static clockid_t io_timeout_get_clock(struct io_timeout_data *data)
6112{
6113	switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) {
6114	case IORING_TIMEOUT_BOOTTIME:
6115		return CLOCK_BOOTTIME;
6116	case IORING_TIMEOUT_REALTIME:
6117		return CLOCK_REALTIME;
6118	default:
6119		/* can't happen, vetted at prep time */
6120		WARN_ON_ONCE(1);
6121		fallthrough;
6122	case 0:
6123		return CLOCK_MONOTONIC;
6124	}
6125}
6126
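/*
 * Re-arm an armed linked timeout with a new expiry. Returns -ENOENT if
 * no matching request is found, or -EALREADY if the timer callback is
 * already running.
 */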
6127static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6128				    struct timespec64 *ts, enum hrtimer_mode mode)
6129	__must_hold(&ctx->timeout_lock)
6130{
6131	struct io_timeout_data *io;
6132	struct io_kiocb *req;
6133	bool found = false;
6134
6135	list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) {
6136		found = user_data == req->user_data;
6137		if (found)
6138			break;
6139	}
6140	if (!found)
6141		return -ENOENT;
6142
6143	io = req->async_data;
6144	if (hrtimer_try_to_cancel(&io->timer) == -1)
6145		return -EALREADY;
6146	hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
6147	io->timer.function = io_link_timeout_fn;
6148	hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
6149	return 0;
6150}
6151
6152static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
6153			     struct timespec64 *ts, enum hrtimer_mode mode)
6154	__must_hold(&ctx->timeout_lock)
6155{
6156	struct io_kiocb *req = io_timeout_extract(ctx, user_data);
6157	struct io_timeout_data *data;
6158
6159	if (IS_ERR(req))
6160		return PTR_ERR(req);
6161
6162	req->timeout.off = 0; /* noseq */
6163	data = req->async_data;
6164	list_add_tail(&req->timeout.list, &ctx->timeout_list);
6165	hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
6166	data->timer.function = io_timeout_fn;
6167	hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode);
6168	return 0;
6169}
6170
6171static int io_timeout_remove_prep(struct io_kiocb *req,
6172				  const struct io_uring_sqe *sqe)
6173{
6174	struct io_timeout_rem *tr = &req->timeout_rem;
6175
6176	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6177		return -EINVAL;
6178	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6179		return -EINVAL;
6180	if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
6181		return -EINVAL;
6182
6183	tr->ltimeout = false;
6184	tr->addr = READ_ONCE(sqe->addr);
6185	tr->flags = READ_ONCE(sqe->timeout_flags);
6186	if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) {
6187		if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6188			return -EINVAL;
6189		if (tr->flags & IORING_LINK_TIMEOUT_UPDATE)
6190			tr->ltimeout = true;
6191		if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS))
6192			return -EINVAL;
6193		if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2)))
6194			return -EFAULT;
6195	} else if (tr->flags) {
6196		/* timeout removal doesn't support flags */
6197		return -EINVAL;
6198	}
6199
6200	return 0;
6201}
6202
6203static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags)
6204{
6205	return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS
6206					    : HRTIMER_MODE_REL;
6207}
6208
6209/*
6210 * Remove or update an existing timeout command
6211 */
6212static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
6213{
6214	struct io_timeout_rem *tr = &req->timeout_rem;
6215	struct io_ring_ctx *ctx = req->ctx;
6216	int ret;
6217
6218	if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) {
6219		spin_lock(&ctx->completion_lock);
6220		spin_lock_irq(&ctx->timeout_lock);
6221		ret = io_timeout_cancel(ctx, tr->addr);
6222		spin_unlock_irq(&ctx->timeout_lock);
6223		spin_unlock(&ctx->completion_lock);
6224	} else {
6225		enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags);
6226
6227		spin_lock_irq(&ctx->timeout_lock);
6228		if (tr->ltimeout)
6229			ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode);
6230		else
6231			ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode);
6232		spin_unlock_irq(&ctx->timeout_lock);
6233	}
6234
6235	if (ret < 0)
6236		req_set_fail(req);
6237	io_req_complete_post(req, ret, 0);
6238	return 0;
6239}
6240
6241static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
6242			   bool is_timeout_link)
6243{
6244	struct io_timeout_data *data;
6245	unsigned flags;
6246	u32 off = READ_ONCE(sqe->off);
6247
6248	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6249		return -EINVAL;
6250	if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
6251	    sqe->splice_fd_in)
6252		return -EINVAL;
6253	if (off && is_timeout_link)
6254		return -EINVAL;
6255	flags = READ_ONCE(sqe->timeout_flags);
6256	if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK))
6257		return -EINVAL;
6258	/* more than one clock specified is invalid, obviously */
6259	if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
6260		return -EINVAL;
6261
6262	INIT_LIST_HEAD(&req->timeout.list);
6263	req->timeout.off = off;
6264	if (unlikely(off && !req->ctx->off_timeout_used))
6265		req->ctx->off_timeout_used = true;
6266
6267	if (!req->async_data && io_alloc_async_data(req))
6268		return -ENOMEM;
6269
6270	data = req->async_data;
6271	data->req = req;
6272	data->flags = flags;
6273
6274	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
6275		return -EFAULT;
6276
6277	INIT_LIST_HEAD(&req->timeout.list);
6278	data->mode = io_translate_timeout_mode(flags);
6279	hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
6280
6281	if (is_timeout_link) {
6282		struct io_submit_link *link = &req->ctx->submit_state.link;
6283
6284		if (!link->head)
6285			return -EINVAL;
6286		if (link->last->opcode == IORING_OP_LINK_TIMEOUT)
6287			return -EINVAL;
6288		req->timeout.head = link->last;
6289		link->last->flags |= REQ_F_ARM_LTIMEOUT;
6290	}
6291	return 0;
6292}
6293
6294static int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
6295{
6296	struct io_ring_ctx *ctx = req->ctx;
6297	struct io_timeout_data *data = req->async_data;
6298	struct list_head *entry;
6299	u32 tail, off = req->timeout.off;
6300
6301	spin_lock_irq(&ctx->timeout_lock);
6302
6303	/*
6304	 * sqe->off holds how many events need to occur for this
6305	 * timeout event to be satisfied. If it isn't set, then this is
6306	 * a pure timeout request and the sequence isn't used.
6307	 */
6308	if (io_is_timeout_noseq(req)) {
6309		entry = ctx->timeout_list.prev;
6310		goto add;
6311	}
6312
6313	tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
6314	req->timeout.target_seq = tail + off;
6315
6316	/* Update the last seq here in case io_flush_timeouts() hasn't.
6317	 * This is safe because ->completion_lock is held, and submissions
6318	 * and completions are never mixed in the same ->completion_lock section.
6319	 */
6320	ctx->cq_last_tm_flush = tail;
6321
6322	/*
6323	 * Insertion sort, ensuring the first entry in the list is always
6324	 * the one we need first.
6325	 */
6326	list_for_each_prev(entry, &ctx->timeout_list) {
6327		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
6328						  timeout.list);
6329
6330		if (io_is_timeout_noseq(nxt))
6331			continue;
6332		/* nxt.seq is behind @tail, otherwise would've been completed */
6333		if (off >= nxt->timeout.target_seq - tail)
6334			break;
6335	}
6336add:
6337	list_add(&req->timeout.list, entry);
6338	data->timer.function = io_timeout_fn;
6339	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
6340	spin_unlock_irq(&ctx->timeout_lock);
6341	return 0;
6342}
6343
6344struct io_cancel_data {
6345	struct io_ring_ctx *ctx;
6346	u64 user_data;
6347};
6348
6349static bool io_cancel_cb(struct io_wq_work *work, void *data)
6350{
6351	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6352	struct io_cancel_data *cd = data;
6353
6354	return req->ctx == cd->ctx && req->user_data == cd->user_data;
6355}
6356
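/*
 * Attempt to cancel a request matching @user_data in the given task's
 * io-wq, mapping the io-wq result to 0 (cancelled), -EALREADY (already
 * running) or -ENOENT (not found).
 */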
6357static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data,
6358			       struct io_ring_ctx *ctx)
6359{
6360	struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, };
6361	enum io_wq_cancel cancel_ret;
6362	int ret = 0;
6363
6364	if (!tctx || !tctx->io_wq)
6365		return -ENOENT;
6366
6367	cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false);
6368	switch (cancel_ret) {
6369	case IO_WQ_CANCEL_OK:
6370		ret = 0;
6371		break;
6372	case IO_WQ_CANCEL_RUNNING:
6373		ret = -EALREADY;
6374		break;
6375	case IO_WQ_CANCEL_NOTFOUND:
6376		ret = -ENOENT;
6377		break;
6378	}
6379
6380	return ret;
6381}
6382
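/*
 * Best-effort cancellation by user_data: try the issuing task's io-wq
 * first, then fall back to the timeout list and finally the poll hash
 * under ->completion_lock.
 */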
6383static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr)
6384{
6385	struct io_ring_ctx *ctx = req->ctx;
6386	int ret;
6387
6388	WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current);
6389
6390	ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx);
6391	if (ret != -ENOENT)
6392		return ret;
6393
6394	spin_lock(&ctx->completion_lock);
6395	spin_lock_irq(&ctx->timeout_lock);
6396	ret = io_timeout_cancel(ctx, sqe_addr);
6397	spin_unlock_irq(&ctx->timeout_lock);
6398	if (ret != -ENOENT)
6399		goto out;
6400	ret = io_poll_cancel(ctx, sqe_addr, false);
6401out:
6402	spin_unlock(&ctx->completion_lock);
6403	return ret;
6404}
6405
6406static int io_async_cancel_prep(struct io_kiocb *req,
6407				const struct io_uring_sqe *sqe)
6408{
6409	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
6410		return -EINVAL;
6411	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6412		return -EINVAL;
6413	if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags ||
6414	    sqe->splice_fd_in)
6415		return -EINVAL;
6416
6417	req->cancel.addr = READ_ONCE(sqe->addr);
6418	return 0;
6419}
6420
6421static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
6422{
6423	struct io_ring_ctx *ctx = req->ctx;
6424	u64 sqe_addr = req->cancel.addr;
6425	struct io_tctx_node *node;
6426	int ret;
6427
6428	ret = io_try_cancel_userdata(req, sqe_addr);
6429	if (ret != -ENOENT)
6430		goto done;
6431
6432	/* slow path, try all io-wq's */
6433	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6434	ret = -ENOENT;
6435	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
6436		struct io_uring_task *tctx = node->task->io_uring;
6437
6438		ret = io_async_cancel_one(tctx, req->cancel.addr, ctx);
6439		if (ret != -ENOENT)
6440			break;
6441	}
6442	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6443done:
6444	if (ret < 0)
6445		req_set_fail(req);
6446	io_req_complete_post(req, ret, 0);
6447	return 0;
6448}
6449
6450static int io_rsrc_update_prep(struct io_kiocb *req,
6451				const struct io_uring_sqe *sqe)
6452{
6453	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
6454		return -EINVAL;
6455	if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
6456		return -EINVAL;
6457
6458	req->rsrc_update.offset = READ_ONCE(sqe->off);
6459	req->rsrc_update.nr_args = READ_ONCE(sqe->len);
6460	if (!req->rsrc_update.nr_args)
6461		return -EINVAL;
6462	req->rsrc_update.arg = READ_ONCE(sqe->addr);
6463	return 0;
6464}
6465
6466static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
6467{
6468	struct io_ring_ctx *ctx = req->ctx;
6469	struct io_uring_rsrc_update2 up;
6470	int ret;
6471
6472	up.offset = req->rsrc_update.offset;
6473	up.data = req->rsrc_update.arg;
6474	up.nr = 0;
6475	up.tags = 0;
6476	up.resv = 0;
6477	up.resv2 = 0;
6478
6479	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6480	ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
6481					&up, req->rsrc_update.nr_args);
6482	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6483
6484	if (ret < 0)
6485		req_set_fail(req);
6486	__io_req_complete(req, issue_flags, ret, 0);
6487	return 0;
6488}
6489
6490static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
6491{
6492	switch (req->opcode) {
6493	case IORING_OP_NOP:
6494		return 0;
6495	case IORING_OP_READV:
6496	case IORING_OP_READ_FIXED:
6497	case IORING_OP_READ:
6498		return io_read_prep(req, sqe);
6499	case IORING_OP_WRITEV:
6500	case IORING_OP_WRITE_FIXED:
6501	case IORING_OP_WRITE:
6502		return io_write_prep(req, sqe);
6503	case IORING_OP_POLL_ADD:
6504		return io_poll_add_prep(req, sqe);
6505	case IORING_OP_POLL_REMOVE:
6506		return io_poll_update_prep(req, sqe);
6507	case IORING_OP_FSYNC:
6508		return io_fsync_prep(req, sqe);
6509	case IORING_OP_SYNC_FILE_RANGE:
6510		return io_sfr_prep(req, sqe);
6511	case IORING_OP_SENDMSG:
6512	case IORING_OP_SEND:
6513		return io_sendmsg_prep(req, sqe);
6514	case IORING_OP_RECVMSG:
6515	case IORING_OP_RECV:
6516		return io_recvmsg_prep(req, sqe);
6517	case IORING_OP_CONNECT:
6518		return io_connect_prep(req, sqe);
6519	case IORING_OP_TIMEOUT:
6520		return io_timeout_prep(req, sqe, false);
6521	case IORING_OP_TIMEOUT_REMOVE:
6522		return io_timeout_remove_prep(req, sqe);
6523	case IORING_OP_ASYNC_CANCEL:
6524		return io_async_cancel_prep(req, sqe);
6525	case IORING_OP_LINK_TIMEOUT:
6526		return io_timeout_prep(req, sqe, true);
6527	case IORING_OP_ACCEPT:
6528		return io_accept_prep(req, sqe);
6529	case IORING_OP_FALLOCATE:
6530		return io_fallocate_prep(req, sqe);
6531	case IORING_OP_OPENAT:
6532		return io_openat_prep(req, sqe);
6533	case IORING_OP_CLOSE:
6534		return io_close_prep(req, sqe);
6535	case IORING_OP_FILES_UPDATE:
6536		return io_rsrc_update_prep(req, sqe);
6537	case IORING_OP_STATX:
6538		return io_statx_prep(req, sqe);
6539	case IORING_OP_FADVISE:
6540		return io_fadvise_prep(req, sqe);
6541	case IORING_OP_MADVISE:
6542		return io_madvise_prep(req, sqe);
6543	case IORING_OP_OPENAT2:
6544		return io_openat2_prep(req, sqe);
6545	case IORING_OP_EPOLL_CTL:
6546		return io_epoll_ctl_prep(req, sqe);
6547	case IORING_OP_SPLICE:
6548		return io_splice_prep(req, sqe);
6549	case IORING_OP_PROVIDE_BUFFERS:
6550		return io_provide_buffers_prep(req, sqe);
6551	case IORING_OP_REMOVE_BUFFERS:
6552		return io_remove_buffers_prep(req, sqe);
6553	case IORING_OP_TEE:
6554		return io_tee_prep(req, sqe);
6555	case IORING_OP_SHUTDOWN:
6556		return io_shutdown_prep(req, sqe);
6557	case IORING_OP_RENAMEAT:
6558		return io_renameat_prep(req, sqe);
6559	case IORING_OP_UNLINKAT:
6560		return io_unlinkat_prep(req, sqe);
6561	}
6562
6563	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
6564			req->opcode);
6565	return -EINVAL;
6566}
6567
6568static int io_req_prep_async(struct io_kiocb *req)
6569{
6570	if (!io_op_defs[req->opcode].needs_async_setup)
6571		return 0;
6572	if (WARN_ON_ONCE(req->async_data))
6573		return -EFAULT;
6574	if (io_alloc_async_data(req))
6575		return -EAGAIN;
6576
6577	switch (req->opcode) {
6578	case IORING_OP_READV:
6579		return io_rw_prep_async(req, READ);
6580	case IORING_OP_WRITEV:
6581		return io_rw_prep_async(req, WRITE);
6582	case IORING_OP_SENDMSG:
6583		return io_sendmsg_prep_async(req);
6584	case IORING_OP_RECVMSG:
6585		return io_recvmsg_prep_async(req);
6586	case IORING_OP_CONNECT:
6587		return io_connect_prep_async(req);
6588	}
6589	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
6590		    req->opcode);
6591	return -EFAULT;
6592}
6593
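/*
 * Compute the submission sequence of a request: the cached SQ head with
 * one subtracted for each request in its link chain.
 */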
6594static u32 io_get_sequence(struct io_kiocb *req)
6595{
6596	u32 seq = req->ctx->cached_sq_head;
6597
6598	/* need original cached_sq_head, but it was increased for each req */
6599	io_for_each_link(req, req)
6600		seq--;
6601	return seq;
6602}
6603
6604static bool io_drain_req(struct io_kiocb *req)
6605{
6606	struct io_kiocb *pos;
6607	struct io_ring_ctx *ctx = req->ctx;
6608	struct io_defer_entry *de;
6609	int ret;
6610	u32 seq;
6611
6612	if (req->flags & REQ_F_FAIL) {
6613		io_req_complete_fail_submit(req);
6614		return true;
6615	}
6616
6617	/*
6618	 * If we need to drain a request in the middle of a link, drain the
6619	 * head request and the next request/link after the current link.
6620	 * Since links are executed sequentially, IOSQE_IO_DRAIN is effectively
6621	 * maintained for every request of our link.
6622	 */
6623	if (ctx->drain_next) {
6624		req->flags |= REQ_F_IO_DRAIN;
6625		ctx->drain_next = false;
6626	}
6627	/* not interested in head, start from the first linked */
6628	io_for_each_link(pos, req->link) {
6629		if (pos->flags & REQ_F_IO_DRAIN) {
6630			ctx->drain_next = true;
6631			req->flags |= REQ_F_IO_DRAIN;
6632			break;
6633		}
6634	}
6635
6636	/* Still need to defer if there are pending reqs in the defer list. */
6637	spin_lock(&ctx->completion_lock);
6638	if (likely(list_empty_careful(&ctx->defer_list) &&
6639		!(req->flags & REQ_F_IO_DRAIN))) {
6640		spin_unlock(&ctx->completion_lock);
6641		ctx->drain_active = false;
6642		return false;
6643	}
6644	spin_unlock(&ctx->completion_lock);
6645
6646	seq = io_get_sequence(req);
6647	/* Still a chance to pass the sequence check */
6648	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
6649		return false;
6650
6651	ret = io_req_prep_async(req);
6652	if (ret)
6653		goto fail;
6654	io_prep_async_link(req);
6655	de = kmalloc(sizeof(*de), GFP_KERNEL);
6656	if (!de) {
6657		ret = -ENOMEM;
6658fail:
6659		io_req_complete_failed(req, ret);
6660		return true;
6661	}
6662
6663	spin_lock(&ctx->completion_lock);
6664	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
6665		spin_unlock(&ctx->completion_lock);
6666		kfree(de);
6667		io_queue_async_work(req, NULL);
6668		return true;
6669	}
6670
6671	trace_io_uring_defer(ctx, req, req->user_data);
6672	de->req = req;
6673	de->seq = seq;
6674	list_add_tail(&de->list, &ctx->defer_list);
6675	spin_unlock(&ctx->completion_lock);
6676	return true;
6677}
6678
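/*
 * Release per-opcode resources held by a request: selected buffers,
 * async iovecs/msghdrs, held filenames, async poll data, inflight
 * tracking and any override creds.
 */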
6679static void io_clean_op(struct io_kiocb *req)
6680{
6681	if (req->flags & REQ_F_BUFFER_SELECTED) {
6682		switch (req->opcode) {
6683		case IORING_OP_READV:
6684		case IORING_OP_READ_FIXED:
6685		case IORING_OP_READ:
6686			kfree((void *)(unsigned long)req->rw.addr);
6687			break;
6688		case IORING_OP_RECVMSG:
6689		case IORING_OP_RECV:
6690			kfree(req->sr_msg.kbuf);
6691			break;
6692		}
6693	}
6694
6695	if (req->flags & REQ_F_NEED_CLEANUP) {
6696		switch (req->opcode) {
6697		case IORING_OP_READV:
6698		case IORING_OP_READ_FIXED:
6699		case IORING_OP_READ:
6700		case IORING_OP_WRITEV:
6701		case IORING_OP_WRITE_FIXED:
6702		case IORING_OP_WRITE: {
6703			struct io_async_rw *io = req->async_data;
6704
6705			kfree(io->free_iovec);
6706			break;
6707			}
6708		case IORING_OP_RECVMSG:
6709		case IORING_OP_SENDMSG: {
6710			struct io_async_msghdr *io = req->async_data;
6711
6712			kfree(io->free_iov);
6713			break;
6714			}
6715		case IORING_OP_OPENAT:
6716		case IORING_OP_OPENAT2:
6717			if (req->open.filename)
6718				putname(req->open.filename);
6719			break;
6720		case IORING_OP_RENAMEAT:
6721			putname(req->rename.oldpath);
6722			putname(req->rename.newpath);
6723			break;
6724		case IORING_OP_UNLINKAT:
6725			putname(req->unlink.filename);
6726			break;
6727		}
6728	}
6729	if ((req->flags & REQ_F_POLLED) && req->apoll) {
6730		kfree(req->apoll->double_poll);
6731		kfree(req->apoll);
6732		req->apoll = NULL;
6733	}
6734	if (req->flags & REQ_F_INFLIGHT) {
6735		struct io_uring_task *tctx = req->task->io_uring;
6736
6737		atomic_dec(&tctx->inflight_tracked);
6738	}
6739	if (req->flags & REQ_F_CREDS)
6740		put_cred(req->creds);
6741
6742	req->flags &= ~IO_REQ_CLEAN_FLAGS;
6743}
6744
6745static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
6746{
6747	struct io_ring_ctx *ctx = req->ctx;
6748	const struct cred *creds = NULL;
6749	int ret;
6750
6751	if ((req->flags & REQ_F_CREDS) && req->creds != current_cred())
6752		creds = override_creds(req->creds);
6753
6754	switch (req->opcode) {
6755	case IORING_OP_NOP:
6756		ret = io_nop(req, issue_flags);
6757		break;
6758	case IORING_OP_READV:
6759	case IORING_OP_READ_FIXED:
6760	case IORING_OP_READ:
6761		ret = io_read(req, issue_flags);
6762		break;
6763	case IORING_OP_WRITEV:
6764	case IORING_OP_WRITE_FIXED:
6765	case IORING_OP_WRITE:
6766		ret = io_write(req, issue_flags);
6767		break;
6768	case IORING_OP_FSYNC:
6769		ret = io_fsync(req, issue_flags);
6770		break;
6771	case IORING_OP_POLL_ADD:
6772		ret = io_poll_add(req, issue_flags);
6773		break;
6774	case IORING_OP_POLL_REMOVE:
6775		ret = io_poll_update(req, issue_flags);
6776		break;
6777	case IORING_OP_SYNC_FILE_RANGE:
6778		ret = io_sync_file_range(req, issue_flags);
6779		break;
6780	case IORING_OP_SENDMSG:
6781		ret = io_sendmsg(req, issue_flags);
6782		break;
6783	case IORING_OP_SEND:
6784		ret = io_send(req, issue_flags);
6785		break;
6786	case IORING_OP_RECVMSG:
6787		ret = io_recvmsg(req, issue_flags);
6788		break;
6789	case IORING_OP_RECV:
6790		ret = io_recv(req, issue_flags);
6791		break;
6792	case IORING_OP_TIMEOUT:
6793		ret = io_timeout(req, issue_flags);
6794		break;
6795	case IORING_OP_TIMEOUT_REMOVE:
6796		ret = io_timeout_remove(req, issue_flags);
6797		break;
6798	case IORING_OP_ACCEPT:
6799		ret = io_accept(req, issue_flags);
6800		break;
6801	case IORING_OP_CONNECT:
6802		ret = io_connect(req, issue_flags);
6803		break;
6804	case IORING_OP_ASYNC_CANCEL:
6805		ret = io_async_cancel(req, issue_flags);
6806		break;
6807	case IORING_OP_FALLOCATE:
6808		ret = io_fallocate(req, issue_flags);
6809		break;
6810	case IORING_OP_OPENAT:
6811		ret = io_openat(req, issue_flags);
6812		break;
6813	case IORING_OP_CLOSE:
6814		ret = io_close(req, issue_flags);
6815		break;
6816	case IORING_OP_FILES_UPDATE:
6817		ret = io_files_update(req, issue_flags);
6818		break;
6819	case IORING_OP_STATX:
6820		ret = io_statx(req, issue_flags);
6821		break;
6822	case IORING_OP_FADVISE:
6823		ret = io_fadvise(req, issue_flags);
6824		break;
6825	case IORING_OP_MADVISE:
6826		ret = io_madvise(req, issue_flags);
6827		break;
6828	case IORING_OP_OPENAT2:
6829		ret = io_openat2(req, issue_flags);
6830		break;
6831	case IORING_OP_EPOLL_CTL:
6832		ret = io_epoll_ctl(req, issue_flags);
6833		break;
6834	case IORING_OP_SPLICE:
6835		ret = io_splice(req, issue_flags);
6836		break;
6837	case IORING_OP_PROVIDE_BUFFERS:
6838		ret = io_provide_buffers(req, issue_flags);
6839		break;
6840	case IORING_OP_REMOVE_BUFFERS:
6841		ret = io_remove_buffers(req, issue_flags);
6842		break;
6843	case IORING_OP_TEE:
6844		ret = io_tee(req, issue_flags);
6845		break;
6846	case IORING_OP_SHUTDOWN:
6847		ret = io_shutdown(req, issue_flags);
6848		break;
6849	case IORING_OP_RENAMEAT:
6850		ret = io_renameat(req, issue_flags);
6851		break;
6852	case IORING_OP_UNLINKAT:
6853		ret = io_unlinkat(req, issue_flags);
6854		break;
6855	default:
6856		ret = -EINVAL;
6857		break;
6858	}
6859
6860	if (creds)
6861		revert_creds(creds);
6862	if (ret)
6863		return ret;
6864	/* If the op doesn't have a file, we're not polling for it */
6865	if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file)
6866		io_iopoll_req_issued(req);
6867
6868	return 0;
6869}
6870
6871static struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
6872{
6873	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6874
6875	req = io_put_req_find_next(req);
6876	return req ? &req->work : NULL;
6877}
6878
6879static void io_wq_submit_work(struct io_wq_work *work)
6880{
6881	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
6882	struct io_kiocb *timeout;
6883	int ret = 0;
6884
6885	/* one will be dropped by ->io_free_work() after returning to io-wq */
6886	if (!(req->flags & REQ_F_REFCOUNT))
6887		__io_req_set_refcount(req, 2);
6888	else
6889		req_ref_get(req);
6890
6891	timeout = io_prep_linked_timeout(req);
6892	if (timeout)
6893		io_queue_linked_timeout(timeout);
6894	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
6895	if (work->flags & IO_WQ_WORK_CANCEL)
6896		ret = -ECANCELED;
6897
6898	if (!ret) {
6899		do {
6900			ret = io_issue_sqe(req, 0);
6901			/*
6902			 * We can get EAGAIN for polled IO even though we're
6903			 * forcing a sync submission from here, since we can't
6904			 * wait for request slots on the block side.
6905			 */
6906			if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL))
6907				break;
6908			if (io_wq_worker_stopped())
6909				break;
6910			/*
6911			 * If REQ_F_NOWAIT is set, then don't wait or retry with
6912			 * poll. -EAGAIN is final for that case.
6913			 */
6914			if (req->flags & REQ_F_NOWAIT)
6915				break;
6916
6917			cond_resched();
6918		} while (1);
6919	}
6920
6921	/* avoid locking problems by failing it from a clean context */
6922	if (ret)
6923		io_req_task_queue_fail(req, ret);
6924}
6925
6926static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table,
6927						       unsigned i)
6928{
6929	return &table->files[i];
6930}
6931
6932static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
6933					      int index)
6934{
6935	struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index);
6936
6937	return (struct file *) (slot->file_ptr & FFS_MASK);
6938}
6939
6940static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file)
6941{
6942	unsigned long file_ptr = (unsigned long) file;
6943
6944	if (__io_file_supports_nowait(file, READ))
6945		file_ptr |= FFS_ASYNC_READ;
6946	if (__io_file_supports_nowait(file, WRITE))
6947		file_ptr |= FFS_ASYNC_WRITE;
6948	if (S_ISREG(file_inode(file)->i_mode))
6949		file_ptr |= FFS_ISREG;
6950	file_slot->file_ptr = file_ptr;
6951}
6952
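/*
 * Look up a file from the fixed file table. The low FFS bits of the
 * stored pointer carry nowait/regular-file hints and get folded into
 * the request flags; the rsrc node is pinned for the request.
 */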
6953static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
6954					     struct io_kiocb *req, int fd,
6955					     unsigned int issue_flags)
6956{
6957	struct file *file = NULL;
6958	unsigned long file_ptr;
6959
6960	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6961
6962	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
6963		goto out;
6964	fd = array_index_nospec(fd, ctx->nr_user_files);
6965	file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
6966	file = (struct file *) (file_ptr & FFS_MASK);
6967	file_ptr &= ~FFS_MASK;
6968	/* mask in overlapping REQ_F and FFS bits */
6969	req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
6970	io_req_set_rsrc_node(req);
6971out:
6972	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
6973	return file;
6974}
6975
6976static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
6977				       struct io_kiocb *req, int fd)
6978{
6979	struct file *file = fget(fd);
6980
6981	trace_io_uring_file_get(ctx, fd);
6982
6983	/* we don't allow fixed io_uring files */
6984	if (file && unlikely(file->f_op == &io_uring_fops))
6985		io_req_track_inflight(req);
6986	return file;
6987}
6988
6989static inline struct file *io_file_get(struct io_ring_ctx *ctx,
6990				       struct io_kiocb *req, int fd, bool fixed,
6991				       unsigned int issue_flags)
6992{
6993	if (fixed)
6994		return io_file_get_fixed(ctx, req, fd, issue_flags);
6995	else
6996		return io_file_get_normal(ctx, req, fd);
6997}
6998
6999static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
7000{
7001	struct io_kiocb *prev = req->timeout.prev;
7002	int ret = -ENOENT;
7003
7004	if (prev) {
7005		if (!(req->task->flags & PF_EXITING))
7006			ret = io_try_cancel_userdata(req, prev->user_data);
7007		io_req_complete_post(req, ret ?: -ETIME, 0);
7008		io_put_req(prev);
7009	} else {
7010		io_req_complete_post(req, -ETIME, 0);
7011	}
7012}
7013
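/*
 * hrtimer callback for a linked timeout: detach the timeout from its
 * head request, take a reference on the head if it's still around, and
 * punt the actual cancellation to task work.
 */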
7014static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
7015{
7016	struct io_timeout_data *data = container_of(timer,
7017						struct io_timeout_data, timer);
7018	struct io_kiocb *prev, *req = data->req;
7019	struct io_ring_ctx *ctx = req->ctx;
7020	unsigned long flags;
7021
7022	spin_lock_irqsave(&ctx->timeout_lock, flags);
7023	prev = req->timeout.head;
7024	req->timeout.head = NULL;
7025
7026	/*
7027	 * We don't expect the list to be empty; that will only happen if we
7028	 * race with the completion of the linked work.
7029	 */
7030	if (prev) {
7031		io_remove_next_linked(prev);
7032		if (!req_ref_inc_not_zero(prev))
7033			prev = NULL;
7034	}
7035	list_del(&req->timeout.list);
7036	req->timeout.prev = prev;
7037	spin_unlock_irqrestore(&ctx->timeout_lock, flags);
7038
7039	req->io_task_work.func = io_req_task_link_timeout;
7040	io_req_task_work_add(req);
7041	return HRTIMER_NORESTART;
7042}
7043
7044static void io_queue_linked_timeout(struct io_kiocb *req)
7045{
7046	struct io_ring_ctx *ctx = req->ctx;
7047
7048	spin_lock_irq(&ctx->timeout_lock);
7049	/*
7050	 * If the back reference is NULL, then our linked request finished
7051	 * before we got a chance to set up the timer
7052	 */
7053	if (req->timeout.head) {
7054		struct io_timeout_data *data = req->async_data;
7055
7056		data->timer.function = io_link_timeout_fn;
7057		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
7058				data->mode);
7059		list_add_tail(&req->timeout.list, &ctx->ltimeout_list);
7060	}
7061	spin_unlock_irq(&ctx->timeout_lock);
7062	/* drop submission reference */
7063	io_put_req(req);
7064}
7065
7066static void __io_queue_sqe(struct io_kiocb *req)
7067	__must_hold(&req->ctx->uring_lock)
7068{
7069	struct io_kiocb *linked_timeout;
7070	int ret;
7071
7072issue_sqe:
7073	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
7074
7075	/*
7076	 * We async punt it if the file wasn't marked NOWAIT, or if the file
7077	 * doesn't support non-blocking read/write attempts
7078	 */
7079	if (likely(!ret)) {
7080		if (req->flags & REQ_F_COMPLETE_INLINE) {
7081			struct io_ring_ctx *ctx = req->ctx;
7082			struct io_submit_state *state = &ctx->submit_state;
7083
7084			state->compl_reqs[state->compl_nr++] = req;
7085			if (state->compl_nr == ARRAY_SIZE(state->compl_reqs))
7086				io_submit_flush_completions(ctx);
7087			return;
7088		}
7089
7090		linked_timeout = io_prep_linked_timeout(req);
7091		if (linked_timeout)
7092			io_queue_linked_timeout(linked_timeout);
7093	} else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
7094		linked_timeout = io_prep_linked_timeout(req);
7095
7096		switch (io_arm_poll_handler(req)) {
7097		case IO_APOLL_READY:
7098			if (linked_timeout)
7099				io_queue_linked_timeout(linked_timeout);
7100			goto issue_sqe;
7101		case IO_APOLL_ABORTED:
7102			/*
7103			 * Queued up for async execution, worker will release
7104			 * submit reference when the iocb is actually submitted.
7105			 */
7106			io_queue_async_work(req, NULL);
7107			break;
7108		}
7109
7110		if (linked_timeout)
7111			io_queue_linked_timeout(linked_timeout);
7112	} else {
7113		io_req_complete_failed(req, ret);
7114	}
7115}
7116
7117static inline void io_queue_sqe(struct io_kiocb *req)
7118	__must_hold(&req->ctx->uring_lock)
7119{
7120	if (unlikely(req->ctx->drain_active) && io_drain_req(req))
7121		return;
7122
7123	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
7124		__io_queue_sqe(req);
7125	} else if (req->flags & REQ_F_FAIL) {
7126		io_req_complete_fail_submit(req);
7127	} else {
7128		int ret = io_req_prep_async(req);
7129
7130		if (unlikely(ret))
7131			io_req_complete_failed(req, ret);
7132		else
7133			io_queue_async_work(req, NULL);
7134	}
7135}
7136
7137/*
7138 * Check SQE restrictions (opcode and flags).
7139 *
7140 * Returns 'true' if SQE is allowed, 'false' otherwise.
7141 */
7142static inline bool io_check_restriction(struct io_ring_ctx *ctx,
7143					struct io_kiocb *req,
7144					unsigned int sqe_flags)
7145{
7146	if (likely(!ctx->restricted))
7147		return true;
7148
7149	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
7150		return false;
7151
7152	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
7153	    ctx->restrictions.sqe_flags_required)
7154		return false;
7155
7156	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
7157			  ctx->restrictions.sqe_flags_required))
7158		return false;
7159
7160	return true;
7161}
7162
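/*
 * Initialise a request from its SQE: validate flags, opcode and ring
 * restrictions, resolve personality creds, start a block plug when it
 * makes sense, and look up the file for opcodes that need one.
 */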
7163static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
7164		       const struct io_uring_sqe *sqe)
7165	__must_hold(&ctx->uring_lock)
7166{
7167	struct io_submit_state *state;
7168	unsigned int sqe_flags;
7169	int personality, ret = 0;
7170
7171	/* req is partially pre-initialised, see io_preinit_req() */
7172	req->opcode = READ_ONCE(sqe->opcode);
7173	/* same numerical values with corresponding REQ_F_*, safe to copy */
7174	req->flags = sqe_flags = READ_ONCE(sqe->flags);
7175	req->user_data = READ_ONCE(sqe->user_data);
7176	req->file = NULL;
7177	req->fixed_rsrc_refs = NULL;
7178	req->task = current;
7179
7180	/* enforce forwards compatibility on users */
7181	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
7182		return -EINVAL;
7183	if (unlikely(req->opcode >= IORING_OP_LAST))
7184		return -EINVAL;
7185	if (!io_check_restriction(ctx, req, sqe_flags))
7186		return -EACCES;
7187
7188	if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
7189	    !io_op_defs[req->opcode].buffer_select)
7190		return -EOPNOTSUPP;
7191	if (unlikely(sqe_flags & IOSQE_IO_DRAIN))
7192		ctx->drain_active = true;
7193
7194	personality = READ_ONCE(sqe->personality);
7195	if (personality) {
7196		req->creds = xa_load(&ctx->personalities, personality);
7197		if (!req->creds)
7198			return -EINVAL;
7199		get_cred(req->creds);
7200		req->flags |= REQ_F_CREDS;
7201	}
7202	state = &ctx->submit_state;
7203
7204	/*
7205	 * Plug now if we have more than 1 IO left after this, and the target
7206	 * is potentially a read/write to block based storage.
7207	 */
7208	if (!state->plug_started && state->ios_left > 1 &&
7209	    io_op_defs[req->opcode].plug) {
7210		blk_start_plug(&state->plug);
7211		state->plug_started = true;
7212	}
7213
7214	if (io_op_defs[req->opcode].needs_file) {
7215		req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
7216					(sqe_flags & IOSQE_FIXED_FILE),
7217					IO_URING_F_NONBLOCK);
7218		if (unlikely(!req->file))
7219			ret = -EBADF;
7220	}
7221
7222	state->ios_left--;
7223	return ret;
7224}
7225
7226static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
7227			 const struct io_uring_sqe *sqe)
7228	__must_hold(&ctx->uring_lock)
7229{
7230	struct io_submit_link *link = &ctx->submit_state.link;
7231	int ret;
7232
7233	ret = io_init_req(ctx, req, sqe);
7234	if (unlikely(ret)) {
7235fail_req:
7236		/* fail even hard links since we don't submit */
7237		if (link->head) {
7238			/*
7239			 * We can tell whether a link req failed or was cancelled by
7240			 * checking REQ_F_FAIL, but the head is an exception: it may
7241			 * have REQ_F_FAIL set because another req in the chain failed.
7242			 * Use req->result to distinguish whether the head was failed
7243			 * because of its own failure or another req's failure, so that
7244			 * the correct ret code can be set for it. Init the result here
7245			 * to avoid affecting the normal path.
7246			 */
7247			if (!(link->head->flags & REQ_F_FAIL))
7248				req_fail_link_node(link->head, -ECANCELED);
7249		} else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7250			/*
7251			 * The current req is a normal req; return the error and
7252			 * thus break the submission loop.
7253			 */
7254			io_req_complete_failed(req, ret);
7255			return ret;
7256		}
7257		req_fail_link_node(req, ret);
7258	} else {
7259		ret = io_req_prep(req, sqe);
7260		if (unlikely(ret))
7261			goto fail_req;
7262	}
7263
7264	/* don't need @sqe from now on */
7265	trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
7266				  req->flags, true,
7267				  ctx->flags & IORING_SETUP_SQPOLL);
7268
7269	/*
7270	 * If we already have a head request, queue this one for async
7271	 * submittal once the head completes. If we don't have a head but
7272	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
7273	 * submitted sync once the chain is complete. If none of those
7274	 * conditions are true (normal request), then just queue it.
7275	 */
7276	if (link->head) {
7277		struct io_kiocb *head = link->head;
7278
7279		if (!(req->flags & REQ_F_FAIL)) {
7280			ret = io_req_prep_async(req);
7281			if (unlikely(ret)) {
7282				req_fail_link_node(req, ret);
7283				if (!(head->flags & REQ_F_FAIL))
7284					req_fail_link_node(head, -ECANCELED);
7285			}
7286		}
7287		trace_io_uring_link(ctx, req, head);
7288		link->last->link = req;
7289		link->last = req;
7290
7291		/* last request of a link, enqueue the link */
7292		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
7293			link->head = NULL;
7294			io_queue_sqe(head);
7295		}
7296	} else {
7297		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
7298			link->head = req;
7299			link->last = req;
7300		} else {
7301			io_queue_sqe(req);
7302		}
7303	}
7304
7305	return 0;
7306}
7307
7308/*
7309 * Batched submission is done, ensure local IO is flushed out.
7310 */
7311static void io_submit_state_end(struct io_submit_state *state,
7312				struct io_ring_ctx *ctx)
7313{
7314	if (state->link.head)
7315		io_queue_sqe(state->link.head);
7316	if (state->compl_nr)
7317		io_submit_flush_completions(ctx);
7318	if (state->plug_started)
7319		blk_finish_plug(&state->plug);
7320}
7321
7322/*
7323 * Start submission side cache.
7324 */
7325static void io_submit_state_start(struct io_submit_state *state,
7326				  unsigned int max_ios)
7327{
7328	state->plug_started = false;
7329	state->ios_left = max_ios;
7330	/* set only head, no need to init link_last in advance */
7331	state->link.head = NULL;
7332}
7333
7334static void io_commit_sqring(struct io_ring_ctx *ctx)
7335{
7336	struct io_rings *rings = ctx->rings;
7337
7338	/*
7339	 * Ensure any loads from the SQEs are done at this point,
7340	 * since once we write the new head, the application could
7341	 * write new data to them.
7342	 */
7343	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
7344}
7345
7346/*
7347 * Fetch an sqe, if one is available. Note this returns a pointer to memory
7348 * that is mapped by userspace. This means that care needs to be taken to
7349 * ensure that reads are stable, as we cannot rely on userspace always
7350 * being a good citizen. If members of the sqe are validated and then later
7351 * used, it's important that those reads are done through READ_ONCE() to
7352 * prevent a re-load down the line.
7353 */
7354static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
7355{
7356	unsigned head, mask = ctx->sq_entries - 1;
7357	unsigned sq_idx = ctx->cached_sq_head++ & mask;
7358
7359	/*
7360	 * The cached sq head (or cq tail) serves two purposes:
7361	 *
7362	 * 1) allows us to batch the cost of the user visible head
7363	 *    updates.
7364	 * 2) allows the kernel side to track the head on its own, even
7365	 *    though the application is the one updating it.
7366	 */
7367	head = READ_ONCE(ctx->sq_array[sq_idx]);
7368	if (likely(head < ctx->sq_entries))
7369		return &ctx->sq_sqes[head];
7370
7371	/* drop invalid entries */
7372	ctx->cq_extra--;
7373	WRITE_ONCE(ctx->rings->sq_dropped,
7374		   READ_ONCE(ctx->rings->sq_dropped) + 1);
7375	return NULL;
7376}
7377
7378static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
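/*
 * Pull up to @nr SQEs off the SQ ring and submit them. Returns the
 * number of requests that will complete (even if only by failing), or
 * -EAGAIN if nothing could be submitted.
 */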
7379	__must_hold(&ctx->uring_lock)
7380{
7381	int submitted = 0;
7382
7383	/* make sure SQ entry isn't read before tail */
7384	nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
7385	if (!percpu_ref_tryget_many(&ctx->refs, nr))
7386		return -EAGAIN;
7387	io_get_task_refs(nr);
7388
7389	io_submit_state_start(&ctx->submit_state, nr);
7390	while (submitted < nr) {
7391		const struct io_uring_sqe *sqe;
7392		struct io_kiocb *req;
7393
7394		req = io_alloc_req(ctx);
7395		if (unlikely(!req)) {
7396			if (!submitted)
7397				submitted = -EAGAIN;
7398			break;
7399		}
7400		sqe = io_get_sqe(ctx);
7401		if (unlikely(!sqe)) {
7402			list_add(&req->inflight_entry, &ctx->submit_state.free_list);
7403			break;
7404		}
7405		/* will complete beyond this point, count as submitted */
7406		submitted++;
7407		if (io_submit_sqe(ctx, req, sqe))
7408			break;
7409	}
7410
7411	if (unlikely(submitted != nr)) {
7412		int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
7413		int unused = nr - ref_used;
7414
7415		current->io_uring->cached_refs += unused;
7416		percpu_ref_put_many(&ctx->refs, unused);
7417	}
7418
7419	io_submit_state_end(&ctx->submit_state, ctx);
7420	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
7421	io_commit_sqring(ctx);
7422
7423	return submitted;
7424}
7425
7426static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
7427{
7428	return READ_ONCE(sqd->state);
7429}
7430
7431static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
7432{
7433	/* Tell userspace we may need a wakeup call */
7434	spin_lock(&ctx->completion_lock);
7435	WRITE_ONCE(ctx->rings->sq_flags,
7436		   ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
7437	spin_unlock(&ctx->completion_lock);
7438}
7439
7440static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
7441{
7442	spin_lock(&ctx->completion_lock);
7443	WRITE_ONCE(ctx->rings->sq_flags,
7444		   ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
7445	spin_unlock(&ctx->completion_lock);
7446}
7447
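/*
 * One SQPOLL pass over a single ring: reap pending iopoll completions
 * and submit new SQEs (optionally capped for fairness) under the ring's
 * uring_lock, using the ring's saved credentials.
 */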
7448static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
7449{
7450	unsigned int to_submit;
7451	int ret = 0;
7452
7453	to_submit = io_sqring_entries(ctx);
7454	/* if we're handling multiple rings, cap submit size for fairness */
7455	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
7456		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
7457
7458	if (!list_empty(&ctx->iopoll_list) || to_submit) {
7459		unsigned nr_events = 0;
7460		const struct cred *creds = NULL;
7461
7462		if (ctx->sq_creds != current_cred())
7463			creds = override_creds(ctx->sq_creds);
7464
7465		mutex_lock(&ctx->uring_lock);
7466		if (!list_empty(&ctx->iopoll_list))
7467			io_do_iopoll(ctx, &nr_events, 0);
7468
7469		/*
7470		 * Don't submit if refs are dying, good for io_uring_register(),
7471		 * but it is also relied upon by io_ring_exit_work()
7472		 */
7473		if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
7474		    !(ctx->flags & IORING_SETUP_R_DISABLED))
7475			ret = io_submit_sqes(ctx, to_submit);
7476		mutex_unlock(&ctx->uring_lock);
7477
7478		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
7479			wake_up(&ctx->sqo_sq_wait);
7480		if (creds)
7481			revert_creds(creds);
7482	}
7483
7484	return ret;
7485}
7486
7487static void io_sqd_update_thread_idle(struct io_sq_data *sqd)
7488{
7489	struct io_ring_ctx *ctx;
7490	unsigned sq_thread_idle = 0;
7491
7492	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7493		sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle);
7494	sqd->sq_thread_idle = sq_thread_idle;
7495}
7496
7497static bool io_sqd_handle_event(struct io_sq_data *sqd)
7498{
7499	bool did_sig = false;
7500	struct ksignal ksig;
7501
7502	if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) ||
7503	    signal_pending(current)) {
7504		mutex_unlock(&sqd->lock);
7505		if (signal_pending(current))
7506			did_sig = get_signal(&ksig);
7507		cond_resched();
7508		mutex_lock(&sqd->lock);
7509	}
7510	return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
7511}
7512
7513static int io_sq_thread(void *data)
7514{
7515	struct io_sq_data *sqd = data;
7516	struct io_ring_ctx *ctx;
7517	unsigned long timeout = 0;
7518	char buf[TASK_COMM_LEN];
7519	DEFINE_WAIT(wait);
7520
7521	snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
7522	set_task_comm(current, buf);
7523
7524	if (sqd->sq_cpu != -1)
7525		set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
7526	else
7527		set_cpus_allowed_ptr(current, cpu_online_mask);
7528	current->flags |= PF_NO_SETAFFINITY;
7529
7530	mutex_lock(&sqd->lock);
7531	while (1) {
7532		bool cap_entries, sqt_spin = false;
7533
7534		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
7535			if (io_sqd_handle_event(sqd))
7536				break;
7537			timeout = jiffies + sqd->sq_thread_idle;
7538		}
7539
7540		cap_entries = !list_is_singular(&sqd->ctx_list);
7541		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7542			int ret = __io_sq_thread(ctx, cap_entries);
7543
7544			if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list)))
7545				sqt_spin = true;
7546		}
7547		if (io_run_task_work())
7548			sqt_spin = true;
7549
7550		if (sqt_spin || !time_after(jiffies, timeout)) {
7551			cond_resched();
7552			if (sqt_spin)
7553				timeout = jiffies + sqd->sq_thread_idle;
7554			continue;
7555		}
7556
7557		prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
7558		if (!io_sqd_events_pending(sqd) && !current->task_works) {
7559			bool needs_sched = true;
7560
7561			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
7562				io_ring_set_wakeup_flag(ctx);
7563
7564				if ((ctx->flags & IORING_SETUP_IOPOLL) &&
7565				    !list_empty_careful(&ctx->iopoll_list)) {
7566					needs_sched = false;
7567					break;
7568				}
7569				if (io_sqring_entries(ctx)) {
7570					needs_sched = false;
7571					break;
7572				}
7573			}
7574
7575			if (needs_sched) {
7576				mutex_unlock(&sqd->lock);
7577				schedule();
7578				mutex_lock(&sqd->lock);
7579			}
7580			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7581				io_ring_clear_wakeup_flag(ctx);
7582		}
7583
7584		finish_wait(&sqd->wait, &wait);
7585		timeout = jiffies + sqd->sq_thread_idle;
7586	}
7587
7588	io_uring_cancel_generic(true, sqd);
7589	sqd->thread = NULL;
7590	list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
7591		io_ring_set_wakeup_flag(ctx);
7592	io_run_task_work();
7593	mutex_unlock(&sqd->lock);
7594
7595	complete(&sqd->exited);
7596	do_exit(0);
7597}
7598
7599struct io_wait_queue {
7600	struct wait_queue_entry wq;
7601	struct io_ring_ctx *ctx;
7602	unsigned cq_tail;
7603	unsigned nr_timeouts;
7604};
7605
7606static inline bool io_should_wake(struct io_wait_queue *iowq)
7607{
7608	struct io_ring_ctx *ctx = iowq->ctx;
7609	int dist = ctx->cached_cq_tail - (int) iowq->cq_tail;
7610
7611	/*
7612	 * Wake up if we have enough events, or if a timeout occurred since we
7613	 * started waiting. For timeouts, we always want to return to userspace,
7614	 * regardless of event count.
7615	 */
7616	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
7617}
7618
7619static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
7620			    int wake_flags, void *key)
7621{
7622	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
7623							wq);
7624
7625	/*
7626	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
7627	 * the task, and the next invocation will do it.
7628	 */
7629	if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow))
7630		return autoremove_wake_function(curr, mode, wake_flags, key);
7631	return -1;
7632}
7633
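/*
 * Run pending task_work, if any. Returns 1 if task_work was run, 0 if
 * there is nothing pending, or a signal related error (-ERESTARTSYS or
 * -EINTR) if a signal needs handling.
 */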
7634static int io_run_task_work_sig(void)
7635{
7636	if (io_run_task_work())
7637		return 1;
7638	if (!signal_pending(current))
7639		return 0;
7640	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
7641		return -ERESTARTSYS;
7642	return -EINTR;
7643}
7644
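/* True if the current task has io_uring requests in flight */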
7645static bool current_pending_io(void)
7646{
7647	struct io_uring_task *tctx = current->io_uring;
7648
7649	if (!tctx)
7650		return false;
7651	return percpu_counter_read_positive(&tctx->inflight);
7652}
7653
7654/* when returns >0, the caller should retry */
7655static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
7656					  struct io_wait_queue *iowq,
7657					  ktime_t *timeout)
7658{
7659	int io_wait, ret;
7660
7661	/* make sure we run task_work before checking for signals */
7662	ret = io_run_task_work_sig();
7663	if (ret || io_should_wake(iowq))
7664		return ret;
7665	/* let the caller flush overflows, retry */
7666	if (test_bit(0, &ctx->check_cq_overflow))
7667		return 1;
7668
7669	/*
7670	 * Mark us as being in io_wait if we have pending requests, so cpufreq
7671	 * can take into account that the task is waiting for IO - turns out
7672	 * to be important for low QD IO.
7673	 */
7674	io_wait = current->in_iowait;
7675	if (current_pending_io())
7676		current->in_iowait = 1;
7677	ret = 1;
7678	if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS))
7679		ret = -ETIME;
7680	current->in_iowait = io_wait;
7681	return ret;
7682}
7683
7684/*
7685 * Wait until events become available, if we don't already have some. The
7686 * application must reap them itself, as they reside on the shared cq ring.
7687 */
7688static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
7689			  const sigset_t __user *sig, size_t sigsz,
7690			  struct __kernel_timespec __user *uts)
7691{
7692	struct io_wait_queue iowq;
7693	struct io_rings *rings = ctx->rings;
7694	ktime_t timeout = KTIME_MAX;
7695	int ret;
7696
7697	do {
7698		io_cqring_overflow_flush(ctx);
7699		if (io_cqring_events(ctx) >= min_events)
7700			return 0;
7701		if (!io_run_task_work())
7702			break;
7703	} while (1);
7704
7705	if (uts) {
7706		struct timespec64 ts;
7707
7708		if (get_timespec64(&ts, uts))
7709			return -EFAULT;
7710		timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
7711	}
7712
7713	if (sig) {
7714#ifdef CONFIG_COMPAT
7715		if (in_compat_syscall())
7716			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
7717						      sigsz);
7718		else
7719#endif
7720			ret = set_user_sigmask(sig, sigsz);
7721
7722		if (ret)
7723			return ret;
7724	}
7725
7726	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
7727	iowq.wq.private = current;
7728	INIT_LIST_HEAD(&iowq.wq.entry);
7729	iowq.ctx = ctx;
7730	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
7731	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
7732
7733	trace_io_uring_cqring_wait(ctx, min_events);
7734	do {
7735		/* if we can't even flush overflow, don't wait for more */
7736		if (!io_cqring_overflow_flush(ctx)) {
7737			ret = -EBUSY;
7738			break;
7739		}
7740		prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
7741						TASK_INTERRUPTIBLE);
7742		ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
7743		finish_wait(&ctx->cq_wait, &iowq.wq);
7744		cond_resched();
7745	} while (ret > 0);
7746
7747	restore_saved_sigmask_unless(ret == -EINTR);
7748
7749	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
7750}
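
/*
 * For reference, a minimal userspace sketch of the reaping that the comment
 * above io_cqring_wait() refers to, using liburing's public helpers (assumes
 * liburing is installed; handle_cqe() is a hypothetical application callback;
 * illustrative only, not kernel code):
 *
 *	#include <liburing.h>
 *
 *	struct io_uring ring;
 *	struct io_uring_cqe *cqe;
 *
 *	io_uring_queue_init(8, &ring, 0);
 *	io_uring_prep_nop(io_uring_get_sqe(&ring));
 *	io_uring_submit_and_wait(&ring, 1);		// kernel side ends up in io_cqring_wait()
 *	while (io_uring_peek_cqe(&ring, &cqe) == 0) {
 *		handle_cqe(cqe->user_data, cqe->res);	// hypothetical
 *		io_uring_cqe_seen(&ring, cqe);		// advances the shared CQ head
 *	}
 *	io_uring_queue_exit(&ring);
 */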
7751
7752static void io_free_page_table(void **table, size_t size)
7753{
7754	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7755
7756	for (i = 0; i < nr_tables; i++)
7757		kfree(table[i]);
7758	kfree(table);
7759}
7760
7761static void **io_alloc_page_table(size_t size)
7762{
7763	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
7764	size_t init_size = size;
7765	void **table;
7766
7767	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
7768	if (!table)
7769		return NULL;
7770
7771	for (i = 0; i < nr_tables; i++) {
7772		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
7773
7774		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
7775		if (!table[i]) {
7776			io_free_page_table(table, init_size);
7777			return NULL;
7778		}
7779		size -= this_size;
7780	}
7781	return table;
7782}
7783
7784static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
7785{
7786	percpu_ref_exit(&ref_node->refs);
7787	kfree(ref_node);
7788}
7789
7790static void io_rsrc_node_ref_zero(struct percpu_ref *ref)
7791{
7792	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
7793	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
7794	unsigned long flags;
7795	bool first_add = false;
7796	unsigned long delay = HZ;
7797
7798	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
7799	node->done = true;
7800
7801	/* if we are mid-quiesce then do not delay */
7802	if (node->rsrc_data->quiesce)
7803		delay = 0;
7804
7805	while (!list_empty(&ctx->rsrc_ref_list)) {
7806		node = list_first_entry(&ctx->rsrc_ref_list,
7807					    struct io_rsrc_node, node);
7808		/* recycle ref nodes in order */
7809		if (!node->done)
7810			break;
7811		list_del(&node->node);
7812		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
7813	}
7814	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
7815
7816	if (first_add)
7817		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
7818}
7819
7820static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
7821{
7822	struct io_rsrc_node *ref_node;
7823
7824	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
7825	if (!ref_node)
7826		return NULL;
7827
7828	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
7829			    0, GFP_KERNEL)) {
7830		kfree(ref_node);
7831		return NULL;
7832	}
7833	INIT_LIST_HEAD(&ref_node->node);
7834	INIT_LIST_HEAD(&ref_node->rsrc_list);
7835	ref_node->done = false;
7836	return ref_node;
7837}
7838
7839static void io_rsrc_node_switch(struct io_ring_ctx *ctx,
7840				struct io_rsrc_data *data_to_kill)
7841{
7842	WARN_ON_ONCE(!ctx->rsrc_backup_node);
7843	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
7844
7845	if (data_to_kill) {
7846		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
7847
7848		rsrc_node->rsrc_data = data_to_kill;
7849		spin_lock_irq(&ctx->rsrc_ref_lock);
7850		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
7851		spin_unlock_irq(&ctx->rsrc_ref_lock);
7852
7853		atomic_inc(&data_to_kill->refs);
7854		percpu_ref_kill(&rsrc_node->refs);
7855		ctx->rsrc_node = NULL;
7856	}
7857
7858	if (!ctx->rsrc_node) {
7859		ctx->rsrc_node = ctx->rsrc_backup_node;
7860		ctx->rsrc_backup_node = NULL;
7861	}
7862}
7863
7864static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
7865{
7866	if (ctx->rsrc_backup_node)
7867		return 0;
7868	ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx);
7869	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
7870}
7871
7872static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx)
7873{
7874	int ret;
7875
7876	/* As we may drop ->uring_lock, another task may have started a quiesce */
7877	if (data->quiesce)
7878		return -ENXIO;
7879
7880	data->quiesce = true;
7881	do {
7882		ret = io_rsrc_node_switch_start(ctx);
7883		if (ret)
7884			break;
7885		io_rsrc_node_switch(ctx, data);
7886
7887		/* kill initial ref, already quiesced if zero */
7888		if (atomic_dec_and_test(&data->refs))
7889			break;
7890		mutex_unlock(&ctx->uring_lock);
7891		flush_delayed_work(&ctx->rsrc_put_work);
7892		ret = wait_for_completion_interruptible(&data->done);
7893		if (!ret) {
7894			mutex_lock(&ctx->uring_lock);
7895			if (atomic_read(&data->refs) > 0) {
7896				/*
7897				 * it has been revived by another thread while
7898				 * we were unlocked
7899				 */
7900				mutex_unlock(&ctx->uring_lock);
7901			} else {
7902				break;
7903			}
7904		}
7905
7906		atomic_inc(&data->refs);
7907		/* wait for all work items potentially completing data->done */
7908		flush_delayed_work(&ctx->rsrc_put_work);
7909		reinit_completion(&data->done);
7910
7911		ret = io_run_task_work_sig();
7912		mutex_lock(&ctx->uring_lock);
7913	} while (ret >= 0);
7914	data->quiesce = false;
7915
7916	return ret;
7917}
7918
7919static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
7920{
7921	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
7922	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
7923
7924	return &data->tags[table_idx][off];
7925}
7926
7927static void io_rsrc_data_free(struct io_rsrc_data *data)
7928{
7929	size_t size = data->nr * sizeof(data->tags[0][0]);
7930
7931	if (data->tags)
7932		io_free_page_table((void **)data->tags, size);
7933	kfree(data);
7934}
7935
7936static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put,
7937			      u64 __user *utags, unsigned nr,
7938			      struct io_rsrc_data **pdata)
7939{
7940	struct io_rsrc_data *data;
7941	int ret = -ENOMEM;
7942	unsigned i;
7943
7944	data = kzalloc(sizeof(*data), GFP_KERNEL);
7945	if (!data)
7946		return -ENOMEM;
7947	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
7948	if (!data->tags) {
7949		kfree(data);
7950		return -ENOMEM;
7951	}
7952
7953	data->nr = nr;
7954	data->ctx = ctx;
7955	data->do_put = do_put;
7956	if (utags) {
7957		ret = -EFAULT;
7958		for (i = 0; i < nr; i++) {
7959			u64 *tag_slot = io_get_tag_slot(data, i);
7960
7961			if (copy_from_user(tag_slot, &utags[i],
7962					   sizeof(*tag_slot)))
7963				goto fail;
7964		}
7965	}
7966
7967	atomic_set(&data->refs, 1);
7968	init_completion(&data->done);
7969	*pdata = data;
7970	return 0;
7971fail:
7972	io_rsrc_data_free(data);
7973	return ret;
7974}
7975
7976static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
7977{
7978	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
7979				GFP_KERNEL_ACCOUNT);
7980	return !!table->files;
7981}
7982
7983static void io_free_file_tables(struct io_file_table *table)
7984{
7985	kvfree(table->files);
7986	table->files = NULL;
7987}
7988
7989static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
7990{
7991#if defined(CONFIG_UNIX)
7992	if (ctx->ring_sock) {
7993		struct sock *sock = ctx->ring_sock->sk;
7994		struct sk_buff *skb;
7995
7996		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
7997			kfree_skb(skb);
7998	}
7999#else
8000	int i;
8001
8002	for (i = 0; i < ctx->nr_user_files; i++) {
8003		struct file *file;
8004
8005		file = io_file_from_index(ctx, i);
8006		if (file)
8007			fput(file);
8008	}
8009#endif
8010	io_free_file_tables(&ctx->file_table);
8011	io_rsrc_data_free(ctx->file_data);
8012	ctx->file_data = NULL;
8013	ctx->nr_user_files = 0;
8014}
8015
8016static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
8017{
8018	unsigned nr = ctx->nr_user_files;
8019	int ret;
8020
8021	if (!ctx->file_data)
8022		return -ENXIO;
8023
8024	/*
8025	 * Quiesce may unlock ->uring_lock; while it's not held,
8026	 * prevent new requests from using the table.
8027	 */
8028	ctx->nr_user_files = 0;
8029	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
8030	ctx->nr_user_files = nr;
8031	if (!ret)
8032		__io_sqe_files_unregister(ctx);
8033	return ret;
8034}
8035
8036static void io_sq_thread_unpark(struct io_sq_data *sqd)
8037	__releases(&sqd->lock)
8038{
8039	WARN_ON_ONCE(sqd->thread == current);
8040
8041	/*
8042	 * Do the dance, but don't use a conditional clear_bit(), because it'd
8043	 * race with other threads incrementing park_pending and setting the bit.
8044	 */
8045	clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8046	if (atomic_dec_return(&sqd->park_pending))
8047		set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8048	mutex_unlock(&sqd->lock);
8049}
8050
8051static void io_sq_thread_park(struct io_sq_data *sqd)
8052	__acquires(&sqd->lock)
8053{
8054	WARN_ON_ONCE(sqd->thread == current);
8055
8056	atomic_inc(&sqd->park_pending);
8057	set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
8058	mutex_lock(&sqd->lock);
8059	if (sqd->thread)
8060		wake_up_process(sqd->thread);
8061}
8062
8063static void io_sq_thread_stop(struct io_sq_data *sqd)
8064{
8065	WARN_ON_ONCE(sqd->thread == current);
8066	WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
8067
8068	set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
8069	mutex_lock(&sqd->lock);
8070	if (sqd->thread)
8071		wake_up_process(sqd->thread);
8072	mutex_unlock(&sqd->lock);
8073	wait_for_completion(&sqd->exited);
8074}
8075
8076static void io_put_sq_data(struct io_sq_data *sqd)
8077{
8078	if (refcount_dec_and_test(&sqd->refs)) {
8079		WARN_ON_ONCE(atomic_read(&sqd->park_pending));
8080
8081		io_sq_thread_stop(sqd);
8082		kfree(sqd);
8083	}
8084}
8085
8086static void io_sq_thread_finish(struct io_ring_ctx *ctx)
8087{
8088	struct io_sq_data *sqd = ctx->sq_data;
8089
8090	if (sqd) {
8091		io_sq_thread_park(sqd);
8092		list_del_init(&ctx->sqd_list);
8093		io_sqd_update_thread_idle(sqd);
8094		io_sq_thread_unpark(sqd);
8095
8096		io_put_sq_data(sqd);
8097		ctx->sq_data = NULL;
8098	}
8099}
8100
8101static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p)
8102{
8103	struct io_ring_ctx *ctx_attach;
8104	struct io_sq_data *sqd;
8105	struct fd f;
8106
8107	f = fdget(p->wq_fd);
8108	if (!f.file)
8109		return ERR_PTR(-ENXIO);
8110	if (f.file->f_op != &io_uring_fops) {
8111		fdput(f);
8112		return ERR_PTR(-EINVAL);
8113	}
8114
8115	ctx_attach = f.file->private_data;
8116	sqd = ctx_attach->sq_data;
8117	if (!sqd) {
8118		fdput(f);
8119		return ERR_PTR(-EINVAL);
8120	}
8121	if (sqd->task_tgid != current->tgid) {
8122		fdput(f);
8123		return ERR_PTR(-EPERM);
8124	}
8125
8126	refcount_inc(&sqd->refs);
8127	fdput(f);
8128	return sqd;
8129}
8130
8131static struct io_sq_data *io_get_sq_data(struct io_uring_params *p,
8132					 bool *attached)
8133{
8134	struct io_sq_data *sqd;
8135
8136	*attached = false;
8137	if (p->flags & IORING_SETUP_ATTACH_WQ) {
8138		sqd = io_attach_sq_data(p);
8139		if (!IS_ERR(sqd)) {
8140			*attached = true;
8141			return sqd;
8142		}
8143		/* fall through for the EPERM case, set up a new sqd/task */
8144		if (PTR_ERR(sqd) != -EPERM)
8145			return sqd;
8146	}
8147
8148	sqd = kzalloc(sizeof(*sqd), GFP_KERNEL);
8149	if (!sqd)
8150		return ERR_PTR(-ENOMEM);
8151
8152	atomic_set(&sqd->park_pending, 0);
8153	refcount_set(&sqd->refs, 1);
8154	INIT_LIST_HEAD(&sqd->ctx_list);
8155	mutex_init(&sqd->lock);
8156	init_waitqueue_head(&sqd->wait);
8157	init_completion(&sqd->exited);
8158	return sqd;
8159}
8160
8161#if defined(CONFIG_UNIX)
8162/*
8163 * Ensure the UNIX gc is aware of our file set, so we are certain that
8164 * the io_uring can be safely unregistered on process exit, even if we have
8165 * reference loops among the files.
8166 */
8167static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
8168{
8169	struct sock *sk = ctx->ring_sock->sk;
8170	struct scm_fp_list *fpl;
8171	struct sk_buff *skb;
8172	int i, nr_files;
8173
8174	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
8175	if (!fpl)
8176		return -ENOMEM;
8177
8178	skb = alloc_skb(0, GFP_KERNEL);
8179	if (!skb) {
8180		kfree(fpl);
8181		return -ENOMEM;
8182	}
8183
8184	skb->sk = sk;
8185	skb->scm_io_uring = 1;
8186
8187	nr_files = 0;
8188	fpl->user = get_uid(current_user());
8189	for (i = 0; i < nr; i++) {
8190		struct file *file = io_file_from_index(ctx, i + offset);
8191
8192		if (!file)
8193			continue;
8194		fpl->fp[nr_files] = get_file(file);
8195		unix_inflight(fpl->user, fpl->fp[nr_files]);
8196		nr_files++;
8197	}
8198
8199	if (nr_files) {
8200		fpl->max = SCM_MAX_FD;
8201		fpl->count = nr_files;
8202		UNIXCB(skb).fp = fpl;
8203		skb->destructor = unix_destruct_scm;
8204		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
8205		skb_queue_head(&sk->sk_receive_queue, skb);
8206
8207		for (i = 0; i < nr; i++) {
8208			struct file *file = io_file_from_index(ctx, i + offset);
8209
8210			if (file)
8211				fput(file);
8212		}
8213	} else {
8214		kfree_skb(skb);
8215		free_uid(fpl->user);
8216		kfree(fpl);
8217	}
8218
8219	return 0;
8220}
8221
8222/*
8223 * If UNIX sockets are enabled, fd passing can cause a reference cycle which
8224 * causes regular reference counting to break down. We rely on the UNIX
8225 * garbage collection to take care of this problem for us.
8226 */
8227static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8228{
8229	unsigned left, total;
8230	int ret = 0;
8231
8232	total = 0;
8233	left = ctx->nr_user_files;
8234	while (left) {
8235		unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
8236
8237		ret = __io_sqe_files_scm(ctx, this_files, total);
8238		if (ret)
8239			break;
8240		left -= this_files;
8241		total += this_files;
8242	}
8243
8244	if (!ret)
8245		return 0;
8246
8247	while (total < ctx->nr_user_files) {
8248		struct file *file = io_file_from_index(ctx, total);
8249
8250		if (file)
8251			fput(file);
8252		total++;
8253	}
8254
8255	return ret;
8256}
8257#else
8258static int io_sqe_files_scm(struct io_ring_ctx *ctx)
8259{
8260	return 0;
8261}
8262#endif
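
/*
 * The reference cycles described above come from plain SCM_RIGHTS fd passing.
 * A minimal userspace sketch of that mechanism (standard socket API;
 * fd_to_pass is a hypothetical descriptor, e.g. an io_uring fd; illustrative
 * only, not kernel code):
 *
 *	int sv[2], fd_to_pass = ...;
 *	char cbuf[CMSG_SPACE(sizeof(int))] = { 0 }, dummy = 'x';
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
 *	};
 *	struct cmsghdr *cm;
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	sendmsg(sv[0], &msg, 0);	// fd is now "in flight"; only the UNIX gc can untangle cycles
 */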
8263
8264static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8265{
8266	struct file *file = prsrc->file;
8267#if defined(CONFIG_UNIX)
8268	struct sock *sock = ctx->ring_sock->sk;
8269	struct sk_buff_head list, *head = &sock->sk_receive_queue;
8270	struct sk_buff *skb;
8271	int i;
8272
8273	__skb_queue_head_init(&list);
8274
8275	/*
8276	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
8277	 * remove this entry and rearrange the file array.
8278	 */
8279	skb = skb_dequeue(head);
8280	while (skb) {
8281		struct scm_fp_list *fp;
8282
8283		fp = UNIXCB(skb).fp;
8284		for (i = 0; i < fp->count; i++) {
8285			int left;
8286
8287			if (fp->fp[i] != file)
8288				continue;
8289
8290			unix_notinflight(fp->user, fp->fp[i]);
8291			left = fp->count - 1 - i;
8292			if (left) {
8293				memmove(&fp->fp[i], &fp->fp[i + 1],
8294						left * sizeof(struct file *));
8295			}
8296			fp->count--;
8297			if (!fp->count) {
8298				kfree_skb(skb);
8299				skb = NULL;
8300			} else {
8301				__skb_queue_tail(&list, skb);
8302			}
8303			fput(file);
8304			file = NULL;
8305			break;
8306		}
8307
8308		if (!file)
8309			break;
8310
8311		__skb_queue_tail(&list, skb);
8312
8313		skb = skb_dequeue(head);
8314	}
8315
8316	if (skb_peek(&list)) {
8317		spin_lock_irq(&head->lock);
8318		while ((skb = __skb_dequeue(&list)) != NULL)
8319			__skb_queue_tail(head, skb);
8320		spin_unlock_irq(&head->lock);
8321	}
8322#else
8323	fput(file);
8324#endif
8325}
8326
8327static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
8328{
8329	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
8330	struct io_ring_ctx *ctx = rsrc_data->ctx;
8331	struct io_rsrc_put *prsrc, *tmp;
8332
8333	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
8334		list_del(&prsrc->list);
8335
8336		if (prsrc->tag) {
8337			bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL;
8338
8339			io_ring_submit_lock(ctx, lock_ring);
8340			spin_lock(&ctx->completion_lock);
8341			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
8342			io_commit_cqring(ctx);
8343			spin_unlock(&ctx->completion_lock);
8344			io_cqring_ev_posted(ctx);
8345			io_ring_submit_unlock(ctx, lock_ring);
8346		}
8347
8348		rsrc_data->do_put(ctx, prsrc);
8349		kfree(prsrc);
8350	}
8351
8352	io_rsrc_node_destroy(ref_node);
8353	if (atomic_dec_and_test(&rsrc_data->refs))
8354		complete(&rsrc_data->done);
8355}
8356
8357static void io_rsrc_put_work(struct work_struct *work)
8358{
8359	struct io_ring_ctx *ctx;
8360	struct llist_node *node;
8361
8362	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
8363	node = llist_del_all(&ctx->rsrc_put_llist);
8364
8365	while (node) {
8366		struct io_rsrc_node *ref_node;
8367		struct llist_node *next = node->next;
8368
8369		ref_node = llist_entry(node, struct io_rsrc_node, llist);
8370		__io_rsrc_put_work(ref_node);
8371		node = next;
8372	}
8373}
8374
8375static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
8376				 unsigned nr_args, u64 __user *tags)
8377{
8378	__s32 __user *fds = (__s32 __user *) arg;
8379	struct file *file;
8380	int fd, ret;
8381	unsigned i;
8382
8383	if (ctx->file_data)
8384		return -EBUSY;
8385	if (!nr_args)
8386		return -EINVAL;
8387	if (nr_args > IORING_MAX_FIXED_FILES)
8388		return -EMFILE;
8389	if (nr_args > rlimit(RLIMIT_NOFILE))
8390		return -EMFILE;
8391	ret = io_rsrc_node_switch_start(ctx);
8392	if (ret)
8393		return ret;
8394	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
8395				 &ctx->file_data);
8396	if (ret)
8397		return ret;
8398
8399	ret = -ENOMEM;
8400	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
8401		goto out_free;
8402
8403	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
8404		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
8405			ret = -EFAULT;
8406			goto out_fput;
8407		}
8408		/* allow sparse sets */
8409		if (fd == -1) {
8410			ret = -EINVAL;
8411			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
8412				goto out_fput;
8413			continue;
8414		}
8415
8416		file = fget(fd);
8417		ret = -EBADF;
8418		if (unlikely(!file))
8419			goto out_fput;
8420
8421		/*
8422		 * Don't allow io_uring instances to be registered. If UNIX
8423		 * isn't enabled, then this causes a reference cycle and this
8424		 * instance can never get freed. If UNIX is enabled we'll
8425		 * handle it just fine, but there's still no point in allowing
8426		 * a ring fd as it doesn't support regular read/write anyway.
8427		 */
8428		if (file->f_op == &io_uring_fops) {
8429			fput(file);
8430			goto out_fput;
8431		}
8432		io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file);
8433	}
8434
8435	ret = io_sqe_files_scm(ctx);
8436	if (ret) {
8437		__io_sqe_files_unregister(ctx);
8438		return ret;
8439	}
8440
8441	io_rsrc_node_switch(ctx, NULL);
8442	return ret;
8443out_fput:
8444	for (i = 0; i < ctx->nr_user_files; i++) {
8445		file = io_file_from_index(ctx, i);
8446		if (file)
8447			fput(file);
8448	}
8449	io_free_file_tables(&ctx->file_table);
8450	ctx->nr_user_files = 0;
8451out_free:
8452	io_rsrc_data_free(ctx->file_data);
8453	ctx->file_data = NULL;
8454	return ret;
8455}
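
/*
 * A minimal userspace sketch of the registration this function implements
 * (IORING_REGISTER_FILES), using liburing (assumed available; &ring is an
 * initialized struct io_uring, paths are hypothetical; illustrative only,
 * not kernel code). -1 entries create the sparse slots handled above, and
 * registering the ring fd itself is rejected, as the comment above explains:
 *
 *	int fds[4] = { open("a", O_RDONLY), open("b", O_RDONLY), -1, -1 };
 *
 *	io_uring_register_files(&ring, fds, 4);
 *	// requests can now set IOSQE_FIXED_FILE and index into this table
 */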
8456
8457static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
8458				 struct io_rsrc_node *node, void *rsrc)
8459{
8460	u64 *tag_slot = io_get_tag_slot(data, idx);
8461	struct io_rsrc_put *prsrc;
8462
8463	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
8464	if (!prsrc)
8465		return -ENOMEM;
8466
8467	prsrc->tag = *tag_slot;
8468	*tag_slot = 0;
8469	prsrc->rsrc = rsrc;
8470	list_add(&prsrc->list, &node->rsrc_list);
8471	return 0;
8472}
8473
8474static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
8475				 unsigned int issue_flags, u32 slot_index)
8476{
8477	struct io_ring_ctx *ctx = req->ctx;
8478	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
8479	bool needs_switch = false;
8480	struct io_fixed_file *file_slot;
8481	int ret = -EBADF;
8482
8483	io_ring_submit_lock(ctx, !force_nonblock);
8484	if (file->f_op == &io_uring_fops)
8485		goto err;
8486	ret = -ENXIO;
8487	if (!ctx->file_data)
8488		goto err;
8489	ret = -EINVAL;
8490	if (slot_index >= ctx->nr_user_files)
8491		goto err;
8492
8493	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
8494	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
8495
8496	if (file_slot->file_ptr) {
8497		struct file *old_file;
8498
8499		ret = io_rsrc_node_switch_start(ctx);
8500		if (ret)
8501			goto err;
8502
8503		old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8504		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
8505					    ctx->rsrc_node, old_file);
8506		if (ret)
8507			goto err;
8508		file_slot->file_ptr = 0;
8509		needs_switch = true;
8510	}
8511
8512	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
8513	io_fixed_file_set(file_slot, file);
8514	ret = 0;
8515err:
8516	if (needs_switch)
8517		io_rsrc_node_switch(ctx, ctx->file_data);
8518	io_ring_submit_unlock(ctx, !force_nonblock);
8519	if (ret)
8520		fput(file);
8521	return ret;
8522}
8523
8524static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
8525{
8526	unsigned int offset = req->close.file_slot - 1;
8527	struct io_ring_ctx *ctx = req->ctx;
8528	struct io_fixed_file *file_slot;
8529	struct file *file;
8530	int ret;
8531
8532	io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8533	ret = -ENXIO;
8534	if (unlikely(!ctx->file_data))
8535		goto out;
8536	ret = -EINVAL;
8537	if (offset >= ctx->nr_user_files)
8538		goto out;
8539	ret = io_rsrc_node_switch_start(ctx);
8540	if (ret)
8541		goto out;
8542
8543	offset = array_index_nospec(offset, ctx->nr_user_files);
8544	file_slot = io_fixed_file_slot(&ctx->file_table, offset);
8545	ret = -EBADF;
8546	if (!file_slot->file_ptr)
8547		goto out;
8548
8549	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8550	ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
8551	if (ret)
8552		goto out;
8553
8554	file_slot->file_ptr = 0;
8555	io_rsrc_node_switch(ctx, ctx->file_data);
8556	ret = 0;
8557out:
8558	io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
8559	return ret;
8560}
8561
8562static int __io_sqe_files_update(struct io_ring_ctx *ctx,
8563				 struct io_uring_rsrc_update2 *up,
8564				 unsigned nr_args)
8565{
8566	u64 __user *tags = u64_to_user_ptr(up->tags);
8567	__s32 __user *fds = u64_to_user_ptr(up->data);
8568	struct io_rsrc_data *data = ctx->file_data;
8569	struct io_fixed_file *file_slot;
8570	struct file *file;
8571	int fd, i, err = 0;
8572	unsigned int done;
8573	bool needs_switch = false;
8574
8575	if (!ctx->file_data)
8576		return -ENXIO;
8577	if (up->offset + nr_args > ctx->nr_user_files)
8578		return -EINVAL;
8579
8580	for (done = 0; done < nr_args; done++) {
8581		u64 tag = 0;
8582
8583		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
8584		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
8585			err = -EFAULT;
8586			break;
8587		}
8588		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
8589			err = -EINVAL;
8590			break;
8591		}
8592		if (fd == IORING_REGISTER_FILES_SKIP)
8593			continue;
8594
8595		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
8596		file_slot = io_fixed_file_slot(&ctx->file_table, i);
8597
8598		if (file_slot->file_ptr) {
8599			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
8600			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
8601			if (err)
8602				break;
8603			file_slot->file_ptr = 0;
8604			needs_switch = true;
8605		}
8606		if (fd != -1) {
8607			file = fget(fd);
8608			if (!file) {
8609				err = -EBADF;
8610				break;
8611			}
8612			/*
8613			 * Don't allow io_uring instances to be registered. If
8614			 * UNIX isn't enabled, then this causes a reference
8615			 * cycle and this instance can never get freed. If UNIX
8616			 * is enabled we'll handle it just fine, but there's
8617			 * still no point in allowing a ring fd as it doesn't
8618			 * support regular read/write anyway.
8619			 */
8620			if (file->f_op == &io_uring_fops) {
8621				fput(file);
8622				err = -EBADF;
8623				break;
8624			}
8625			*io_get_tag_slot(data, i) = tag;
8626			io_fixed_file_set(file_slot, file);
8627		}
8628	}
8629
8630	if (needs_switch)
8631		io_rsrc_node_switch(ctx, data);
8632	return done ? done : err;
8633}
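
/*
 * Userspace counterpart of the update path above (IORING_REGISTER_FILES_UPDATE),
 * via liburing (assumed available; newfd is a hypothetical descriptor;
 * illustrative only, not kernel code). A -1 entry clears a slot, while
 * IORING_REGISTER_FILES_SKIP leaves one untouched:
 *
 *	int fds[2] = { newfd, -1 };
 *
 *	// update slots 2 and 3, e.g. of the 4-slot table from the previous sketch
 *	io_uring_register_files_update(&ring, 2, fds, 2);
 */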
8634
8635static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
8636					struct task_struct *task)
8637{
8638	struct io_wq_hash *hash;
8639	struct io_wq_data data;
8640	unsigned int concurrency;
8641
8642	mutex_lock(&ctx->uring_lock);
8643	hash = ctx->hash_map;
8644	if (!hash) {
8645		hash = kzalloc(sizeof(*hash), GFP_KERNEL);
8646		if (!hash) {
8647			mutex_unlock(&ctx->uring_lock);
8648			return ERR_PTR(-ENOMEM);
8649		}
8650		refcount_set(&hash->refs, 1);
8651		init_waitqueue_head(&hash->wait);
8652		ctx->hash_map = hash;
8653	}
8654	mutex_unlock(&ctx->uring_lock);
8655
8656	data.hash = hash;
8657	data.task = task;
8658	data.free_work = io_wq_free_work;
8659	data.do_work = io_wq_submit_work;
8660
8661	/* Do QD, or 4 * CPUS, whichever is smaller */
8662	concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
8663
8664	return io_wq_create(concurrency, &data);
8665}
8666
8667static int io_uring_alloc_task_context(struct task_struct *task,
8668				       struct io_ring_ctx *ctx)
8669{
8670	struct io_uring_task *tctx;
8671	int ret;
8672
8673	tctx = kzalloc(sizeof(*tctx), GFP_KERNEL);
8674	if (unlikely(!tctx))
8675		return -ENOMEM;
8676
8677	ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
8678	if (unlikely(ret)) {
8679		kfree(tctx);
8680		return ret;
8681	}
8682
8683	tctx->io_wq = io_init_wq_offload(ctx, task);
8684	if (IS_ERR(tctx->io_wq)) {
8685		ret = PTR_ERR(tctx->io_wq);
8686		percpu_counter_destroy(&tctx->inflight);
8687		kfree(tctx);
8688		return ret;
8689	}
8690
8691	xa_init(&tctx->xa);
8692	init_waitqueue_head(&tctx->wait);
8693	atomic_set(&tctx->in_idle, 0);
8694	atomic_set(&tctx->inflight_tracked, 0);
8695	task->io_uring = tctx;
8696	spin_lock_init(&tctx->task_lock);
8697	INIT_WQ_LIST(&tctx->task_list);
8698	init_task_work(&tctx->task_work, tctx_task_work);
8699	return 0;
8700}
8701
8702void __io_uring_free(struct task_struct *tsk)
8703{
8704	struct io_uring_task *tctx = tsk->io_uring;
8705
8706	WARN_ON_ONCE(!xa_empty(&tctx->xa));
8707	WARN_ON_ONCE(tctx->io_wq);
8708	WARN_ON_ONCE(tctx->cached_refs);
8709
8710	percpu_counter_destroy(&tctx->inflight);
8711	kfree(tctx);
8712	tsk->io_uring = NULL;
8713}
8714
8715static int io_sq_offload_create(struct io_ring_ctx *ctx,
8716				struct io_uring_params *p)
8717{
8718	int ret;
8719
8720	/* Retain compatibility by failing on an invalid attach attempt */
8721	if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) ==
8722				IORING_SETUP_ATTACH_WQ) {
8723		struct fd f;
8724
8725		f = fdget(p->wq_fd);
8726		if (!f.file)
8727			return -ENXIO;
8728		if (f.file->f_op != &io_uring_fops) {
8729			fdput(f);
8730			return -EINVAL;
8731		}
8732		fdput(f);
8733	}
8734	if (ctx->flags & IORING_SETUP_SQPOLL) {
8735		struct task_struct *tsk;
8736		struct io_sq_data *sqd;
8737		bool attached;
8738
8739		sqd = io_get_sq_data(p, &attached);
8740		if (IS_ERR(sqd)) {
8741			ret = PTR_ERR(sqd);
8742			goto err;
8743		}
8744
8745		ctx->sq_creds = get_current_cred();
8746		ctx->sq_data = sqd;
8747		ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle);
8748		if (!ctx->sq_thread_idle)
8749			ctx->sq_thread_idle = HZ;
8750
8751		io_sq_thread_park(sqd);
8752		list_add(&ctx->sqd_list, &sqd->ctx_list);
8753		io_sqd_update_thread_idle(sqd);
8754		/* don't attach to a dying SQPOLL thread, would be racy */
8755		ret = (attached && !sqd->thread) ? -ENXIO : 0;
8756		io_sq_thread_unpark(sqd);
8757
8758		if (ret < 0)
8759			goto err;
8760		if (attached)
8761			return 0;
8762
8763		if (p->flags & IORING_SETUP_SQ_AFF) {
8764			int cpu = p->sq_thread_cpu;
8765
8766			ret = -EINVAL;
8767			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
8768				goto err_sqpoll;
8769			sqd->sq_cpu = cpu;
8770		} else {
8771			sqd->sq_cpu = -1;
8772		}
8773
8774		sqd->task_pid = current->pid;
8775		sqd->task_tgid = current->tgid;
8776		tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
8777		if (IS_ERR(tsk)) {
8778			ret = PTR_ERR(tsk);
8779			goto err_sqpoll;
8780		}
8781
8782		sqd->thread = tsk;
8783		ret = io_uring_alloc_task_context(tsk, ctx);
8784		wake_up_new_task(tsk);
8785		if (ret)
8786			goto err;
8787	} else if (p->flags & IORING_SETUP_SQ_AFF) {
8788		/* Can't have SQ_AFF without SQPOLL */
8789		ret = -EINVAL;
8790		goto err;
8791	}
8792
8793	return 0;
8794err_sqpoll:
8795	complete(&ctx->sq_data->exited);
8796err:
8797	io_sq_thread_finish(ctx);
8798	return ret;
8799}
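
/*
 * The SQPOLL setup handled above maps to these userspace parameters (liburing
 * assumed; illustrative only, not kernel code):
 *
 *	struct io_uring_params p = { 0 };
 *	struct io_uring ring;
 *
 *	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
 *	p.sq_thread_cpu = 3;		// used only with IORING_SETUP_SQ_AFF
 *	p.sq_thread_idle = 2000;	// ms before the SQ thread goes idle
 *	io_uring_queue_init_params(256, &ring, &p);
 *
 * Adding IORING_SETUP_ATTACH_WQ and setting p.wq_fd to an existing SQPOLL
 * ring's fd takes the io_attach_sq_data() path above instead of creating a
 * new SQ thread.
 */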
8800
8801static inline void __io_unaccount_mem(struct user_struct *user,
8802				      unsigned long nr_pages)
8803{
8804	atomic_long_sub(nr_pages, &user->locked_vm);
8805}
8806
8807static inline int __io_account_mem(struct user_struct *user,
8808				   unsigned long nr_pages)
8809{
8810	unsigned long page_limit, cur_pages, new_pages;
8811
8812	/* Don't allow more pages than we can safely lock */
8813	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
8814
8815	do {
8816		cur_pages = atomic_long_read(&user->locked_vm);
8817		new_pages = cur_pages + nr_pages;
8818		if (new_pages > page_limit)
8819			return -ENOMEM;
8820	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
8821					new_pages) != cur_pages);
8822
8823	return 0;
8824}
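
/*
 * The limit checked above is the caller's RLIMIT_MEMLOCK, applied when the
 * context carries a ->user (unprivileged setups). A short userspace sketch
 * of inspecting it (standard API; illustrative only, not kernel code):
 *
 *	#include <sys/resource.h>
 *	#include <unistd.h>
 *
 *	struct rlimit rl;
 *
 *	getrlimit(RLIMIT_MEMLOCK, &rl);
 *	// rl.rlim_cur / sysconf(_SC_PAGESIZE) corresponds to page_limit above;
 *	// registrations that would exceed it fail here with -ENOMEM.
 */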
8825
8826static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8827{
8828	if (ctx->user)
8829		__io_unaccount_mem(ctx->user, nr_pages);
8830
8831	if (ctx->mm_account)
8832		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
8833}
8834
8835static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
8836{
8837	int ret;
8838
8839	if (ctx->user) {
8840		ret = __io_account_mem(ctx->user, nr_pages);
8841		if (ret)
8842			return ret;
8843	}
8844
8845	if (ctx->mm_account)
8846		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
8847
8848	return 0;
8849}
8850
8851static void io_mem_free(void *ptr)
8852{
8853	struct page *page;
8854
8855	if (!ptr)
8856		return;
8857
8858	page = virt_to_head_page(ptr);
8859	if (put_page_testzero(page))
8860		free_compound_page(page);
8861}
8862
8863static void *io_mem_alloc(size_t size)
8864{
8865	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
8866
8867	return (void *) __get_free_pages(gfp, get_order(size));
8868}
8869
8870static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
8871				size_t *sq_offset)
8872{
8873	struct io_rings *rings;
8874	size_t off, sq_array_size;
8875
8876	off = struct_size(rings, cqes, cq_entries);
8877	if (off == SIZE_MAX)
8878		return SIZE_MAX;
8879
8880#ifdef CONFIG_SMP
8881	off = ALIGN(off, SMP_CACHE_BYTES);
8882	if (off == 0)
8883		return SIZE_MAX;
8884#endif
8885
8886	if (sq_offset)
8887		*sq_offset = off;
8888
8889	sq_array_size = array_size(sizeof(u32), sq_entries);
8890	if (sq_array_size == SIZE_MAX)
8891		return SIZE_MAX;
8892
8893	if (check_add_overflow(off, sq_array_size, &off))
8894		return SIZE_MAX;
8895
8896	return off;
8897}
8898
8899static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
8900{
8901	struct io_mapped_ubuf *imu = *slot;
8902	unsigned int i;
8903
8904	if (imu != ctx->dummy_ubuf) {
8905		for (i = 0; i < imu->nr_bvecs; i++)
8906			unpin_user_page(imu->bvec[i].bv_page);
8907		if (imu->acct_pages)
8908			io_unaccount_mem(ctx, imu->acct_pages);
8909		kvfree(imu);
8910	}
8911	*slot = NULL;
8912}
8913
8914static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
8915{
8916	io_buffer_unmap(ctx, &prsrc->buf);
8917	prsrc->buf = NULL;
8918}
8919
8920static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8921{
8922	unsigned int i;
8923
8924	for (i = 0; i < ctx->nr_user_bufs; i++)
8925		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
8926	kfree(ctx->user_bufs);
8927	io_rsrc_data_free(ctx->buf_data);
8928	ctx->user_bufs = NULL;
8929	ctx->buf_data = NULL;
8930	ctx->nr_user_bufs = 0;
8931}
8932
8933static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
8934{
8935	unsigned nr = ctx->nr_user_bufs;
8936	int ret;
8937
8938	if (!ctx->buf_data)
8939		return -ENXIO;
8940
8941	/*
8942	 * Quiesce may unlock ->uring_lock; while it's not held,
8943	 * prevent new requests from using the table.
8944	 */
8945	ctx->nr_user_bufs = 0;
8946	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
8947	ctx->nr_user_bufs = nr;
8948	if (!ret)
8949		__io_sqe_buffers_unregister(ctx);
8950	return ret;
8951}
8952
8953static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
8954		       void __user *arg, unsigned index)
8955{
8956	struct iovec __user *src;
8957
8958#ifdef CONFIG_COMPAT
8959	if (ctx->compat) {
8960		struct compat_iovec __user *ciovs;
8961		struct compat_iovec ciov;
8962
8963		ciovs = (struct compat_iovec __user *) arg;
8964		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
8965			return -EFAULT;
8966
8967		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
8968		dst->iov_len = ciov.iov_len;
8969		return 0;
8970	}
8971#endif
8972	src = (struct iovec __user *) arg;
8973	if (copy_from_user(dst, &src[index], sizeof(*dst)))
8974		return -EFAULT;
8975	return 0;
8976}
8977
8978/*
8979 * Not super efficient, but this only runs at registration time. And we do cache
8980 * the last compound head, so generally we'll only do a full search if we don't
8981 * match that one.
8982 *
8983 * We check if the given compound head page has already been accounted, to
8984 * avoid double accounting it. This allows us to account the full size of the
8985 * page, not just the constituent pages of a huge page.
8986 */
8987static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
8988				  int nr_pages, struct page *hpage)
8989{
8990	int i, j;
8991
8992	/* check current page array */
8993	for (i = 0; i < nr_pages; i++) {
8994		if (!PageCompound(pages[i]))
8995			continue;
8996		if (compound_head(pages[i]) == hpage)
8997			return true;
8998	}
8999
9000	/* check previously registered pages */
9001	for (i = 0; i < ctx->nr_user_bufs; i++) {
9002		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
9003
9004		for (j = 0; j < imu->nr_bvecs; j++) {
9005			if (!PageCompound(imu->bvec[j].bv_page))
9006				continue;
9007			if (compound_head(imu->bvec[j].bv_page) == hpage)
9008				return true;
9009		}
9010	}
9011
9012	return false;
9013}
9014
9015static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
9016				 int nr_pages, struct io_mapped_ubuf *imu,
9017				 struct page **last_hpage)
9018{
9019	int i, ret;
9020
9021	imu->acct_pages = 0;
9022	for (i = 0; i < nr_pages; i++) {
9023		if (!PageCompound(pages[i])) {
9024			imu->acct_pages++;
9025		} else {
9026			struct page *hpage;
9027
9028			hpage = compound_head(pages[i]);
9029			if (hpage == *last_hpage)
9030				continue;
9031			*last_hpage = hpage;
9032			if (headpage_already_acct(ctx, pages, i, hpage))
9033				continue;
9034			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
9035		}
9036	}
9037
9038	if (!imu->acct_pages)
9039		return 0;
9040
9041	ret = io_account_mem(ctx, imu->acct_pages);
9042	if (ret)
9043		imu->acct_pages = 0;
9044	return ret;
9045}
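
/*
 * Worked example of the accounting above (assuming 4K base pages and a 2M
 * huge page; illustrative numbers only): registering a 64K buffer that sits
 * inside one 2M compound page accounts page_size(hpage) >> PAGE_SHIFT == 512
 * pages, not just the 16 base pages actually pinned. A second buffer inside
 * the same huge page accounts nothing extra, because the *last_hpage cache
 * or headpage_already_acct() spots the shared head page.
 */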
9046
9047static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
9048				  struct io_mapped_ubuf **pimu,
9049				  struct page **last_hpage)
9050{
9051	struct io_mapped_ubuf *imu = NULL;
9052	struct vm_area_struct **vmas = NULL;
9053	struct page **pages = NULL;
9054	unsigned long off, start, end, ubuf;
9055	size_t size;
9056	int ret, pret, nr_pages, i;
9057
9058	if (!iov->iov_base) {
9059		*pimu = ctx->dummy_ubuf;
9060		return 0;
9061	}
9062
9063	ubuf = (unsigned long) iov->iov_base;
9064	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
9065	start = ubuf >> PAGE_SHIFT;
9066	nr_pages = end - start;
9067
9068	*pimu = NULL;
9069	ret = -ENOMEM;
9070
9071	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
9072	if (!pages)
9073		goto done;
9074
9075	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
9076			      GFP_KERNEL);
9077	if (!vmas)
9078		goto done;
9079
9080	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
9081	if (!imu)
9082		goto done;
9083
9084	ret = 0;
9085	mmap_read_lock(current->mm);
9086	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
9087			      pages, vmas);
9088	if (pret == nr_pages) {
9089		struct file *file = vmas[0]->vm_file;
9090
9091		/* don't support file-backed memory */
9092		for (i = 0; i < nr_pages; i++) {
9093			if (vmas[i]->vm_file != file) {
9094				ret = -EINVAL;
9095				break;
9096			}
9097			if (!file)
9098				continue;
9099			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
9100				ret = -EOPNOTSUPP;
9101				break;
9102			}
9103		}
9104	} else {
9105		ret = pret < 0 ? pret : -EFAULT;
9106	}
9107	mmap_read_unlock(current->mm);
9108	if (ret) {
9109		/*
9110		 * if we did a partial map, or found file-backed vmas,
9111		 * release any pages we did get
9112		 */
9113		if (pret > 0)
9114			unpin_user_pages(pages, pret);
9115		goto done;
9116	}
9117
9118	ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
9119	if (ret) {
9120		unpin_user_pages(pages, pret);
9121		goto done;
9122	}
9123
9124	off = ubuf & ~PAGE_MASK;
9125	size = iov->iov_len;
9126	for (i = 0; i < nr_pages; i++) {
9127		size_t vec_len;
9128
9129		vec_len = min_t(size_t, size, PAGE_SIZE - off);
9130		imu->bvec[i].bv_page = pages[i];
9131		imu->bvec[i].bv_len = vec_len;
9132		imu->bvec[i].bv_offset = off;
9133		off = 0;
9134		size -= vec_len;
9135	}
9136	/* store original address for later verification */
9137	imu->ubuf = ubuf;
9138	imu->ubuf_end = ubuf + iov->iov_len;
9139	imu->nr_bvecs = nr_pages;
9140	*pimu = imu;
9141	ret = 0;
9142done:
9143	if (ret)
9144		kvfree(imu);
9145	kvfree(pages);
9146	kvfree(vmas);
9147	return ret;
9148}
9149
9150static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
9151{
9152	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
9153	return ctx->user_bufs ? 0 : -ENOMEM;
9154}
9155
9156static int io_buffer_validate(struct iovec *iov)
9157{
9158	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
9159
9160	/*
9161	 * Don't impose further limits on the size and buffer
9162	 * constraints here; we'll return -EINVAL later, when the IO
9163	 * is submitted, if they are wrong.
9164	 */
9165	if (!iov->iov_base)
9166		return iov->iov_len ? -EFAULT : 0;
9167	if (!iov->iov_len)
9168		return -EFAULT;
9169
9170	/* arbitrary limit, but we need something */
9171	if (iov->iov_len > SZ_1G)
9172		return -EFAULT;
9173
9174	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
9175		return -EOVERFLOW;
9176
9177	return 0;
9178}
9179
9180static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
9181				   unsigned int nr_args, u64 __user *tags)
9182{
9183	struct page *last_hpage = NULL;
9184	struct io_rsrc_data *data;
9185	int i, ret;
9186	struct iovec iov;
9187
9188	if (ctx->user_bufs)
9189		return -EBUSY;
9190	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
9191		return -EINVAL;
9192	ret = io_rsrc_node_switch_start(ctx);
9193	if (ret)
9194		return ret;
9195	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
9196	if (ret)
9197		return ret;
9198	ret = io_buffers_map_alloc(ctx, nr_args);
9199	if (ret) {
9200		io_rsrc_data_free(data);
9201		return ret;
9202	}
9203
9204	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
9205		ret = io_copy_iov(ctx, &iov, arg, i);
9206		if (ret)
9207			break;
9208		ret = io_buffer_validate(&iov);
9209		if (ret)
9210			break;
9211		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
9212			ret = -EINVAL;
9213			break;
9214		}
9215
9216		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
9217					     &last_hpage);
9218		if (ret)
9219			break;
9220	}
9221
9222	WARN_ON_ONCE(ctx->buf_data);
9223
9224	ctx->buf_data = data;
9225	if (ret)
9226		__io_sqe_buffers_unregister(ctx);
9227	else
9228		io_rsrc_node_switch(ctx, NULL);
9229	return ret;
9230}
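
/*
 * Userspace counterpart of the buffer registration above (liburing assumed;
 * buf and fd are hypothetical; illustrative only, not kernel code):
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = 64 * 1024 };
 *
 *	io_uring_register_buffers(&ring, &iov, 1);
 *	// later, read into the registered buffer by index 0:
 *	io_uring_prep_read_fixed(io_uring_get_sqe(&ring), fd, buf, 4096, 0, 0);
 *
 * Per io_buffer_validate() above, each iovec must have a non-NULL base
 * (unless both base and length are zero, which reserves a sparse slot) and
 * be at most 1G.
 */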
9231
9232static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
9233				   struct io_uring_rsrc_update2 *up,
9234				   unsigned int nr_args)
9235{
9236	u64 __user *tags = u64_to_user_ptr(up->tags);
9237	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
9238	struct page *last_hpage = NULL;
9239	bool needs_switch = false;
9240	__u32 done;
9241	int i, err;
9242
9243	if (!ctx->buf_data)
9244		return -ENXIO;
9245	if (up->offset + nr_args > ctx->nr_user_bufs)
9246		return -EINVAL;
9247
9248	for (done = 0; done < nr_args; done++) {
9249		struct io_mapped_ubuf *imu;
9250		int offset = up->offset + done;
9251		u64 tag = 0;
9252
9253		err = io_copy_iov(ctx, &iov, iovs, done);
9254		if (err)
9255			break;
9256		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
9257			err = -EFAULT;
9258			break;
9259		}
9260		err = io_buffer_validate(&iov);
9261		if (err)
9262			break;
9263		if (!iov.iov_base && tag) {
9264			err = -EINVAL;
9265			break;
9266		}
9267		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
9268		if (err)
9269			break;
9270
9271		i = array_index_nospec(offset, ctx->nr_user_bufs);
9272		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
9273			err = io_queue_rsrc_removal(ctx->buf_data, i,
9274						    ctx->rsrc_node, ctx->user_bufs[i]);
9275			if (unlikely(err)) {
9276				io_buffer_unmap(ctx, &imu);
9277				break;
9278			}
9279			ctx->user_bufs[i] = NULL;
9280			needs_switch = true;
9281		}
9282
9283		ctx->user_bufs[i] = imu;
9284		*io_get_tag_slot(ctx->buf_data, offset) = tag;
9285	}
9286
9287	if (needs_switch)
9288		io_rsrc_node_switch(ctx, ctx->buf_data);
9289	return done ? done : err;
9290}
9291
9292static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
9293{
9294	__s32 __user *fds = arg;
9295	int fd;
9296
9297	if (ctx->cq_ev_fd)
9298		return -EBUSY;
9299
9300	if (copy_from_user(&fd, fds, sizeof(*fds)))
9301		return -EFAULT;
9302
9303	ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
9304	if (IS_ERR(ctx->cq_ev_fd)) {
9305		int ret = PTR_ERR(ctx->cq_ev_fd);
9306
9307		ctx->cq_ev_fd = NULL;
9308		return ret;
9309	}
9310
9311	return 0;
9312}
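
/*
 * Userspace counterpart (IORING_REGISTER_EVENTFD), via liburing (assumed
 * available; illustrative only, not kernel code):
 *
 *	#include <sys/eventfd.h>
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *	uint64_t n;
 *
 *	io_uring_register_eventfd(&ring, efd);
 *	read(efd, &n, sizeof(n));	// blocks until completions are posted
 */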
9313
9314static int io_eventfd_unregister(struct io_ring_ctx *ctx)
9315{
9316	if (ctx->cq_ev_fd) {
9317		eventfd_ctx_put(ctx->cq_ev_fd);
9318		ctx->cq_ev_fd = NULL;
9319		return 0;
9320	}
9321
9322	return -ENXIO;
9323}
9324
9325static void io_destroy_buffers(struct io_ring_ctx *ctx)
9326{
9327	struct io_buffer *buf;
9328	unsigned long index;
9329
9330	xa_for_each(&ctx->io_buffers, index, buf)
9331		__io_remove_buffers(ctx, buf, index, -1U);
9332}
9333
9334static void io_req_cache_free(struct list_head *list)
9335{
9336	struct io_kiocb *req, *nxt;
9337
9338	list_for_each_entry_safe(req, nxt, list, inflight_entry) {
9339		list_del(&req->inflight_entry);
9340		kmem_cache_free(req_cachep, req);
9341	}
9342}
9343
9344static void io_req_caches_free(struct io_ring_ctx *ctx)
9345{
9346	struct io_submit_state *state = &ctx->submit_state;
9347
9348	mutex_lock(&ctx->uring_lock);
9349
9350	if (state->free_reqs) {
9351		kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs);
9352		state->free_reqs = 0;
9353	}
9354
9355	io_flush_cached_locked_reqs(ctx, state);
9356	io_req_cache_free(&state->free_list);
9357	mutex_unlock(&ctx->uring_lock);
9358}
9359
9360static void io_wait_rsrc_data(struct io_rsrc_data *data)
9361{
9362	if (data && !atomic_dec_and_test(&data->refs))
9363		wait_for_completion(&data->done);
9364}
9365
9366static void io_ring_ctx_free(struct io_ring_ctx *ctx)
9367{
9368	io_sq_thread_finish(ctx);
9369
9370	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
9371	io_wait_rsrc_data(ctx->buf_data);
9372	io_wait_rsrc_data(ctx->file_data);
9373
9374	mutex_lock(&ctx->uring_lock);
9375	if (ctx->buf_data)
9376		__io_sqe_buffers_unregister(ctx);
9377	if (ctx->file_data)
9378		__io_sqe_files_unregister(ctx);
9379	if (ctx->rings)
9380		__io_cqring_overflow_flush(ctx, true);
9381	mutex_unlock(&ctx->uring_lock);
9382	io_eventfd_unregister(ctx);
9383	io_destroy_buffers(ctx);
9384	if (ctx->sq_creds)
9385		put_cred(ctx->sq_creds);
9386
9387	/* there are no registered resources left, nobody uses them anymore */
9388	if (ctx->rsrc_node)
9389		io_rsrc_node_destroy(ctx->rsrc_node);
9390	if (ctx->rsrc_backup_node)
9391		io_rsrc_node_destroy(ctx->rsrc_backup_node);
9392	flush_delayed_work(&ctx->rsrc_put_work);
9393
9394	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
9395	WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist));
9396
9397#if defined(CONFIG_UNIX)
9398	if (ctx->ring_sock) {
9399		ctx->ring_sock->file = NULL; /* so that iput() is called */
9400		sock_release(ctx->ring_sock);
9401	}
9402#endif
9403	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
9404
9405	if (ctx->mm_account) {
9406		mmdrop(ctx->mm_account);
9407		ctx->mm_account = NULL;
9408	}
9409
9410	io_mem_free(ctx->rings);
9411	io_mem_free(ctx->sq_sqes);
9412
9413	percpu_ref_exit(&ctx->refs);
9414	free_uid(ctx->user);
9415	io_req_caches_free(ctx);
9416	if (ctx->hash_map)
9417		io_wq_put_hash(ctx->hash_map);
9418	kfree(ctx->cancel_hash);
9419	kfree(ctx->dummy_ubuf);
9420	kfree(ctx);
9421}
9422
9423static __poll_t io_uring_poll(struct file *file, poll_table *wait)
9424{
9425	struct io_ring_ctx *ctx = file->private_data;
9426	__poll_t mask = 0;
9427
9428	poll_wait(file, &ctx->poll_wait, wait);
9429	/*
9430	 * synchronizes with the barrier from the wq_has_sleeper() call in
9431	 * io_commit_cqring()
9432	 */
9433	smp_rmb();
9434	if (!io_sqring_full(ctx))
9435		mask |= EPOLLOUT | EPOLLWRNORM;
9436
9437	/*
9438	 * Don't flush cqring overflow list here, just do a simple check.
9439	 * Otherwise there could possibly be an ABBA deadlock:
9440	 *      CPU0                    CPU1
9441	 *      ----                    ----
9442	 * lock(&ctx->uring_lock);
9443	 *                              lock(&ep->mtx);
9444	 *                              lock(&ctx->uring_lock);
9445	 * lock(&ep->mtx);
9446	 *
9447	 * Users may get EPOLLIN while seeing nothing in the cqring; this
9448	 * pushes them to do the flush.
9449	 */
9450	if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow))
9451		mask |= EPOLLIN | EPOLLRDNORM;
9452
9453	return mask;
9454}
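
/*
 * The poll semantics above are what make the ring fd usable with epoll. A
 * minimal userspace sketch (liburing assumed; ring.ring_fd is the fd this
 * ->poll handler serves; illustrative only, not kernel code):
 *
 *	struct epoll_event ev = { .events = EPOLLIN, .data.fd = ring.ring_fd };
 *	int epfd = epoll_create1(0);
 *
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, ring.ring_fd, &ev);
 *	epoll_wait(epfd, &ev, 1, -1);
 *	// on EPOLLIN, reap CQEs; if the CQ looks empty despite EPOLLIN, the
 *	// entries sit on the overflow list, and reaping via io_uring_wait_cqe()
 *	// (which enters the kernel) flushes them, as the comment above notes.
 */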
9455
9456static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
9457{
9458	const struct cred *creds;
9459
9460	creds = xa_erase(&ctx->personalities, id);
9461	if (creds) {
9462		put_cred(creds);
9463		return 0;
9464	}
9465
9466	return -EINVAL;
9467}
9468
9469struct io_tctx_exit {
9470	struct callback_head		task_work;
9471	struct completion		completion;
9472	struct io_ring_ctx		*ctx;
9473};
9474
9475static void io_tctx_exit_cb(struct callback_head *cb)
9476{
9477	struct io_uring_task *tctx = current->io_uring;
9478	struct io_tctx_exit *work;
9479
9480	work = container_of(cb, struct io_tctx_exit, task_work);
9481	/*
9482	 * When @in_idle, we're in cancellation and it's racy to remove the
9483	 * node. It'll be removed by the end of cancellation, just ignore it.
9484	 * tctx can be NULL if the queueing of this task_work raced with
9485	 * work cancelation off the exec path.
9486	 */
9487	if (tctx && !atomic_read(&tctx->in_idle))
9488		io_uring_del_tctx_node((unsigned long)work->ctx);
9489	complete(&work->completion);
9490}
9491
9492static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
9493{
9494	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9495
9496	return req->ctx == data;
9497}
9498
9499static void io_ring_exit_work(struct work_struct *work)
9500{
9501	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
9502	unsigned long timeout = jiffies + HZ * 60 * 5;
9503	unsigned long interval = HZ / 20;
9504	struct io_tctx_exit exit;
9505	struct io_tctx_node *node;
9506	int ret;
9507
9508	/*
9509	 * If we're doing polled IO and end up having requests being
9510	 * submitted async (out-of-line), then completions can come in while
9511	 * we're waiting for refs to drop. We need to reap these manually,
9512	 * as nobody else will be looking for them.
9513	 */
9514	do {
9515		io_uring_try_cancel_requests(ctx, NULL, true);
9516		if (ctx->sq_data) {
9517			struct io_sq_data *sqd = ctx->sq_data;
9518			struct task_struct *tsk;
9519
9520			io_sq_thread_park(sqd);
9521			tsk = sqd->thread;
9522			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
9523				io_wq_cancel_cb(tsk->io_uring->io_wq,
9524						io_cancel_ctx_cb, ctx, true);
9525			io_sq_thread_unpark(sqd);
9526		}
9527
9528		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
9529			/* there is little hope left, don't run it too often */
9530			interval = HZ * 60;
9531		}
9532		/*
9533		 * This is really an uninterruptible wait, as it has to run to
9534		 * completion. But it's also run from a kworker, which doesn't
9535		 * take signals, so it's fine to make it interruptible. This
9536		 * avoids scenarios where we could knowingly wait much longer
9537		 * on completions, for example if someone does a SIGSTOP on
9538		 * a task that needs to finish task_work to make this loop
9539		 * complete. That's a synthetic situation that should not
9540		 * cause a stuck task backtrace, and hence a potential panic,
9541		 * if panicking on stuck tasks is enabled.
9542		 */
9543	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
9544
9545	init_completion(&exit.completion);
9546	init_task_work(&exit.task_work, io_tctx_exit_cb);
9547	exit.ctx = ctx;
9548	/*
9549	 * Some may still be using the context even when all refs and requests
9550	 * have been put, and they are free to do so while still holding
9551	 * uring_lock or completion_lock, see io_req_task_submit(). Apart from
9552	 * other work, this lock/unlock section also waits for them to finish.
9553	 */
9554	mutex_lock(&ctx->uring_lock);
9555	while (!list_empty(&ctx->tctx_list)) {
9556		WARN_ON_ONCE(time_after(jiffies, timeout));
9557
9558		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
9559					ctx_node);
9560		/* don't spin on a single task if cancellation failed */
9561		list_rotate_left(&ctx->tctx_list);
9562		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
9563		if (WARN_ON_ONCE(ret))
9564			continue;
9565		wake_up_process(node->task);
9566
9567		mutex_unlock(&ctx->uring_lock);
9568		/*
9569		 * See comment above for
9570		 * wait_for_completion_interruptible_timeout() on why this
9571		 * wait is marked as interruptible.
9572		 */
9573		wait_for_completion_interruptible(&exit.completion);
9574		mutex_lock(&ctx->uring_lock);
9575	}
9576	mutex_unlock(&ctx->uring_lock);
9577	spin_lock(&ctx->completion_lock);
9578	spin_unlock(&ctx->completion_lock);
9579
9580	io_ring_ctx_free(ctx);
9581}
9582
9583/* Returns true if we found and killed one or more timeouts */
9584static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk,
9585			     bool cancel_all)
9586{
9587	struct io_kiocb *req, *tmp;
9588	int canceled = 0;
9589
9590	spin_lock(&ctx->completion_lock);
9591	spin_lock_irq(&ctx->timeout_lock);
9592	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
9593		if (io_match_task(req, tsk, cancel_all)) {
9594			io_kill_timeout(req, -ECANCELED);
9595			canceled++;
9596		}
9597	}
9598	spin_unlock_irq(&ctx->timeout_lock);
9599	if (canceled != 0)
9600		io_commit_cqring(ctx);
9601	spin_unlock(&ctx->completion_lock);
9602	if (canceled != 0)
9603		io_cqring_ev_posted(ctx);
9604	return canceled != 0;
9605}
9606
9607static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
9608{
9609	unsigned long index;
9610	struct creds *creds;
9611
9612	mutex_lock(&ctx->uring_lock);
9613	percpu_ref_kill(&ctx->refs);
9614	if (ctx->rings)
9615		__io_cqring_overflow_flush(ctx, true);
9616	xa_for_each(&ctx->personalities, index, creds)
9617		io_unregister_personality(ctx, index);
9618	mutex_unlock(&ctx->uring_lock);
9619
9620	io_kill_timeouts(ctx, NULL, true);
9621	io_poll_remove_all(ctx, NULL, true);
9622
9623	/* if we failed setting up the ctx, we might not have any rings */
9624	io_iopoll_try_reap_events(ctx);
9625
9626	/* drop cached put refs after potentially doing completions */
9627	if (current->io_uring)
9628		io_uring_drop_tctx_refs(current);
9629
9630	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
9631	/*
9632	 * Use system_unbound_wq to avoid spawning tons of event kworkers
9633	 * if we're exiting a ton of rings at the same time. It just adds
9634	 * noise and overhead, and there's no discernible change in runtime
9635	 * over using system_wq.
9636	 */
9637	queue_work(system_unbound_wq, &ctx->exit_work);
9638}
9639
9640static int io_uring_release(struct inode *inode, struct file *file)
9641{
9642	struct io_ring_ctx *ctx = file->private_data;
9643
9644	file->private_data = NULL;
9645	io_ring_ctx_wait_and_kill(ctx);
9646	return 0;
9647}
9648
9649struct io_task_cancel {
9650	struct task_struct *task;
9651	bool all;
9652};
9653
9654static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
9655{
9656	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
9657	struct io_task_cancel *cancel = data;
9658
9659	return io_match_task_safe(req, cancel->task, cancel->all);
9660}
9661
9662static bool io_cancel_defer_files(struct io_ring_ctx *ctx,
9663				  struct task_struct *task, bool cancel_all)
9664{
9665	struct io_defer_entry *de;
9666	LIST_HEAD(list);
9667
9668	spin_lock(&ctx->completion_lock);
9669	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
9670		if (io_match_task_safe(de->req, task, cancel_all)) {
9671			list_cut_position(&list, &ctx->defer_list, &de->list);
9672			break;
9673		}
9674	}
9675	spin_unlock(&ctx->completion_lock);
9676	if (list_empty(&list))
9677		return false;
9678
9679	while (!list_empty(&list)) {
9680		de = list_first_entry(&list, struct io_defer_entry, list);
9681		list_del_init(&de->list);
9682		io_req_complete_failed(de->req, -ECANCELED);
9683		kfree(de);
9684	}
9685	return true;
9686}
9687
9688static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
9689{
9690	struct io_tctx_node *node;
9691	enum io_wq_cancel cret;
9692	bool ret = false;
9693
9694	mutex_lock(&ctx->uring_lock);
9695	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
9696		struct io_uring_task *tctx = node->task->io_uring;
9697
9698		/*
9699		 * io_wq will stay alive while we hold uring_lock, because it's
9700		 * killed after ctx nodes, which requires taking the lock.
9701		 */
9702		if (!tctx || !tctx->io_wq)
9703			continue;
9704		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
9705		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9706	}
9707	mutex_unlock(&ctx->uring_lock);
9708
9709	return ret;
9710}
9711
9712static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
9713					 struct task_struct *task,
9714					 bool cancel_all)
9715{
9716	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
9717	struct io_uring_task *tctx = task ? task->io_uring : NULL;
9718
9719	while (1) {
9720		enum io_wq_cancel cret;
9721		bool ret = false;
9722
9723		if (!task) {
9724			ret |= io_uring_try_cancel_iowq(ctx);
9725		} else if (tctx && tctx->io_wq) {
9726			/*
9727			 * This cancels requests for all rings, not only @ctx,
9728			 * but that's fine as the task is in exit/exec.
9729			 */
9730			cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
9731					       &cancel, true);
9732			ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
9733		}
9734
9735		/* SQPOLL thread does its own polling */
9736		if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
9737		    (ctx->sq_data && ctx->sq_data->thread == current)) {
9738			while (!list_empty_careful(&ctx->iopoll_list)) {
9739				io_iopoll_try_reap_events(ctx);
9740				ret = true;
9741				cond_resched();
9742			}
9743		}
9744
9745		ret |= io_cancel_defer_files(ctx, task, cancel_all);
9746		ret |= io_poll_remove_all(ctx, task, cancel_all);
9747		ret |= io_kill_timeouts(ctx, task, cancel_all);
9748		if (task)
9749			ret |= io_run_task_work();
9750		if (!ret)
9751			break;
9752		cond_resched();
9753	}
9754}
9755
9756static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9757{
9758	struct io_uring_task *tctx = current->io_uring;
9759	struct io_tctx_node *node;
9760	int ret;
9761
9762	if (unlikely(!tctx)) {
9763		ret = io_uring_alloc_task_context(current, ctx);
9764		if (unlikely(ret))
9765			return ret;
9766
9767		tctx = current->io_uring;
9768		if (ctx->iowq_limits_set) {
9769			unsigned int limits[2] = { ctx->iowq_limits[0],
9770						   ctx->iowq_limits[1], };
9771
9772			ret = io_wq_max_workers(tctx->io_wq, limits);
9773			if (ret)
9774				return ret;
9775		}
9776	}
9777	if (!xa_load(&tctx->xa, (unsigned long)ctx)) {
9778		node = kmalloc(sizeof(*node), GFP_KERNEL);
9779		if (!node)
9780			return -ENOMEM;
9781		node->ctx = ctx;
9782		node->task = current;
9783
9784		ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx,
9785					node, GFP_KERNEL));
9786		if (ret) {
9787			kfree(node);
9788			return ret;
9789		}
9790
9791		mutex_lock(&ctx->uring_lock);
9792		list_add(&node->ctx_node, &ctx->tctx_list);
9793		mutex_unlock(&ctx->uring_lock);
9794	}
9795	tctx->last = ctx;
9796	return 0;
9797}
9798
9799/*
9800 * Note that this task has used io_uring. We use it for cancelation purposes.
9801 */
9802static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx)
9803{
9804	struct io_uring_task *tctx = current->io_uring;
9805
9806	if (likely(tctx && tctx->last == ctx))
9807		return 0;
9808	return __io_uring_add_tctx_node(ctx);
9809}
9810
9811/*
9812 * Remove this io_uring_file -> task mapping.
9813 */
9814static void io_uring_del_tctx_node(unsigned long index)
9815{
9816	struct io_uring_task *tctx = current->io_uring;
9817	struct io_tctx_node *node;
9818
9819	if (!tctx)
9820		return;
9821	node = xa_erase(&tctx->xa, index);
9822	if (!node)
9823		return;
9824
9825	WARN_ON_ONCE(current != node->task);
9826	WARN_ON_ONCE(list_empty(&node->ctx_node));
9827
9828	mutex_lock(&node->ctx->uring_lock);
9829	list_del(&node->ctx_node);
9830	mutex_unlock(&node->ctx->uring_lock);
9831
9832	if (tctx->last == node->ctx)
9833		tctx->last = NULL;
9834	kfree(node);
9835}
9836
9837static void io_uring_clean_tctx(struct io_uring_task *tctx)
9838{
9839	struct io_wq *wq = tctx->io_wq;
9840	struct io_tctx_node *node;
9841	unsigned long index;
9842
9843	xa_for_each(&tctx->xa, index, node) {
9844		io_uring_del_tctx_node(index);
9845		cond_resched();
9846	}
9847	if (wq) {
9848		/*
9849		 * Must be after io_uring_del_tctx_node() (which removes nodes under
9850		 * uring_lock) to avoid a race with io_uring_try_cancel_iowq().
9851		 */
9852		io_wq_put_and_exit(wq);
9853		tctx->io_wq = NULL;
9854	}
9855}
9856
9857static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
9858{
9859	if (tracked)
9860		return atomic_read(&tctx->inflight_tracked);
9861	return percpu_counter_sum(&tctx->inflight);
9862}
9863
9864/*
9865 * Find any io_uring ctx that this task has registered or done IO on, and cancel
9866 * requests. @sqd must be non-NULL iff this is an SQPOLL thread cancellation.
9867 */
9868static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
9869{
9870	struct io_uring_task *tctx = current->io_uring;
9871	struct io_ring_ctx *ctx;
9872	s64 inflight;
9873	DEFINE_WAIT(wait);
9874
9875	WARN_ON_ONCE(sqd && sqd->thread != current);
9876
9877	if (!current->io_uring)
9878		return;
9879	if (tctx->io_wq)
9880		io_wq_exit_start(tctx->io_wq);
9881
9882	atomic_inc(&tctx->in_idle);
9883	do {
9884		io_uring_drop_tctx_refs(current);
9885		/* read completions before cancelations */
9886		inflight = tctx_inflight(tctx, !cancel_all);
9887		if (!inflight)
9888			break;
9889
9890		if (!sqd) {
9891			struct io_tctx_node *node;
9892			unsigned long index;
9893
9894			xa_for_each(&tctx->xa, index, node) {
9895				/* sqpoll task will cancel all its requests */
9896				if (node->ctx->sq_data)
9897					continue;
9898				io_uring_try_cancel_requests(node->ctx, current,
9899							     cancel_all);
9900			}
9901		} else {
9902			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
9903				io_uring_try_cancel_requests(ctx, current,
9904							     cancel_all);
9905		}
9906
9907		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
9908		io_run_task_work();
9909		io_uring_drop_tctx_refs(current);
9910
9911		/*
9912		 * If we've seen completions, retry without waiting. This
9913		 * avoids a race where a completion comes in before we did
9914		 * prepare_to_wait().
9915		 */
9916		if (inflight == tctx_inflight(tctx, !cancel_all))
9917			schedule();
9918		finish_wait(&tctx->wait, &wait);
9919	} while (1);
9920
9921	io_uring_clean_tctx(tctx);
9922	if (cancel_all) {
9923		/*
9924		 * We shouldn't run task_works after cancel, so just leave
9925		 * ->in_idle set for normal exit.
9926		 */
9927		atomic_dec(&tctx->in_idle);
9928		/* for exec all current's requests should be gone, kill tctx */
9929		__io_uring_free(current);
9930	}
9931}
9932
9933void __io_uring_cancel(bool cancel_all)
9934{
9935	io_uring_cancel_generic(cancel_all, NULL);
9936}
9937
9938static void *io_uring_validate_mmap_request(struct file *file,
9939					    loff_t pgoff, size_t sz)
9940{
9941	struct io_ring_ctx *ctx = file->private_data;
9942	loff_t offset = pgoff << PAGE_SHIFT;
9943	struct page *page;
9944	void *ptr;
9945
9946	switch (offset) {
9947	case IORING_OFF_SQ_RING:
9948	case IORING_OFF_CQ_RING:
9949		ptr = ctx->rings;
9950		break;
9951	case IORING_OFF_SQES:
9952		ptr = ctx->sq_sqes;
9953		break;
9954	default:
9955		return ERR_PTR(-EINVAL);
9956	}
9957
9958	page = virt_to_head_page(ptr);
9959	if (sz > page_size(page))
9960		return ERR_PTR(-EINVAL);
9961
9962	return ptr;
9963}
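/*
 * Editorial illustration, not part of this kernel source: a minimal
 * userspace sketch of how the offsets validated above are consumed by
 * mmap(). The helper name is invented; it assumes a "ring_fd" returned
 * by io_uring_setup(), the io_uring_params the kernel filled in, and
 * the UAPI definitions from <linux/io_uring.h>. Error handling and
 * unmapping are omitted.
 */
#include <stddef.h>
#include <sys/mman.h>
#include <linux/io_uring.h>

static void map_rings(int ring_fd, const struct io_uring_params *p)
{
	size_t sq_sz = p->sq_off.array + p->sq_entries * sizeof(__u32);
	size_t cq_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	void *sq_ring, *cq_ring, *sqes;

	/* With IORING_FEAT_SINGLE_MMAP, one mapping covers both rings */
	if ((p->features & IORING_FEAT_SINGLE_MMAP) && cq_sz > sq_sz)
		sq_sz = cq_sz;

	sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	if (p->features & IORING_FEAT_SINGLE_MMAP)
		cq_ring = sq_ring;
	else
		cq_ring = mmap(NULL, cq_sz, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_POPULATE, ring_fd,
			       IORING_OFF_CQ_RING);

	/* The SQE array is always its own mapping */
	sqes = mmap(NULL, p->sq_entries * sizeof(struct io_uring_sqe),
		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
		    ring_fd, IORING_OFF_SQES);

	(void)cq_ring;
	(void)sqes;
}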
9964
9965#ifdef CONFIG_MMU
9966
9967static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9968{
9969	size_t sz = vma->vm_end - vma->vm_start;
9970	unsigned long pfn;
9971	void *ptr;
9972
9973	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
9974	if (IS_ERR(ptr))
9975		return PTR_ERR(ptr);
9976
9977	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
9978	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
9979}
9980
9981#else /* !CONFIG_MMU */
9982
9983static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
9984{
9985	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
9986}
9987
9988static unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
9989{
9990	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
9991}
9992
9993static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
9994	unsigned long addr, unsigned long len,
9995	unsigned long pgoff, unsigned long flags)
9996{
9997	void *ptr;
9998
9999	ptr = io_uring_validate_mmap_request(file, pgoff, len);
10000	if (IS_ERR(ptr))
10001		return PTR_ERR(ptr);
10002
10003	return (unsigned long) ptr;
10004}
10005
10006#endif /* !CONFIG_MMU */
10007
10008static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
10009{
10010	DEFINE_WAIT(wait);
10011
10012	do {
10013		if (!io_sqring_full(ctx))
10014			break;
10015		prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE);
10016
10017		if (!io_sqring_full(ctx))
10018			break;
10019		schedule();
10020	} while (!signal_pending(current));
10021
10022	finish_wait(&ctx->sqo_sq_wait, &wait);
10023	return 0;
10024}
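/*
 * Editorial illustration, not part of this kernel source: a sketch of
 * how an application created with IORING_SETUP_SQPOLL typically calls
 * io_uring_enter() after queueing SQEs. The helper name is invented;
 * "sq_flags" is assumed to point at the SQ ring flags word (the SQ ring
 * mapping plus p->sq_off.flags), and memory-ordering details are
 * deliberately omitted.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static long sqpoll_submit(int ring_fd, const unsigned int *sq_flags,
			  unsigned int to_submit)
{
	/* Optional: only blocks in io_sqpoll_wait_sq() if the SQ ring is full */
	unsigned int flags = IORING_ENTER_SQ_WAIT;

	/* Only wake the SQ poll thread if it has actually gone to sleep */
	if (*(volatile const unsigned int *)sq_flags & IORING_SQ_NEED_WAKEUP)
		flags |= IORING_ENTER_SQ_WAKEUP;

	return syscall(__NR_io_uring_enter, ring_fd, to_submit, 0, flags,
		       NULL, 0);
}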
10025
10026static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
10027			  struct __kernel_timespec __user **ts,
10028			  const sigset_t __user **sig)
10029{
10030	struct io_uring_getevents_arg arg;
10031
10032	/*
10033	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
10034	 * is just a pointer to the sigset_t.
10035	 */
10036	if (!(flags & IORING_ENTER_EXT_ARG)) {
10037		*sig = (const sigset_t __user *) argp;
10038		*ts = NULL;
10039		return 0;
10040	}
10041
10042	/*
10043	 * EXT_ARG is set - ensure we agree on the size of the struct, and if
10044	 * so, copy in the timespec and sigset_t pointers.
10045	 */
10046	if (*argsz != sizeof(arg))
10047		return -EINVAL;
10048	if (copy_from_user(&arg, argp, sizeof(arg)))
10049		return -EFAULT;
10050	if (arg.pad)
10051		return -EINVAL;
10052	*sig = u64_to_user_ptr(arg.sigmask);
10053	*argsz = arg.sigmask_sz;
10054	*ts = u64_to_user_ptr(arg.ts);
10055	return 0;
10056}
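/*
 * Editorial illustration, not part of this kernel source: the caller's
 * side of the EXT_ARG convention parsed above. With IORING_ENTER_EXT_ARG
 * set, the last two io_uring_enter() arguments become a pointer to
 * struct io_uring_getevents_arg and its size, carrying both a signal
 * mask and a timeout. The helper name is invented and error handling is
 * omitted.
 */
#include <signal.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>
#include <linux/time_types.h>

static long wait_cqes_with_timeout(int ring_fd, unsigned int min_complete,
				   const sigset_t *mask, long long timeout_ns)
{
	struct __kernel_timespec ts = {
		.tv_sec  = timeout_ns / 1000000000LL,
		.tv_nsec = timeout_ns % 1000000000LL,
	};
	struct io_uring_getevents_arg arg = {
		.sigmask    = (uint64_t)(uintptr_t)mask,
		.sigmask_sz = _NSIG / 8,	/* kernel sigset size */
		.ts         = (uint64_t)(uintptr_t)&ts,
	};

	return syscall(__NR_io_uring_enter, ring_fd, 0, min_complete,
		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
		       &arg, sizeof(arg));
}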
10057
10058SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
10059		u32, min_complete, u32, flags, const void __user *, argp,
10060		size_t, argsz)
10061{
10062	struct io_ring_ctx *ctx;
10063	int submitted = 0;
10064	struct fd f;
10065	long ret;
10066
10067	io_run_task_work();
10068
10069	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
10070			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
10071		return -EINVAL;
10072
10073	f = fdget(fd);
10074	if (unlikely(!f.file))
10075		return -EBADF;
10076
10077	ret = -EOPNOTSUPP;
10078	if (unlikely(f.file->f_op != &io_uring_fops))
10079		goto out_fput;
10080
10081	ret = -ENXIO;
10082	ctx = f.file->private_data;
10083	if (unlikely(!percpu_ref_tryget(&ctx->refs)))
10084		goto out_fput;
10085
10086	ret = -EBADFD;
10087	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
10088		goto out;
10089
10090	/*
10091	 * For SQ polling, the thread will do all submissions and completions.
10092	 * Just return the requested submit count, and wake the thread if
10093	 * we were asked to.
10094	 */
10095	ret = 0;
10096	if (ctx->flags & IORING_SETUP_SQPOLL) {
10097		io_cqring_overflow_flush(ctx);
10098
10099		if (unlikely(ctx->sq_data->thread == NULL)) {
10100			ret = -EOWNERDEAD;
10101			goto out;
10102		}
10103		if (flags & IORING_ENTER_SQ_WAKEUP)
10104			wake_up(&ctx->sq_data->wait);
10105		if (flags & IORING_ENTER_SQ_WAIT) {
10106			ret = io_sqpoll_wait_sq(ctx);
10107			if (ret)
10108				goto out;
10109		}
10110		submitted = to_submit;
10111	} else if (to_submit) {
10112		ret = io_uring_add_tctx_node(ctx);
10113		if (unlikely(ret))
10114			goto out;
10115		mutex_lock(&ctx->uring_lock);
10116		submitted = io_submit_sqes(ctx, to_submit);
10117		mutex_unlock(&ctx->uring_lock);
10118
10119		if (submitted != to_submit)
10120			goto out;
10121	}
10122	if (flags & IORING_ENTER_GETEVENTS) {
10123		const sigset_t __user *sig;
10124		struct __kernel_timespec __user *ts;
10125
10126		ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
10127		if (unlikely(ret))
10128			goto out;
10129
10130		min_complete = min(min_complete, ctx->cq_entries);
10131
10132		/*
10133		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, userspace
10134		 * applications don't need to poll for completion events
10135		 * themselves; they can rely on io_sq_thread to do that polling,
10136		 * which reduces CPU usage and uring_lock contention.
10137		 */
10138		if (ctx->flags & IORING_SETUP_IOPOLL &&
10139		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
10140			ret = io_iopoll_check(ctx, min_complete);
10141		} else {
10142			ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts);
10143		}
10144	}
10145
10146out:
10147	percpu_ref_put(&ctx->refs);
10148out_fput:
10149	fdput(f);
10150	return submitted ? submitted : ret;
10151}
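/*
 * Editorial illustration, not part of this kernel source: the common
 * non-SQPOLL pattern of submitting and waiting in a single
 * io_uring_enter() call, exercising the submit and GETEVENTS paths
 * above. The helper name is invented; the raw syscall returns the
 * number of SQEs consumed, or -1 with errno set.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static long submit_and_wait(int ring_fd, unsigned int to_submit,
			    unsigned int wait_nr)
{
	return syscall(__NR_io_uring_enter, ring_fd, to_submit, wait_nr,
		       wait_nr ? IORING_ENTER_GETEVENTS : 0, NULL, 0);
}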
10152
10153#ifdef CONFIG_PROC_FS
10154static int io_uring_show_cred(struct seq_file *m, unsigned int id,
10155		const struct cred *cred)
10156{
10157	struct user_namespace *uns = seq_user_ns(m);
10158	struct group_info *gi;
10159	kernel_cap_t cap;
10160	unsigned __capi;
10161	int g;
10162
10163	seq_printf(m, "%5d\n", id);
10164	seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
10165	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
10166	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
10167	seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
10168	seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
10169	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
10170	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
10171	seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
10172	seq_puts(m, "\n\tGroups:\t");
10173	gi = cred->group_info;
10174	for (g = 0; g < gi->ngroups; g++) {
10175		seq_put_decimal_ull(m, g ? " " : "",
10176					from_kgid_munged(uns, gi->gid[g]));
10177	}
10178	seq_puts(m, "\n\tCapEff:\t");
10179	cap = cred->cap_effective;
10180	CAP_FOR_EACH_U32(__capi)
10181		seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8);
10182	seq_putc(m, '\n');
10183	return 0;
10184}
10185
10186static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
10187{
10188	int sq_pid = -1, sq_cpu = -1;
10189	bool has_lock;
10190	int i;
10191
10192	/*
10193	 * Avoid an ABBA deadlock between the seq lock and the io_uring mutex,
10194	 * since the fdinfo path acquires them in the opposite order from the
10195	 * normal use cases. If we fail to get the lock, we just don't iterate
10196	 * any structures that could be going away outside the io_uring mutex.
10197	 */
10198	has_lock = mutex_trylock(&ctx->uring_lock);
10199
10200	if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
10201		struct io_sq_data *sq = ctx->sq_data;
10202
10203		if (mutex_trylock(&sq->lock)) {
10204			if (sq->thread) {
10205				sq_pid = task_pid_nr(sq->thread);
10206				sq_cpu = task_cpu(sq->thread);
10207			}
10208			mutex_unlock(&sq->lock);
10209		}
10210	}
10211
10212	seq_printf(m, "SqThread:\t%d\n", sq_pid);
10213	seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
10214	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
10215	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
10216		struct file *f = io_file_from_index(ctx, i);
10217
10218		if (f)
10219			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
10220		else
10221			seq_printf(m, "%5u: <none>\n", i);
10222	}
10223	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
10224	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
10225		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
10226		unsigned int len = buf->ubuf_end - buf->ubuf;
10227
10228		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
10229	}
10230	if (has_lock && !xa_empty(&ctx->personalities)) {
10231		unsigned long index;
10232		const struct cred *cred;
10233
10234		seq_printf(m, "Personalities:\n");
10235		xa_for_each(&ctx->personalities, index, cred)
10236			io_uring_show_cred(m, index, cred);
10237	}
10238	seq_printf(m, "PollList:\n");
10239	spin_lock(&ctx->completion_lock);
10240	for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
10241		struct hlist_head *list = &ctx->cancel_hash[i];
10242		struct io_kiocb *req;
10243
10244		hlist_for_each_entry(req, list, hash_node)
10245			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
10246					req->task->task_works != NULL);
10247	}
10248	spin_unlock(&ctx->completion_lock);
10249	if (has_lock)
10250		mutex_unlock(&ctx->uring_lock);
10251}
10252
10253static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
10254{
10255	struct io_ring_ctx *ctx = f->private_data;
10256
10257	if (percpu_ref_tryget(&ctx->refs)) {
10258		__io_uring_show_fdinfo(ctx, m);
10259		percpu_ref_put(&ctx->refs);
10260	}
10261}
10262#endif
10263
10264static const struct file_operations io_uring_fops = {
10265	.release	= io_uring_release,
10266	.mmap		= io_uring_mmap,
10267#ifndef CONFIG_MMU
10268	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
10269	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
10270#endif
10271	.poll		= io_uring_poll,
10272#ifdef CONFIG_PROC_FS
10273	.show_fdinfo	= io_uring_show_fdinfo,
10274#endif
10275};
10276
10277static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
10278				  struct io_uring_params *p)
10279{
10280	struct io_rings *rings;
10281	size_t size, sq_array_offset;
10282
10283	/* make sure these are sane, as we already accounted them */
10284	ctx->sq_entries = p->sq_entries;
10285	ctx->cq_entries = p->cq_entries;
10286
10287	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
10288	if (size == SIZE_MAX)
10289		return -EOVERFLOW;
10290
10291	rings = io_mem_alloc(size);
10292	if (!rings)
10293		return -ENOMEM;
10294
10295	ctx->rings = rings;
10296	ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
10297	rings->sq_ring_mask = p->sq_entries - 1;
10298	rings->cq_ring_mask = p->cq_entries - 1;
10299	rings->sq_ring_entries = p->sq_entries;
10300	rings->cq_ring_entries = p->cq_entries;
10301
10302	size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
10303	if (size == SIZE_MAX) {
10304		io_mem_free(ctx->rings);
10305		ctx->rings = NULL;
10306		return -EOVERFLOW;
10307	}
10308
10309	ctx->sq_sqes = io_mem_alloc(size);
10310	if (!ctx->sq_sqes) {
10311		io_mem_free(ctx->rings);
10312		ctx->rings = NULL;
10313		return -ENOMEM;
10314	}
10315
10316	return 0;
10317}
10318
10319static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
10320{
10321	int ret, fd;
10322
10323	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
10324	if (fd < 0)
10325		return fd;
10326
10327	ret = io_uring_add_tctx_node(ctx);
10328	if (ret) {
10329		put_unused_fd(fd);
10330		return ret;
10331	}
10332	fd_install(fd, file);
10333	return fd;
10334}
10335
10336/*
10337 * Allocate an anonymous fd, which is what constitutes the application-
10338 * visible backing of an io_uring instance. The application mmaps this
10339 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
10340 * we have to tie this fd to a socket for file garbage collection purposes.
10341 */
10342static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
10343{
10344	struct file *file;
10345#if defined(CONFIG_UNIX)
10346	int ret;
10347
10348	ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
10349				&ctx->ring_sock);
10350	if (ret)
10351		return ERR_PTR(ret);
10352#endif
10353
10354	file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
10355					O_RDWR | O_CLOEXEC);
10356#if defined(CONFIG_UNIX)
10357	if (IS_ERR(file)) {
10358		sock_release(ctx->ring_sock);
10359		ctx->ring_sock = NULL;
10360	} else {
10361		ctx->ring_sock->file = file;
10362	}
10363#endif
10364	return file;
10365}
10366
10367static int io_uring_create(unsigned entries, struct io_uring_params *p,
10368			   struct io_uring_params __user *params)
10369{
10370	struct io_ring_ctx *ctx;
10371	struct file *file;
10372	int ret;
10373
10374	if (!entries)
10375		return -EINVAL;
10376	if (entries > IORING_MAX_ENTRIES) {
10377		if (!(p->flags & IORING_SETUP_CLAMP))
10378			return -EINVAL;
10379		entries = IORING_MAX_ENTRIES;
10380	}
10381
10382	/*
10383	 * Use twice as many entries for the CQ ring. It's possible for the
10384	 * application to drive a higher depth than the size of the SQ ring,
10385	 * since the sqes are only used at submission time. This allows some
10386	 * flexibility in overcommitting. If the application has
10387	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
10388	 * of CQ ring entries manually.
10389	 */
10390	p->sq_entries = roundup_pow_of_two(entries);
10391	if (p->flags & IORING_SETUP_CQSIZE) {
10392		/*
10393		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
10394		 * to a power-of-two, if it isn't already. We do NOT impose
10395		 * any cq vs sq ring sizing.
10396		 */
10397		if (!p->cq_entries)
10398			return -EINVAL;
10399		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
10400			if (!(p->flags & IORING_SETUP_CLAMP))
10401				return -EINVAL;
10402			p->cq_entries = IORING_MAX_CQ_ENTRIES;
10403		}
10404		p->cq_entries = roundup_pow_of_two(p->cq_entries);
10405		if (p->cq_entries < p->sq_entries)
10406			return -EINVAL;
10407	} else {
10408		p->cq_entries = 2 * p->sq_entries;
10409	}
10410
10411	ctx = io_ring_ctx_alloc(p);
10412	if (!ctx)
10413		return -ENOMEM;
10414	ctx->compat = in_compat_syscall();
10415	if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
10416		ctx->user = get_uid(current_user());
10417
10418	/*
10419	 * This is just grabbed for accounting purposes. When a process exits,
10420	 * the mm is exited and dropped before the files, hence we need to hang
10421	 * on to this mm purely so that we can unaccount memory (locked/pinned
10422	 * vm). It's not used for anything else.
10423	 */
10424	mmgrab(current->mm);
10425	ctx->mm_account = current->mm;
10426
10427	ret = io_allocate_scq_urings(ctx, p);
10428	if (ret)
10429		goto err;
10430
10431	ret = io_sq_offload_create(ctx, p);
10432	if (ret)
10433		goto err;
10434	/* always set a rsrc node */
10435	ret = io_rsrc_node_switch_start(ctx);
10436	if (ret)
10437		goto err;
10438	io_rsrc_node_switch(ctx, NULL);
10439
10440	memset(&p->sq_off, 0, sizeof(p->sq_off));
10441	p->sq_off.head = offsetof(struct io_rings, sq.head);
10442	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
10443	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
10444	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
10445	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
10446	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
10447	p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
10448
10449	memset(&p->cq_off, 0, sizeof(p->cq_off));
10450	p->cq_off.head = offsetof(struct io_rings, cq.head);
10451	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
10452	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
10453	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
10454	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
10455	p->cq_off.cqes = offsetof(struct io_rings, cqes);
10456	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
10457
10458	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
10459			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
10460			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
10461			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
10462			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
10463			IORING_FEAT_RSRC_TAGS;
10464
10465	if (copy_to_user(params, p, sizeof(*p))) {
10466		ret = -EFAULT;
10467		goto err;
10468	}
10469
10470	file = io_uring_get_file(ctx);
10471	if (IS_ERR(file)) {
10472		ret = PTR_ERR(file);
10473		goto err;
10474	}
10475
10476	/*
10477	 * Install the ring fd as the very last thing, so we don't risk someone
10478	 * having closed it before we finish setup.
10479	 */
10480	ret = io_uring_install_fd(ctx, file);
10481	if (ret < 0) {
10482		/* fput will clean it up */
10483		fput(file);
10484		return ret;
10485	}
10486
10487	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
10488	return ret;
10489err:
10490	io_ring_ctx_wait_and_kill(ctx);
10491	return ret;
10492}
10493
10494/*
10495 * Sets up an io_uring context and returns the fd. The application asks for a
10496 * ring size; we return the actual sq/cq ring sizes (among other things) in the
10497 * params structure passed in.
10498 */
10499static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
10500{
10501	struct io_uring_params p;
10502	int i;
10503
10504	if (copy_from_user(&p, params, sizeof(p)))
10505		return -EFAULT;
10506	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
10507		if (p.resv[i])
10508			return -EINVAL;
10509	}
10510
10511	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
10512			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
10513			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
10514			IORING_SETUP_R_DISABLED))
10515		return -EINVAL;
10516
10517	return  io_uring_create(entries, &p, params);
10518}
10519
10520SYSCALL_DEFINE2(io_uring_setup, u32, entries,
10521		struct io_uring_params __user *, params)
10522{
10523	return io_uring_setup(entries, params);
10524}
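/*
 * Editorial illustration, not part of this kernel source: creating a
 * ring with an explicit CQ size, matching the IORING_SETUP_CQSIZE and
 * IORING_SETUP_CLAMP handling in io_uring_create() above. The helper
 * name is invented. On success the kernel writes the actual (rounded-up,
 * possibly clamped) ring sizes and the mmap offsets back into *p.
 */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int setup_ring(unsigned int sq_entries, unsigned int cq_entries,
		      struct io_uring_params *p)
{
	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP;
	p->cq_entries = cq_entries;

	/* Returns the ring fd, or -1 with errno set */
	return syscall(__NR_io_uring_setup, sq_entries, p);
}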
10525
10526static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
10527{
10528	struct io_uring_probe *p;
10529	size_t size;
10530	int i, ret;
10531
10532	size = struct_size(p, ops, nr_args);
10533	if (size == SIZE_MAX)
10534		return -EOVERFLOW;
10535	p = kzalloc(size, GFP_KERNEL);
10536	if (!p)
10537		return -ENOMEM;
10538
10539	ret = -EFAULT;
10540	if (copy_from_user(p, arg, size))
10541		goto out;
10542	ret = -EINVAL;
10543	if (memchr_inv(p, 0, size))
10544		goto out;
10545
10546	p->last_op = IORING_OP_LAST - 1;
10547	if (nr_args > IORING_OP_LAST)
10548		nr_args = IORING_OP_LAST;
10549
10550	for (i = 0; i < nr_args; i++) {
10551		p->ops[i].op = i;
10552		if (!io_op_defs[i].not_supported)
10553			p->ops[i].flags = IO_URING_OP_SUPPORTED;
10554	}
10555	p->ops_len = i;
10556
10557	ret = 0;
10558	if (copy_to_user(arg, p, size))
10559		ret = -EFAULT;
10560out:
10561	kfree(p);
10562	return ret;
10563}
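/*
 * Editorial illustration, not part of this kernel source: querying
 * opcode support through IORING_REGISTER_PROBE, which io_probe() above
 * services. The helper name is invented; the probe buffer must be
 * zeroed, and nr_args is capped at 256 by the register path below.
 */
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int opcode_is_supported(int ring_fd, int opcode)
{
	size_t len = sizeof(struct io_uring_probe) +
		     256 * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *probe = calloc(1, len);
	int supported = 0;

	if (!probe)
		return -1;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    probe, 256) == 0)
		supported = opcode <= probe->last_op &&
			    (probe->ops[opcode].flags & IO_URING_OP_SUPPORTED);
	free(probe);
	return supported;
}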
10564
10565static int io_register_personality(struct io_ring_ctx *ctx)
10566{
10567	const struct cred *creds;
10568	u32 id;
10569	int ret;
10570
10571	creds = get_current_cred();
10572
10573	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
10574			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
10575	if (ret < 0) {
10576		put_cred(creds);
10577		return ret;
10578	}
10579	return id;
10580}
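/*
 * Editorial illustration, not part of this kernel source: registering
 * the caller's current credentials as a personality (serviced by
 * io_register_personality() above) and attaching the returned id to an
 * SQE. The helper name is invented and error handling is minimal.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int use_personality(int ring_fd, struct io_uring_sqe *sqe)
{
	int id = syscall(__NR_io_uring_register, ring_fd,
			 IORING_REGISTER_PERSONALITY, NULL, 0);

	if (id < 0)
		return -1;
	/* Requests carrying this id run with the creds captured above */
	sqe->personality = (__u16)id;
	return id;
}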
10581
10582static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg,
10583				    unsigned int nr_args)
10584{
10585	struct io_uring_restriction *res;
10586	size_t size;
10587	int i, ret;
10588
10589	/* Restrictions allowed only if rings started disabled */
10590	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10591		return -EBADFD;
10592
10593	/* We allow only a single restrictions registration */
10594	if (ctx->restrictions.registered)
10595		return -EBUSY;
10596
10597	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
10598		return -EINVAL;
10599
10600	size = array_size(nr_args, sizeof(*res));
10601	if (size == SIZE_MAX)
10602		return -EOVERFLOW;
10603
10604	res = memdup_user(arg, size);
10605	if (IS_ERR(res))
10606		return PTR_ERR(res);
10607
10608	ret = 0;
10609
10610	for (i = 0; i < nr_args; i++) {
10611		switch (res[i].opcode) {
10612		case IORING_RESTRICTION_REGISTER_OP:
10613			if (res[i].register_op >= IORING_REGISTER_LAST) {
10614				ret = -EINVAL;
10615				goto out;
10616			}
10617
10618			__set_bit(res[i].register_op,
10619				  ctx->restrictions.register_op);
10620			break;
10621		case IORING_RESTRICTION_SQE_OP:
10622			if (res[i].sqe_op >= IORING_OP_LAST) {
10623				ret = -EINVAL;
10624				goto out;
10625			}
10626
10627			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
10628			break;
10629		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
10630			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
10631			break;
10632		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
10633			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
10634			break;
10635		default:
10636			ret = -EINVAL;
10637			goto out;
10638		}
10639	}
10640
10641out:
10642	/* Reset all restrictions if an error happened */
10643	if (ret != 0)
10644		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
10645	else
10646		ctx->restrictions.registered = true;
10647
10648	kfree(res);
10649	return ret;
10650}
10651
10652static int io_register_enable_rings(struct io_ring_ctx *ctx)
10653{
10654	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
10655		return -EBADFD;
10656
10657	if (ctx->restrictions.registered)
10658		ctx->restricted = 1;
10659
10660	ctx->flags &= ~IORING_SETUP_R_DISABLED;
10661	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
10662		wake_up(&ctx->sq_data->wait);
10663	return 0;
10664}
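/*
 * Editorial illustration, not part of this kernel source: the intended
 * IORING_SETUP_R_DISABLED flow serviced by the two functions above.
 * A restriction set is registered on the still-disabled ring, then the
 * ring is enabled, after which the restrictions are enforced. The helper
 * name is invented; note that SQE flags would additionally need an
 * IORING_RESTRICTION_SQE_FLAGS_ALLOWED entry.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* ring_fd must have been created with IORING_SETUP_R_DISABLED */
static int restrict_to_rw(int ring_fd)
{
	struct io_uring_restriction res[2] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READV },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITEV },
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RESTRICTIONS, res, 2))
		return -1;

	/* Lifts IORING_SETUP_R_DISABLED; restrictions apply from here on */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}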
10665
10666static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
10667				     struct io_uring_rsrc_update2 *up,
10668				     unsigned nr_args)
10669{
10670	__u32 tmp;
10671	int err;
10672
10673	if (check_add_overflow(up->offset, nr_args, &tmp))
10674		return -EOVERFLOW;
10675	err = io_rsrc_node_switch_start(ctx);
10676	if (err)
10677		return err;
10678
10679	switch (type) {
10680	case IORING_RSRC_FILE:
10681		return __io_sqe_files_update(ctx, up, nr_args);
10682	case IORING_RSRC_BUFFER:
10683		return __io_sqe_buffers_update(ctx, up, nr_args);
10684	}
10685	return -EINVAL;
10686}
10687
10688static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
10689				    unsigned nr_args)
10690{
10691	struct io_uring_rsrc_update2 up;
10692
10693	if (!nr_args)
10694		return -EINVAL;
10695	memset(&up, 0, sizeof(up));
10696	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
10697		return -EFAULT;
10698	if (up.resv || up.resv2)
10699		return -EINVAL;
10700	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
10701}
10702
10703static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
10704				   unsigned size, unsigned type)
10705{
10706	struct io_uring_rsrc_update2 up;
10707
10708	if (size != sizeof(up))
10709		return -EINVAL;
10710	if (copy_from_user(&up, arg, sizeof(up)))
10711		return -EFAULT;
10712	if (!up.nr || up.resv || up.resv2)
10713		return -EINVAL;
10714	return __io_register_rsrc_update(ctx, type, &up, up.nr);
10715}
10716
10717static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
10718			    unsigned int size, unsigned int type)
10719{
10720	struct io_uring_rsrc_register rr;
10721
10722	/* keep it extendible */
10723	if (size != sizeof(rr))
10724		return -EINVAL;
10725
10726	memset(&rr, 0, sizeof(rr));
10727	if (copy_from_user(&rr, arg, size))
10728		return -EFAULT;
10729	if (!rr.nr || rr.resv || rr.resv2)
10730		return -EINVAL;
10731
10732	switch (type) {
10733	case IORING_RSRC_FILE:
10734		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
10735					     rr.nr, u64_to_user_ptr(rr.tags));
10736	case IORING_RSRC_BUFFER:
10737		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
10738					       rr.nr, u64_to_user_ptr(rr.tags));
10739	}
10740	return -EINVAL;
10741}
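/*
 * Editorial illustration, not part of this kernel source: the
 * io_uring_rsrc_register layout consumed by io_register_rsrc() above,
 * used here to register a fixed-file table via IORING_REGISTER_FILES2
 * without tags. The helper name is invented; note that nr_args must be
 * sizeof(struct io_uring_rsrc_register) for this opcode.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int register_fixed_files(int ring_fd, const int *fds, unsigned int nr)
{
	struct io_uring_rsrc_register rr;

	memset(&rr, 0, sizeof(rr));
	rr.nr = nr;
	rr.data = (uint64_t)(uintptr_t)fds;	/* array of nr fds; -1 leaves a sparse slot */
	rr.tags = 0;				/* no tags */

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES2, &rr, sizeof(rr));
}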
10742
10743static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg,
10744				unsigned len)
10745{
10746	struct io_uring_task *tctx = current->io_uring;
10747	cpumask_var_t new_mask;
10748	int ret;
10749
10750	if (!tctx || !tctx->io_wq)
10751		return -EINVAL;
10752
10753	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
10754		return -ENOMEM;
10755
10756	cpumask_clear(new_mask);
10757	if (len > cpumask_size())
10758		len = cpumask_size();
10759
10760#ifdef CONFIG_COMPAT
10761	if (in_compat_syscall()) {
10762		ret = compat_get_bitmap(cpumask_bits(new_mask),
10763					(const compat_ulong_t __user *)arg,
10764					len * 8 /* CHAR_BIT */);
10765	} else {
10766		ret = copy_from_user(new_mask, arg, len);
10767	}
10768#else
10769	ret = copy_from_user(new_mask, arg, len);
10770#endif
10771
10772	if (ret) {
10773		free_cpumask_var(new_mask);
10774		return -EFAULT;
10775	}
10776
10777	ret = io_wq_cpu_affinity(tctx->io_wq, new_mask);
10778	free_cpumask_var(new_mask);
10779	return ret;
10780}
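/*
 * Editorial illustration, not part of this kernel source: pinning this
 * task's io-wq workers to a single CPU via IORING_REGISTER_IOWQ_AFF, the
 * operation handled above. The kernel copies in a raw CPU bitmap, so a
 * cpu_set_t works as the argument; the helper name is invented.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int pin_iowq_to_cpu(int ring_fd, int cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}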
10781
10782static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
10783{
10784	struct io_uring_task *tctx = current->io_uring;
10785
10786	if (!tctx || !tctx->io_wq)
10787		return -EINVAL;
10788
10789	return io_wq_cpu_affinity(tctx->io_wq, NULL);
10790}
10791
10792static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
10793					void __user *arg)
10794	__must_hold(&ctx->uring_lock)
10795{
10796	struct io_tctx_node *node;
10797	struct io_uring_task *tctx = NULL;
10798	struct io_sq_data *sqd = NULL;
10799	__u32 new_count[2];
10800	int i, ret;
10801
10802	if (copy_from_user(new_count, arg, sizeof(new_count)))
10803		return -EFAULT;
10804	for (i = 0; i < ARRAY_SIZE(new_count); i++)
10805		if (new_count[i] > INT_MAX)
10806			return -EINVAL;
10807
10808	if (ctx->flags & IORING_SETUP_SQPOLL) {
10809		sqd = ctx->sq_data;
10810		if (sqd) {
10811			/*
10812			 * Observe the correct sqd->lock -> ctx->uring_lock
10813			 * ordering. It's fine to drop uring_lock here, since
10814			 * we hold a ref to the ctx.
10815			 */
10816			refcount_inc(&sqd->refs);
10817			mutex_unlock(&ctx->uring_lock);
10818			mutex_lock(&sqd->lock);
10819			mutex_lock(&ctx->uring_lock);
10820			if (sqd->thread)
10821				tctx = sqd->thread->io_uring;
10822		}
10823	} else {
10824		tctx = current->io_uring;
10825	}
10826
10827	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
10828
10829	for (i = 0; i < ARRAY_SIZE(new_count); i++)
10830		if (new_count[i])
10831			ctx->iowq_limits[i] = new_count[i];
10832	ctx->iowq_limits_set = true;
10833
10834	ret = -EINVAL;
10835	if (tctx && tctx->io_wq) {
10836		ret = io_wq_max_workers(tctx->io_wq, new_count);
10837		if (ret)
10838			goto err;
10839	} else {
10840		memset(new_count, 0, sizeof(new_count));
10841	}
10842
10843	if (sqd) {
10844		mutex_unlock(&sqd->lock);
10845		io_put_sq_data(sqd);
10846	}
10847
10848	if (copy_to_user(arg, new_count, sizeof(new_count)))
10849		return -EFAULT;
10850
10851	/* that's it for SQPOLL; only the SQPOLL task creates requests */
10852	if (sqd)
10853		return 0;
10854
10855	/* now propagate the restriction to all registered users */
10856	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
10857		struct io_uring_task *tctx = node->task->io_uring;
10858
10859		if (WARN_ON_ONCE(!tctx->io_wq))
10860			continue;
10861
10862		for (i = 0; i < ARRAY_SIZE(new_count); i++)
10863			new_count[i] = ctx->iowq_limits[i];
10864		/* ignore errors, it always returns zero anyway */
10865		(void)io_wq_max_workers(tctx->io_wq, new_count);
10866	}
10867	return 0;
10868err:
10869	if (sqd) {
10870		mutex_unlock(&sqd->lock);
10871		io_put_sq_data(sqd);
10872	}
10873	return ret;
10874}
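/*
 * Editorial illustration, not part of this kernel source: capping the
 * bounded/unbounded io-wq worker counts via
 * IORING_REGISTER_IOWQ_MAX_WORKERS, as implemented above. The helper
 * name is invented. A zero entry keeps the current limit, and the
 * previous limits are written back into the array.
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int cap_iowq_workers(int ring_fd, unsigned int bounded,
			    unsigned int unbounded)
{
	unsigned int counts[2] = { bounded, unbounded };

	/* On return, counts[] holds the limits that were in effect before */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
}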
10875
10876static bool io_register_op_must_quiesce(int op)
10877{
10878	switch (op) {
10879	case IORING_REGISTER_BUFFERS:
10880	case IORING_UNREGISTER_BUFFERS:
10881	case IORING_REGISTER_FILES:
10882	case IORING_UNREGISTER_FILES:
10883	case IORING_REGISTER_FILES_UPDATE:
10884	case IORING_REGISTER_PROBE:
10885	case IORING_REGISTER_PERSONALITY:
10886	case IORING_UNREGISTER_PERSONALITY:
10887	case IORING_REGISTER_FILES2:
10888	case IORING_REGISTER_FILES_UPDATE2:
10889	case IORING_REGISTER_BUFFERS2:
10890	case IORING_REGISTER_BUFFERS_UPDATE:
10891	case IORING_REGISTER_IOWQ_AFF:
10892	case IORING_UNREGISTER_IOWQ_AFF:
10893	case IORING_REGISTER_IOWQ_MAX_WORKERS:
10894		return false;
10895	default:
10896		return true;
10897	}
10898}
10899
10900static int io_ctx_quiesce(struct io_ring_ctx *ctx)
10901{
10902	long ret;
10903
10904	percpu_ref_kill(&ctx->refs);
10905
10906	/*
10907	 * Drop uring mutex before waiting for references to exit. If another
10908	 * thread is currently inside io_uring_enter() it might need to grab the
10909	 * uring_lock to make progress. If we hold it here across the drain
10910	 * wait, then we can deadlock. It's safe to drop the mutex here, since
10911	 * no new references will come in after we've killed the percpu ref.
10912	 */
10913	mutex_unlock(&ctx->uring_lock);
10914	do {
10915		ret = wait_for_completion_interruptible(&ctx->ref_comp);
10916		if (!ret)
10917			break;
10918		ret = io_run_task_work_sig();
10919	} while (ret >= 0);
10920	mutex_lock(&ctx->uring_lock);
10921
10922	if (ret)
10923		io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
10924	return ret;
10925}
10926
10927static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
10928			       void __user *arg, unsigned nr_args)
10929	__releases(ctx->uring_lock)
10930	__acquires(ctx->uring_lock)
10931{
10932	int ret;
10933
10934	/*
10935	 * We're inside the ring mutex; if the ref is already dying, then
10936	 * someone else killed the ctx or is already going through
10937	 * io_uring_register().
10938	 */
10939	if (percpu_ref_is_dying(&ctx->refs))
10940		return -ENXIO;
10941
10942	if (ctx->restricted) {
10943		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
10944		if (!test_bit(opcode, ctx->restrictions.register_op))
10945			return -EACCES;
10946	}
10947
10948	if (io_register_op_must_quiesce(opcode)) {
10949		ret = io_ctx_quiesce(ctx);
10950		if (ret)
10951			return ret;
10952	}
10953
10954	switch (opcode) {
10955	case IORING_REGISTER_BUFFERS:
10956		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
10957		break;
10958	case IORING_UNREGISTER_BUFFERS:
10959		ret = -EINVAL;
10960		if (arg || nr_args)
10961			break;
10962		ret = io_sqe_buffers_unregister(ctx);
10963		break;
10964	case IORING_REGISTER_FILES:
10965		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
10966		break;
10967	case IORING_UNREGISTER_FILES:
10968		ret = -EINVAL;
10969		if (arg || nr_args)
10970			break;
10971		ret = io_sqe_files_unregister(ctx);
10972		break;
10973	case IORING_REGISTER_FILES_UPDATE:
10974		ret = io_register_files_update(ctx, arg, nr_args);
10975		break;
10976	case IORING_REGISTER_EVENTFD:
10977	case IORING_REGISTER_EVENTFD_ASYNC:
10978		ret = -EINVAL;
10979		if (nr_args != 1)
10980			break;
10981		ret = io_eventfd_register(ctx, arg);
10982		if (ret)
10983			break;
10984		if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
10985			ctx->eventfd_async = 1;
10986		else
10987			ctx->eventfd_async = 0;
10988		break;
10989	case IORING_UNREGISTER_EVENTFD:
10990		ret = -EINVAL;
10991		if (arg || nr_args)
10992			break;
10993		ret = io_eventfd_unregister(ctx);
10994		break;
10995	case IORING_REGISTER_PROBE:
10996		ret = -EINVAL;
10997		if (!arg || nr_args > 256)
10998			break;
10999		ret = io_probe(ctx, arg, nr_args);
11000		break;
11001	case IORING_REGISTER_PERSONALITY:
11002		ret = -EINVAL;
11003		if (arg || nr_args)
11004			break;
11005		ret = io_register_personality(ctx);
11006		break;
11007	case IORING_UNREGISTER_PERSONALITY:
11008		ret = -EINVAL;
11009		if (arg)
11010			break;
11011		ret = io_unregister_personality(ctx, nr_args);
11012		break;
11013	case IORING_REGISTER_ENABLE_RINGS:
11014		ret = -EINVAL;
11015		if (arg || nr_args)
11016			break;
11017		ret = io_register_enable_rings(ctx);
11018		break;
11019	case IORING_REGISTER_RESTRICTIONS:
11020		ret = io_register_restrictions(ctx, arg, nr_args);
11021		break;
11022	case IORING_REGISTER_FILES2:
11023		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
11024		break;
11025	case IORING_REGISTER_FILES_UPDATE2:
11026		ret = io_register_rsrc_update(ctx, arg, nr_args,
11027					      IORING_RSRC_FILE);
11028		break;
11029	case IORING_REGISTER_BUFFERS2:
11030		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
11031		break;
11032	case IORING_REGISTER_BUFFERS_UPDATE:
11033		ret = io_register_rsrc_update(ctx, arg, nr_args,
11034					      IORING_RSRC_BUFFER);
11035		break;
11036	case IORING_REGISTER_IOWQ_AFF:
11037		ret = -EINVAL;
11038		if (!arg || !nr_args)
11039			break;
11040		ret = io_register_iowq_aff(ctx, arg, nr_args);
11041		break;
11042	case IORING_UNREGISTER_IOWQ_AFF:
11043		ret = -EINVAL;
11044		if (arg || nr_args)
11045			break;
11046		ret = io_unregister_iowq_aff(ctx);
11047		break;
11048	case IORING_REGISTER_IOWQ_MAX_WORKERS:
11049		ret = -EINVAL;
11050		if (!arg || nr_args != 2)
11051			break;
11052		ret = io_register_iowq_max_workers(ctx, arg);
11053		break;
11054	default:
11055		ret = -EINVAL;
11056		break;
11057	}
11058
11059	if (io_register_op_must_quiesce(opcode)) {
11060		/* bring the ctx back to life */
11061		percpu_ref_reinit(&ctx->refs);
11062		reinit_completion(&ctx->ref_comp);
11063	}
11064	return ret;
11065}
11066
11067SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
11068		void __user *, arg, unsigned int, nr_args)
11069{
11070	struct io_ring_ctx *ctx;
11071	long ret = -EBADF;
11072	struct fd f;
11073
11074	if (opcode >= IORING_REGISTER_LAST)
11075		return -EINVAL;
11076
11077	f = fdget(fd);
11078	if (!f.file)
11079		return -EBADF;
11080
11081	ret = -EOPNOTSUPP;
11082	if (f.file->f_op != &io_uring_fops)
11083		goto out_fput;
11084
11085	ctx = f.file->private_data;
11086
11087	io_run_task_work();
11088
11089	mutex_lock(&ctx->uring_lock);
11090	ret = __io_uring_register(ctx, opcode, arg, nr_args);
11091	mutex_unlock(&ctx->uring_lock);
11092	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
11093							ctx->cq_ev_fd != NULL, ret);
11094out_fput:
11095	fdput(f);
11096	return ret;
11097}
11098
11099static int __init io_uring_init(void)
11100{
11101#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \
11102	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
11103	BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \
11104} while (0)
11105
11106#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
11107	__BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename)
11108	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
11109	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
11110	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
11111	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
11112	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
11113	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
11114	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
11115	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
11116	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
11117	BUILD_BUG_SQE_ELEM(24, __u32,  len);
11118	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
11119	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
11120	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
11121	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
11122	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
11123	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
11124	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
11125	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
11126	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
11127	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
11128	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
11129	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
11130	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
11131	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
11132	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
11133	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
11134	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
11135	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
11136	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
11137	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
11138	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
11139
11140	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
11141		     sizeof(struct io_uring_rsrc_update));
11142	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
11143		     sizeof(struct io_uring_rsrc_update2));
11144
11145	/* ->buf_index is u16 */
11146	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
11147
11148	/* should fit into one byte */
11149	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
11150
11151	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
11152	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
11153
11154	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
11155				SLAB_ACCOUNT);
11156	return 0;
11157}
11158__initcall(io_uring_init);
11159