xref: /kernel/linux/linux-6.6/io_uring/net.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/kernel.h>
3#include <linux/errno.h>
4#include <linux/file.h>
5#include <linux/slab.h>
6#include <linux/net.h>
7#include <linux/compat.h>
8#include <net/compat.h>
9#include <linux/io_uring.h>
10
11#include <uapi/linux/io_uring.h>
12
13#include "io_uring.h"
14#include "kbuf.h"
15#include "alloc_cache.h"
16#include "net.h"
17#include "notif.h"
18#include "rsrc.h"
19
20#if defined(CONFIG_NET)
21struct io_shutdown {
22	struct file			*file;
23	int				how;
24};
25
26struct io_accept {
27	struct file			*file;
28	struct sockaddr __user		*addr;
29	int __user			*addr_len;
30	int				flags;
31	u32				file_slot;
32	unsigned long			nofile;
33};
34
35struct io_socket {
36	struct file			*file;
37	int				domain;
38	int				type;
39	int				protocol;
40	int				flags;
41	u32				file_slot;
42	unsigned long			nofile;
43};
44
45struct io_connect {
46	struct file			*file;
47	struct sockaddr __user		*addr;
48	int				addr_len;
49	bool				in_progress;
50	bool				seen_econnaborted;
51};
52
53struct io_sr_msg {
54	struct file			*file;
55	union {
56		struct compat_msghdr __user	*umsg_compat;
57		struct user_msghdr __user	*umsg;
58		void __user			*buf;
59	};
60	unsigned			len;
61	unsigned			done_io;
62	unsigned			msg_flags;
63	unsigned			nr_multishot_loops;
64	u16				flags;
65	/* initialised and used only by !msg send variants */
66	u16				addr_len;
67	u16				buf_group;
68	void __user			*addr;
69	void __user			*msg_control;
70	/* used only for send zerocopy */
71	struct io_kiocb 		*notif;
72};
73
74/*
75 * Number of times we'll try to do receives if there's more data. If we
76 * exceed this limit, then requeue the request at the back of the queue and
77 * retry from there. This helps ensure fairness between flooding clients.
78 */
79#define MULTISHOT_MAX_RETRY	32
80
81static inline bool io_check_multishot(struct io_kiocb *req,
82				      unsigned int issue_flags)
83{
84	/*
85	 * When ->locked_cq is set we only allow posting CQEs from the original
86	 * task context. Usual request completions will be handled in other
87	 * generic paths, but multishot poll may decide to post extra CQEs.
88	 */
89	return !(issue_flags & IO_URING_F_IOWQ) ||
90		!(req->flags & REQ_F_APOLL_MULTISHOT) ||
91		!req->ctx->task_complete;
92}
93
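/*
 * IORING_OP_SHUTDOWN: shut down part of a full-duplex connection. The
 * 'how' argument (SHUT_RD/SHUT_WR/SHUT_RDWR) is carried in sqe->len, and
 * the request is forced to io-wq so the issue side never runs nonblocking.
 */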
94int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
95{
96	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
97
98	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
99		     sqe->buf_index || sqe->splice_fd_in))
100		return -EINVAL;
101
102	shutdown->how = READ_ONCE(sqe->len);
103	req->flags |= REQ_F_FORCE_ASYNC;
104	return 0;
105}
106
107int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
108{
109	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
110	struct socket *sock;
111	int ret;
112
113	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
114
115	sock = sock_from_file(req->file);
116	if (unlikely(!sock))
117		return -ENOTSOCK;
118
119	ret = __sys_shutdown_sock(sock, shutdown->how);
120	io_req_set_res(req, ret, 0);
121	return IOU_OK;
122}
123
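/*
 * Decide whether a partial transfer is worth retrying: only for
 * MSG_WAITALL, and only on stream or seqpacket sockets where the
 * remaining data continues the same stream/message.
 */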
124static bool io_net_retry(struct socket *sock, int flags)
125{
126	if (!(flags & MSG_WAITALL))
127		return false;
128	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
129}
130
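/*
 * Try to stash the async msghdr in the ring's netmsg cache for reuse.
 * Only possible when the issuer holds the uring lock; otherwise (or if
 * the cache is full) the normal cleanup path frees it.
 */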
131static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
132{
133	struct io_async_msghdr *hdr = req->async_data;
134
135	if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED)
136		return;
137
138	/* Let normal cleanup path reap it if we fail adding to the cache */
139	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
140		req->async_data = NULL;
141		req->flags &= ~REQ_F_ASYNC_DATA;
142	}
143}
144
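/*
 * Allocate async msghdr state for this request, preferring an entry from
 * the ring's netmsg cache when the uring lock is held and falling back
 * to a plain async data allocation otherwise.
 */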
145static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
146						  unsigned int issue_flags)
147{
148	struct io_ring_ctx *ctx = req->ctx;
149	struct io_cache_entry *entry;
150	struct io_async_msghdr *hdr;
151
152	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
153		entry = io_alloc_cache_get(&ctx->netmsg_cache);
154		if (entry) {
155			hdr = container_of(entry, struct io_async_msghdr, cache);
156			hdr->free_iov = NULL;
157			req->flags |= REQ_F_ASYNC_DATA;
158			req->async_data = hdr;
159			return hdr;
160		}
161	}
162
163	if (!io_alloc_async_data(req)) {
164		hdr = req->async_data;
165		hdr->free_iov = NULL;
166		return hdr;
167	}
168	return NULL;
169}
170
171static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req)
172{
173	/* ->prep_async is always called from the submission context */
174	return io_msg_alloc_async(req, 0);
175}
176
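/*
 * Move the on-stack msghdr state into persistent async data so the
 * send/recv can be retried later, fixing up the msg_name and fast_iov
 * pointers to reference the new copy. Returns -EAGAIN on success so the
 * caller knows to go async.
 */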
177static int io_setup_async_msg(struct io_kiocb *req,
178			      struct io_async_msghdr *kmsg,
179			      unsigned int issue_flags)
180{
181	struct io_async_msghdr *async_msg;
182
183	if (req_has_async_data(req))
184		return -EAGAIN;
185	async_msg = io_msg_alloc_async(req, issue_flags);
186	if (!async_msg) {
187		kfree(kmsg->free_iov);
188		return -ENOMEM;
189	}
190	req->flags |= REQ_F_NEED_CLEANUP;
191	memcpy(async_msg, kmsg, sizeof(*kmsg));
192	if (async_msg->msg.msg_name)
193		async_msg->msg.msg_name = &async_msg->addr;
194
195	if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
196		return -EAGAIN;
197
198	/* if we're using fast_iov, set it to the new one */
199	if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
200		size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
201		async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
202	}
203
204	return -EAGAIN;
205}
206
207#ifdef CONFIG_COMPAT
208static int io_compat_msg_copy_hdr(struct io_kiocb *req,
209				  struct io_async_msghdr *iomsg,
210				  struct compat_msghdr *msg, int ddir)
211{
212	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
213	struct compat_iovec __user *uiov;
214	int ret;
215
216	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
217		return -EFAULT;
218
219	uiov = compat_ptr(msg->msg_iov);
220	if (req->flags & REQ_F_BUFFER_SELECT) {
221		compat_ssize_t clen;
222
223		iomsg->free_iov = NULL;
224		if (msg->msg_iovlen == 0) {
225			sr->len = 0;
226		} else if (msg->msg_iovlen > 1) {
227			return -EINVAL;
228		} else {
229			if (!access_ok(uiov, sizeof(*uiov)))
230				return -EFAULT;
231			if (__get_user(clen, &uiov->iov_len))
232				return -EFAULT;
233			if (clen < 0)
234				return -EINVAL;
235			sr->len = clen;
236		}
237
238		return 0;
239	}
240
241	iomsg->free_iov = iomsg->fast_iov;
242	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
243				UIO_FASTIOV, &iomsg->free_iov,
244				&iomsg->msg.msg_iter, true);
245	if (unlikely(ret < 0))
246		return ret;
247
248	return 0;
249}
250#endif
251
252static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
253			   struct user_msghdr *msg, int ddir)
254{
255	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
256	int ret;
257
258	if (copy_from_user(msg, sr->umsg, sizeof(*sr->umsg)))
259		return -EFAULT;
260
261	if (req->flags & REQ_F_BUFFER_SELECT) {
262		if (msg->msg_iovlen == 0) {
263			sr->len = iomsg->fast_iov[0].iov_len = 0;
264			iomsg->fast_iov[0].iov_base = NULL;
265			iomsg->free_iov = NULL;
266		} else if (msg->msg_iovlen > 1) {
267			return -EINVAL;
268		} else {
269			if (copy_from_user(iomsg->fast_iov, msg->msg_iov,
270					   sizeof(*msg->msg_iov)))
271				return -EFAULT;
272			sr->len = iomsg->fast_iov[0].iov_len;
273			iomsg->free_iov = NULL;
274		}
275
276		return 0;
277	}
278
279	iomsg->free_iov = iomsg->fast_iov;
280	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV,
281				&iomsg->free_iov, &iomsg->msg.msg_iter, false);
282	if (unlikely(ret < 0))
283		return ret;
284
285	return 0;
286}
287
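/*
 * Copy the user_msghdr (or its compat layout) from userspace and import
 * the iovec for a sendmsg-style request.
 */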
288static int io_sendmsg_copy_hdr(struct io_kiocb *req,
289			       struct io_async_msghdr *iomsg)
290{
291	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
292	struct user_msghdr msg;
293	int ret;
294
295	iomsg->msg.msg_name = &iomsg->addr;
296	iomsg->msg.msg_iter.nr_segs = 0;
297
298#ifdef CONFIG_COMPAT
299	if (unlikely(req->ctx->compat)) {
300		struct compat_msghdr cmsg;
301
302		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
303		if (unlikely(ret))
304			return ret;
305
306		return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
307	}
308#endif
309
310	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
311	if (unlikely(ret))
312		return ret;
313
314	ret = __copy_msghdr(&iomsg->msg, &msg, NULL);
315
316	/* save msg_control as sys_sendmsg() overwrites it */
317	sr->msg_control = iomsg->msg.msg_control_user;
318	return ret;
319}
320
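/*
 * Prep-async for the send variants that carry a destination address:
 * copy the sockaddr into async data up front so going async later
 * doesn't need to re-read it from userspace.
 */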
321int io_send_prep_async(struct io_kiocb *req)
322{
323	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
324	struct io_async_msghdr *io;
325	int ret;
326
327	if (!zc->addr || req_has_async_data(req))
328		return 0;
329	io = io_msg_alloc_async_prep(req);
330	if (!io)
331		return -ENOMEM;
332	ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr);
333	return ret;
334}
335
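/*
 * Address-only counterpart of io_setup_async_msg(): preserve the
 * already-copied destination address in async data and return -EAGAIN
 * to retry the send from async context.
 */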
336static int io_setup_async_addr(struct io_kiocb *req,
337			      struct sockaddr_storage *addr_storage,
338			      unsigned int issue_flags)
339{
340	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
341	struct io_async_msghdr *io;
342
343	if (!sr->addr || req_has_async_data(req))
344		return -EAGAIN;
345	io = io_msg_alloc_async(req, issue_flags);
346	if (!io)
347		return -ENOMEM;
348	memcpy(&io->addr, addr_storage, sizeof(io->addr));
349	return -EAGAIN;
350}
351
352int io_sendmsg_prep_async(struct io_kiocb *req)
353{
354	int ret;
355
356	if (!io_msg_alloc_async_prep(req))
357		return -ENOMEM;
358	ret = io_sendmsg_copy_hdr(req, req->async_data);
359	if (!ret)
360		req->flags |= REQ_F_NEED_CLEANUP;
361	return ret;
362}
363
364void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
365{
366	struct io_async_msghdr *io = req->async_data;
367
368	kfree(io->free_iov);
369}
370
371int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
372{
373	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
374
375	if (req->opcode == IORING_OP_SEND) {
376		if (READ_ONCE(sqe->__pad3[0]))
377			return -EINVAL;
378		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
379		sr->addr_len = READ_ONCE(sqe->addr_len);
380	} else if (sqe->addr2 || sqe->file_index) {
381		return -EINVAL;
382	}
383
384	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
385	sr->len = READ_ONCE(sqe->len);
386	sr->flags = READ_ONCE(sqe->ioprio);
387	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
388		return -EINVAL;
389	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
390	if (sr->msg_flags & MSG_DONTWAIT)
391		req->flags |= REQ_F_NOWAIT;
392
393#ifdef CONFIG_COMPAT
394	if (req->ctx->compat)
395		sr->msg_flags |= MSG_CMSG_COMPAT;
396#endif
397	sr->done_io = 0;
398	return 0;
399}
400
401int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
402{
403	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
404	struct io_async_msghdr iomsg, *kmsg;
405	struct socket *sock;
406	unsigned flags;
407	int min_ret = 0;
408	int ret;
409
410	sock = sock_from_file(req->file);
411	if (unlikely(!sock))
412		return -ENOTSOCK;
413
414	if (req_has_async_data(req)) {
415		kmsg = req->async_data;
416		kmsg->msg.msg_control_user = sr->msg_control;
417	} else {
418		ret = io_sendmsg_copy_hdr(req, &iomsg);
419		if (ret)
420			return ret;
421		kmsg = &iomsg;
422	}
423
424	if (!(req->flags & REQ_F_POLLED) &&
425	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
426		return io_setup_async_msg(req, kmsg, issue_flags);
427
428	flags = sr->msg_flags;
429	if (issue_flags & IO_URING_F_NONBLOCK)
430		flags |= MSG_DONTWAIT;
431	if (flags & MSG_WAITALL)
432		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
433
434	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
435
436	if (ret < min_ret) {
437		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
438			return io_setup_async_msg(req, kmsg, issue_flags);
439		if (ret > 0 && io_net_retry(sock, flags)) {
440			kmsg->msg.msg_controllen = 0;
441			kmsg->msg.msg_control = NULL;
442			sr->done_io += ret;
443			req->flags |= REQ_F_PARTIAL_IO;
444			return io_setup_async_msg(req, kmsg, issue_flags);
445		}
446		if (ret == -ERESTARTSYS)
447			ret = -EINTR;
448		req_set_fail(req);
449	}
450	/* fast path, check for non-NULL to avoid function call */
451	if (kmsg->free_iov)
452		kfree(kmsg->free_iov);
453	req->flags &= ~REQ_F_NEED_CLEANUP;
454	io_netmsg_recycle(req, issue_flags);
455	if (ret >= 0)
456		ret += sr->done_io;
457	else if (sr->done_io)
458		ret = sr->done_io;
459	io_req_set_res(req, ret, 0);
460	return IOU_OK;
461}
462
463int io_send(struct io_kiocb *req, unsigned int issue_flags)
464{
465	struct sockaddr_storage __address;
466	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
467	struct msghdr msg;
468	struct socket *sock;
469	unsigned flags;
470	int min_ret = 0;
471	int ret;
472
473	msg.msg_name = NULL;
474	msg.msg_control = NULL;
475	msg.msg_controllen = 0;
476	msg.msg_namelen = 0;
477	msg.msg_ubuf = NULL;
478
479	if (sr->addr) {
480		if (req_has_async_data(req)) {
481			struct io_async_msghdr *io = req->async_data;
482
483			msg.msg_name = &io->addr;
484		} else {
485			ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address);
486			if (unlikely(ret < 0))
487				return ret;
488			msg.msg_name = (struct sockaddr *)&__address;
489		}
490		msg.msg_namelen = sr->addr_len;
491	}
492
493	if (!(req->flags & REQ_F_POLLED) &&
494	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
495		return io_setup_async_addr(req, &__address, issue_flags);
496
497	sock = sock_from_file(req->file);
498	if (unlikely(!sock))
499		return -ENOTSOCK;
500
501	ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
502	if (unlikely(ret))
503		return ret;
504
505	flags = sr->msg_flags;
506	if (issue_flags & IO_URING_F_NONBLOCK)
507		flags |= MSG_DONTWAIT;
508	if (flags & MSG_WAITALL)
509		min_ret = iov_iter_count(&msg.msg_iter);
510
511	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
512	msg.msg_flags = flags;
513	ret = sock_sendmsg(sock, &msg);
514	if (ret < min_ret) {
515		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
516			return io_setup_async_addr(req, &__address, issue_flags);
517
518		if (ret > 0 && io_net_retry(sock, flags)) {
519			sr->len -= ret;
520			sr->buf += ret;
521			sr->done_io += ret;
522			req->flags |= REQ_F_PARTIAL_IO;
523			return io_setup_async_addr(req, &__address, issue_flags);
524		}
525		if (ret == -ERESTARTSYS)
526			ret = -EINTR;
527		req_set_fail(req);
528	}
529	if (ret >= 0)
530		ret += sr->done_io;
531	else if (sr->done_io)
532		ret = sr->done_io;
533	io_req_set_res(req, ret, 0);
534	return IOU_OK;
535}
536
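/*
 * For multishot recvmsg with provided buffers, record the name and
 * control lengths that will be carved out of each selected buffer,
 * rejecting combinations whose io_uring_recvmsg_out header would
 * overflow an int.
 */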
537static int io_recvmsg_mshot_prep(struct io_kiocb *req,
538				 struct io_async_msghdr *iomsg,
539				 int namelen, size_t controllen)
540{
541	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
542			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
543		int hdr;
544
545		if (unlikely(namelen < 0))
546			return -EOVERFLOW;
547		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
548					namelen, &hdr))
549			return -EOVERFLOW;
550		if (check_add_overflow(hdr, controllen, &hdr))
551			return -EOVERFLOW;
552
553		iomsg->namelen = namelen;
554		iomsg->controllen = controllen;
555		return 0;
556	}
557
558	return 0;
559}
560
561static int io_recvmsg_copy_hdr(struct io_kiocb *req,
562			       struct io_async_msghdr *iomsg)
563{
564	struct user_msghdr msg;
565	int ret;
566
567	iomsg->msg.msg_name = &iomsg->addr;
568	iomsg->msg.msg_iter.nr_segs = 0;
569
570#ifdef CONFIG_COMPAT
571	if (unlikely(req->ctx->compat)) {
572		struct compat_msghdr cmsg;
573
574		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
575		if (unlikely(ret))
576			return ret;
577
578		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
579		if (unlikely(ret))
580			return ret;
581
582		return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
583						cmsg.msg_controllen);
584	}
585#endif
586
587	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
588	if (unlikely(ret))
589		return ret;
590
591	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
592	if (unlikely(ret))
593		return ret;
594
595	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
596					msg.msg_controllen);
597}
598
599int io_recvmsg_prep_async(struct io_kiocb *req)
600{
601	struct io_async_msghdr *iomsg;
602	int ret;
603
604	if (!io_msg_alloc_async_prep(req))
605		return -ENOMEM;
606	iomsg = req->async_data;
607	ret = io_recvmsg_copy_hdr(req, iomsg);
608	if (!ret)
609		req->flags |= REQ_F_NEED_CLEANUP;
610	return ret;
611}
612
613#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
614
615int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
616{
617	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
618
619	if (unlikely(sqe->file_index || sqe->addr2))
620		return -EINVAL;
621
622	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
623	sr->len = READ_ONCE(sqe->len);
624	sr->flags = READ_ONCE(sqe->ioprio);
625	if (sr->flags & ~(RECVMSG_FLAGS))
626		return -EINVAL;
627	sr->msg_flags = READ_ONCE(sqe->msg_flags);
628	if (sr->msg_flags & MSG_DONTWAIT)
629		req->flags |= REQ_F_NOWAIT;
630	if (sr->msg_flags & MSG_ERRQUEUE)
631		req->flags |= REQ_F_CLEAR_POLLIN;
632	if (sr->flags & IORING_RECV_MULTISHOT) {
633		if (!(req->flags & REQ_F_BUFFER_SELECT))
634			return -EINVAL;
635		if (sr->msg_flags & MSG_WAITALL)
636			return -EINVAL;
637		if (req->opcode == IORING_OP_RECV && sr->len)
638			return -EINVAL;
639		req->flags |= REQ_F_APOLL_MULTISHOT;
640		/*
641		 * Store the buffer group for this multishot receive separately,
642		 * because if we end up doing an io-wq based issue that selects a
643		 * buffer, it has to be committed immediately and that will
644		 * clear ->buf_list. This means we lose the link to the buffer
645		 * list, and the eventual buffer put on completion then cannot
646		 * restore it.
647		 */
648		sr->buf_group = req->buf_index;
649	}
650
651#ifdef CONFIG_COMPAT
652	if (req->ctx->compat)
653		sr->msg_flags |= MSG_CMSG_COMPAT;
654#endif
655	sr->done_io = 0;
656	sr->nr_multishot_loops = 0;
657	return 0;
658}
659
660static inline void io_recv_prep_retry(struct io_kiocb *req)
661{
662	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
663
664	sr->done_io = 0;
665	sr->len = 0; /* get from the provided buffer */
666	req->buf_index = sr->buf_group;
667}
668
669/*
670 * Finishes io_recv and io_recvmsg.
671 *
672 * Returns true if it is actually finished, or false if it should run
673 * again (for multishot).
674 */
675static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
676				  struct msghdr *msg, bool mshot_finished,
677				  unsigned issue_flags)
678{
679	unsigned int cflags;
680
681	cflags = io_put_kbuf(req, issue_flags);
682	if (msg->msg_inq && msg->msg_inq != -1)
683		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
684
685	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
686		io_req_set_res(req, *ret, cflags);
687		*ret = IOU_OK;
688		return true;
689	}
690
691	if (mshot_finished)
692		goto finish;
693
694	/*
695	 * Fill CQE for this receive and see if we should keep trying to
696	 * receive from this socket.
697	 */
698	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
699				*ret, cflags | IORING_CQE_F_MORE)) {
700		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
701		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
702
703		io_recv_prep_retry(req);
704		/* Known not-empty or unknown state, retry */
705		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
706			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
707				return false;
708			/* mshot retries exceeded, force a requeue */
709			sr->nr_multishot_loops = 0;
710			mshot_retry_ret = IOU_REQUEUE;
711		}
712		if (issue_flags & IO_URING_F_MULTISHOT)
713			*ret = mshot_retry_ret;
714		else
715			*ret = -EAGAIN;
716		return true;
717	}
718	/* Otherwise stop multishot but use the current result. */
719finish:
720	io_req_set_res(req, *ret, cflags);
721
722	if (issue_flags & IO_URING_F_MULTISHOT)
723		*ret = IOU_STOP_MULTISHOT;
724	else
725		*ret = IOU_OK;
726	return true;
727}
728
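/*
 * Carve the selected buffer up for one multishot recvmsg. The layout
 * handed back to userspace is:
 *
 *	struct io_uring_recvmsg_out | name | control | payload
 *
 * This sets up the control area and advances *buf / *len to the payload
 * part; the header and source address are filled in afterwards by
 * io_recvmsg_multishot().
 */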
729static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
730				     struct io_sr_msg *sr, void __user **buf,
731				     size_t *len)
732{
733	unsigned long ubuf = (unsigned long) *buf;
734	unsigned long hdr;
735
736	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
737		kmsg->controllen;
738	if (*len < hdr)
739		return -EFAULT;
740
741	if (kmsg->controllen) {
742		unsigned long control = ubuf + hdr - kmsg->controllen;
743
744		kmsg->msg.msg_control_user = (void __user *) control;
745		kmsg->msg.msg_controllen = kmsg->controllen;
746	}
747
748	sr->buf = *buf; /* stash for later copy */
749	*buf = (void __user *) (ubuf + hdr);
750	kmsg->payloadlen = *len = *len - hdr;
751	return 0;
752}
753
754struct io_recvmsg_multishot_hdr {
755	struct io_uring_recvmsg_out msg;
756	struct sockaddr_storage addr;
757};
758
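/*
 * Do one receive into the selected buffer, then copy the
 * io_uring_recvmsg_out header and the (possibly truncated) source
 * address in front of the payload. Returns how many bytes of the buffer
 * were consumed: header + name + control + payload.
 */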
759static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
760				struct io_async_msghdr *kmsg,
761				unsigned int flags, bool *finished)
762{
763	int err;
764	int copy_len;
765	struct io_recvmsg_multishot_hdr hdr;
766
767	if (kmsg->namelen)
768		kmsg->msg.msg_name = &hdr.addr;
769	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
770	kmsg->msg.msg_namelen = 0;
771
772	if (sock->file->f_flags & O_NONBLOCK)
773		flags |= MSG_DONTWAIT;
774
775	err = sock_recvmsg(sock, &kmsg->msg, flags);
776	*finished = err <= 0;
777	if (err < 0)
778		return err;
779
780	hdr.msg = (struct io_uring_recvmsg_out) {
781		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
782		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
783	};
784
785	hdr.msg.payloadlen = err;
786	if (err > kmsg->payloadlen)
787		err = kmsg->payloadlen;
788
789	copy_len = sizeof(struct io_uring_recvmsg_out);
790	if (kmsg->msg.msg_namelen > kmsg->namelen)
791		copy_len += kmsg->namelen;
792	else
793		copy_len += kmsg->msg.msg_namelen;
794
795	/*
796	 * "fromlen shall refer to the value before truncation."
797	 *			- POSIX 1003.1g
798	 */
799	hdr.msg.namelen = kmsg->msg.msg_namelen;
800
801	/* ensure that there is no gap between hdr and sockaddr_storage */
802	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
803		     sizeof(struct io_uring_recvmsg_out));
804	if (copy_to_user(io->buf, &hdr, copy_len)) {
805		*finished = true;
806		return -EFAULT;
807	}
808
809	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
810			kmsg->controllen + err;
811}
812
813int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
814{
815	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
816	struct io_async_msghdr iomsg, *kmsg;
817	struct socket *sock;
818	unsigned flags;
819	int ret, min_ret = 0;
820	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
821	bool mshot_finished = true;
822
823	sock = sock_from_file(req->file);
824	if (unlikely(!sock))
825		return -ENOTSOCK;
826
827	if (req_has_async_data(req)) {
828		kmsg = req->async_data;
829	} else {
830		ret = io_recvmsg_copy_hdr(req, &iomsg);
831		if (ret)
832			return ret;
833		kmsg = &iomsg;
834	}
835
836	if (!(req->flags & REQ_F_POLLED) &&
837	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
838		return io_setup_async_msg(req, kmsg, issue_flags);
839
840	if (!io_check_multishot(req, issue_flags))
841		return io_setup_async_msg(req, kmsg, issue_flags);
842
843retry_multishot:
844	if (io_do_buffer_select(req)) {
845		void __user *buf;
846		size_t len = sr->len;
847
848		buf = io_buffer_select(req, &len, issue_flags);
849		if (!buf)
850			return -ENOBUFS;
851
852		if (req->flags & REQ_F_APOLL_MULTISHOT) {
853			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
854			if (ret) {
855				io_kbuf_recycle(req, issue_flags);
856				return ret;
857			}
858		}
859
860		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
861	}
862
863	flags = sr->msg_flags;
864	if (force_nonblock)
865		flags |= MSG_DONTWAIT;
866
867	kmsg->msg.msg_get_inq = 1;
868	kmsg->msg.msg_inq = -1;
869	if (req->flags & REQ_F_APOLL_MULTISHOT) {
870		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
871					   &mshot_finished);
872	} else {
873		/* disable partial retry for recvmsg with cmsg attached */
874		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
875			min_ret = iov_iter_count(&kmsg->msg.msg_iter);
876
877		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
878					 kmsg->uaddr, flags);
879	}
880
881	if (ret < min_ret) {
882		if (ret == -EAGAIN && force_nonblock) {
883			ret = io_setup_async_msg(req, kmsg, issue_flags);
884			if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
885				io_kbuf_recycle(req, issue_flags);
886				return IOU_ISSUE_SKIP_COMPLETE;
887			}
888			return ret;
889		}
890		if (ret > 0 && io_net_retry(sock, flags)) {
891			sr->done_io += ret;
892			req->flags |= REQ_F_PARTIAL_IO;
893			return io_setup_async_msg(req, kmsg, issue_flags);
894		}
895		if (ret == -ERESTARTSYS)
896			ret = -EINTR;
897		req_set_fail(req);
898	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
899		req_set_fail(req);
900	}
901
902	if (ret > 0)
903		ret += sr->done_io;
904	else if (sr->done_io)
905		ret = sr->done_io;
906	else
907		io_kbuf_recycle(req, issue_flags);
908
909	if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
910		goto retry_multishot;
911
912	if (mshot_finished) {
913		/* fast path, check for non-NULL to avoid function call */
914		if (kmsg->free_iov)
915			kfree(kmsg->free_iov);
916		io_netmsg_recycle(req, issue_flags);
917		req->flags &= ~REQ_F_NEED_CLEANUP;
918	} else if (ret == -EAGAIN)
919		return io_setup_async_msg(req, kmsg, issue_flags);
920
921	return ret;
922}
923
924int io_recv(struct io_kiocb *req, unsigned int issue_flags)
925{
926	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
927	struct msghdr msg;
928	struct socket *sock;
929	unsigned flags;
930	int ret, min_ret = 0;
931	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
932	size_t len = sr->len;
933
934	if (!(req->flags & REQ_F_POLLED) &&
935	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
936		return -EAGAIN;
937
938	if (!io_check_multishot(req, issue_flags))
939		return -EAGAIN;
940
941	sock = sock_from_file(req->file);
942	if (unlikely(!sock))
943		return -ENOTSOCK;
944
945	msg.msg_name = NULL;
946	msg.msg_namelen = 0;
947	msg.msg_control = NULL;
948	msg.msg_get_inq = 1;
949	msg.msg_controllen = 0;
950	msg.msg_iocb = NULL;
951	msg.msg_ubuf = NULL;
952
953retry_multishot:
954	if (io_do_buffer_select(req)) {
955		void __user *buf;
956
957		buf = io_buffer_select(req, &len, issue_flags);
958		if (!buf)
959			return -ENOBUFS;
960		sr->buf = buf;
961		sr->len = len;
962	}
963
964	ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
965	if (unlikely(ret))
966		goto out_free;
967
968	msg.msg_inq = -1;
969	msg.msg_flags = 0;
970
971	flags = sr->msg_flags;
972	if (force_nonblock)
973		flags |= MSG_DONTWAIT;
974	if (flags & MSG_WAITALL)
975		min_ret = iov_iter_count(&msg.msg_iter);
976
977	ret = sock_recvmsg(sock, &msg, flags);
978	if (ret < min_ret) {
979		if (ret == -EAGAIN && force_nonblock) {
980			if (issue_flags & IO_URING_F_MULTISHOT) {
981				io_kbuf_recycle(req, issue_flags);
982				return IOU_ISSUE_SKIP_COMPLETE;
983			}
984
985			return -EAGAIN;
986		}
987		if (ret > 0 && io_net_retry(sock, flags)) {
988			sr->len -= ret;
989			sr->buf += ret;
990			sr->done_io += ret;
991			req->flags |= REQ_F_PARTIAL_IO;
992			return -EAGAIN;
993		}
994		if (ret == -ERESTARTSYS)
995			ret = -EINTR;
996		req_set_fail(req);
997	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
998out_free:
999		req_set_fail(req);
1000	}
1001
1002	if (ret > 0)
1003		ret += sr->done_io;
1004	else if (sr->done_io)
1005		ret = sr->done_io;
1006	else
1007		io_kbuf_recycle(req, issue_flags);
1008
1009	if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
1010		goto retry_multishot;
1011
1012	return ret;
1013}
1014
1015void io_send_zc_cleanup(struct io_kiocb *req)
1016{
1017	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1018	struct io_async_msghdr *io;
1019
1020	if (req_has_async_data(req)) {
1021		io = req->async_data;
1022		/* might be ->fast_iov if *msg_copy_hdr failed */
1023		if (io->free_iov != io->fast_iov)
1024			kfree(io->free_iov);
1025	}
1026	if (zc->notif) {
1027		io_notif_flush(zc->notif);
1028		zc->notif = NULL;
1029	}
1030}
1031
1032#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
1033#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)
1034
1035int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1036{
1037	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1038	struct io_ring_ctx *ctx = req->ctx;
1039	struct io_kiocb *notif;
1040
1041	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
1042		return -EINVAL;
1043	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
1044	if (req->flags & REQ_F_CQE_SKIP)
1045		return -EINVAL;
1046
1047	notif = zc->notif = io_alloc_notif(ctx);
1048	if (!notif)
1049		return -ENOMEM;
1050	notif->cqe.user_data = req->cqe.user_data;
1051	notif->cqe.res = 0;
1052	notif->cqe.flags = IORING_CQE_F_NOTIF;
1053	req->flags |= REQ_F_NEED_CLEANUP;
1054
1055	zc->flags = READ_ONCE(sqe->ioprio);
1056	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
1057		if (zc->flags & ~IO_ZC_FLAGS_VALID)
1058			return -EINVAL;
1059		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
1060			io_notif_set_extended(notif);
1061			io_notif_to_data(notif)->zc_report = true;
1062		}
1063	}
1064
1065	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
1066		unsigned idx = READ_ONCE(sqe->buf_index);
1067
1068		if (unlikely(idx >= ctx->nr_user_bufs))
1069			return -EFAULT;
1070		idx = array_index_nospec(idx, ctx->nr_user_bufs);
1071		req->imu = READ_ONCE(ctx->user_bufs[idx]);
1072		io_req_set_rsrc_node(notif, ctx, 0);
1073	}
1074
1075	if (req->opcode == IORING_OP_SEND_ZC) {
1076		if (READ_ONCE(sqe->__pad3[0]))
1077			return -EINVAL;
1078		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1079		zc->addr_len = READ_ONCE(sqe->addr_len);
1080	} else {
1081		if (unlikely(sqe->addr2 || sqe->file_index))
1082			return -EINVAL;
1083		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
1084			return -EINVAL;
1085	}
1086
1087	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
1088	zc->len = READ_ONCE(sqe->len);
1089	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
1090	if (zc->msg_flags & MSG_DONTWAIT)
1091		req->flags |= REQ_F_NOWAIT;
1092
1093	zc->done_io = 0;
1094
1095#ifdef CONFIG_COMPAT
1096	if (req->ctx->compat)
1097		zc->msg_flags |= MSG_CMSG_COMPAT;
1098#endif
1099	return 0;
1100}
1101
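/*
 * ->sg_from_iter callbacks for zerocopy sends: the iovec variant defers
 * to the generic zerocopy helper, while io_sg_from_iter() maps a
 * bvec-backed registered buffer straight into skb frags without copying,
 * accounting the added truesize against the socket.
 */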
1102static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
1103				 struct iov_iter *from, size_t length)
1104{
1105	skb_zcopy_downgrade_managed(skb);
1106	return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
1107}
1108
1109static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
1110			   struct iov_iter *from, size_t length)
1111{
1112	struct skb_shared_info *shinfo = skb_shinfo(skb);
1113	int frag = shinfo->nr_frags;
1114	int ret = 0;
1115	struct bvec_iter bi;
1116	ssize_t copied = 0;
1117	unsigned long truesize = 0;
1118
1119	if (!frag)
1120		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1121	else if (unlikely(!skb_zcopy_managed(skb)))
1122		return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
1123
1124	bi.bi_size = min(from->count, length);
1125	bi.bi_bvec_done = from->iov_offset;
1126	bi.bi_idx = 0;
1127
1128	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1129		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1130
1131		copied += v.bv_len;
1132		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1133		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1134					   v.bv_offset, v.bv_len);
1135		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1136	}
1137	if (bi.bi_size)
1138		ret = -EMSGSIZE;
1139
1140	shinfo->nr_frags = frag;
1141	from->bvec += bi.bi_idx;
1142	from->nr_segs -= bi.bi_idx;
1143	from->count -= copied;
1144	from->iov_offset = bi.bi_bvec_done;
1145
1146	skb->data_len += copied;
1147	skb->len += copied;
1148	skb->truesize += truesize;
1149
1150	if (sk && sk->sk_type == SOCK_STREAM) {
1151		sk_wmem_queued_add(sk, truesize);
1152		if (!skb_zcopy_pure(skb))
1153			sk_mem_charge(sk, truesize);
1154	} else {
1155		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
1156	}
1157	return ret;
1158}
1159
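/*
 * Issue a zero-copy send. The request completes with IORING_CQE_F_MORE
 * set, and a separate IORING_CQE_F_NOTIF CQE is posted once the network
 * stack is done with the buffer.
 */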
1160int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1161{
1162	struct sockaddr_storage __address;
1163	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1164	struct msghdr msg;
1165	struct socket *sock;
1166	unsigned msg_flags;
1167	int ret, min_ret = 0;
1168
1169	sock = sock_from_file(req->file);
1170	if (unlikely(!sock))
1171		return -ENOTSOCK;
1172	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1173		return -EOPNOTSUPP;
1174
1175	msg.msg_name = NULL;
1176	msg.msg_control = NULL;
1177	msg.msg_controllen = 0;
1178	msg.msg_namelen = 0;
1179
1180	if (zc->addr) {
1181		if (req_has_async_data(req)) {
1182			struct io_async_msghdr *io = req->async_data;
1183
1184			msg.msg_name = &io->addr;
1185		} else {
1186			ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address);
1187			if (unlikely(ret < 0))
1188				return ret;
1189			msg.msg_name = (struct sockaddr *)&__address;
1190		}
1191		msg.msg_namelen = zc->addr_len;
1192	}
1193
1194	if (!(req->flags & REQ_F_POLLED) &&
1195	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1196		return io_setup_async_addr(req, &__address, issue_flags);
1197
1198	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
1199		ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
1200					(u64)(uintptr_t)zc->buf, zc->len);
1201		if (unlikely(ret))
1202			return ret;
1203		msg.sg_from_iter = io_sg_from_iter;
1204	} else {
1205		io_notif_set_extended(zc->notif);
1206		ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
1207		if (unlikely(ret))
1208			return ret;
1209		ret = io_notif_account_mem(zc->notif, zc->len);
1210		if (unlikely(ret))
1211			return ret;
1212		msg.sg_from_iter = io_sg_from_iter_iovec;
1213	}
1214
1215	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
1216	if (issue_flags & IO_URING_F_NONBLOCK)
1217		msg_flags |= MSG_DONTWAIT;
1218	if (msg_flags & MSG_WAITALL)
1219		min_ret = iov_iter_count(&msg.msg_iter);
1220	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1221
1222	msg.msg_flags = msg_flags;
1223	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1224	ret = sock_sendmsg(sock, &msg);
1225
1226	if (unlikely(ret < min_ret)) {
1227		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1228			return io_setup_async_addr(req, &__address, issue_flags);
1229
1230		if (ret > 0 && io_net_retry(sock, msg.msg_flags)) {
1231			zc->len -= ret;
1232			zc->buf += ret;
1233			zc->done_io += ret;
1234			req->flags |= REQ_F_PARTIAL_IO;
1235			return io_setup_async_addr(req, &__address, issue_flags);
1236		}
1237		if (ret == -ERESTARTSYS)
1238			ret = -EINTR;
1239		req_set_fail(req);
1240	}
1241
1242	if (ret >= 0)
1243		ret += zc->done_io;
1244	else if (zc->done_io)
1245		ret = zc->done_io;
1246
1247	/*
1248	 * If we're in io-wq we can't rely on tw ordering guarantees; defer
1249	 * flushing the notif to io_send_zc_cleanup().
1250	 */
1251	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1252		io_notif_flush(zc->notif);
1253		req->flags &= ~REQ_F_NEED_CLEANUP;
1254	}
1255	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1256	return IOU_OK;
1257}
1258
1259int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1260{
1261	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1262	struct io_async_msghdr iomsg, *kmsg;
1263	struct socket *sock;
1264	unsigned flags;
1265	int ret, min_ret = 0;
1266
1267	io_notif_set_extended(sr->notif);
1268
1269	sock = sock_from_file(req->file);
1270	if (unlikely(!sock))
1271		return -ENOTSOCK;
1272	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1273		return -EOPNOTSUPP;
1274
1275	if (req_has_async_data(req)) {
1276		kmsg = req->async_data;
1277	} else {
1278		ret = io_sendmsg_copy_hdr(req, &iomsg);
1279		if (ret)
1280			return ret;
1281		kmsg = &iomsg;
1282	}
1283
1284	if (!(req->flags & REQ_F_POLLED) &&
1285	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1286		return io_setup_async_msg(req, kmsg, issue_flags);
1287
1288	flags = sr->msg_flags | MSG_ZEROCOPY;
1289	if (issue_flags & IO_URING_F_NONBLOCK)
1290		flags |= MSG_DONTWAIT;
1291	if (flags & MSG_WAITALL)
1292		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1293
1294	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1295	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1296	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1297
1298	if (unlikely(ret < min_ret)) {
1299		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1300			return io_setup_async_msg(req, kmsg, issue_flags);
1301
1302		if (ret > 0 && io_net_retry(sock, flags)) {
1303			sr->done_io += ret;
1304			req->flags |= REQ_F_PARTIAL_IO;
1305			return io_setup_async_msg(req, kmsg, issue_flags);
1306		}
1307		if (ret == -ERESTARTSYS)
1308			ret = -EINTR;
1309		req_set_fail(req);
1310	}
1311	/* fast path, check for non-NULL to avoid function call */
1312	if (kmsg->free_iov) {
1313		kfree(kmsg->free_iov);
1314		kmsg->free_iov = NULL;
1315	}
1316
1317	io_netmsg_recycle(req, issue_flags);
1318	if (ret >= 0)
1319		ret += sr->done_io;
1320	else if (sr->done_io)
1321		ret = sr->done_io;
1322
1323	/*
1324	 * If we're in io-wq we can't rely on tw ordering guarantees; defer
1325	 * flushing the notif to io_send_zc_cleanup().
1326	 */
1327	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1328		io_notif_flush(sr->notif);
1329		req->flags &= ~REQ_F_NEED_CLEANUP;
1330	}
1331	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1332	return IOU_OK;
1333}
1334
1335void io_sendrecv_fail(struct io_kiocb *req)
1336{
1337	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1338
1339	if (req->flags & REQ_F_PARTIAL_IO)
1340		req->cqe.res = sr->done_io;
1341
1342	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1343	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1344		req->cqe.flags |= IORING_CQE_F_MORE;
1345}
1346
1347int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1348{
1349	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1350	unsigned flags;
1351
1352	if (sqe->len || sqe->buf_index)
1353		return -EINVAL;
1354
1355	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1356	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1357	accept->flags = READ_ONCE(sqe->accept_flags);
1358	accept->nofile = rlimit(RLIMIT_NOFILE);
1359	flags = READ_ONCE(sqe->ioprio);
1360	if (flags & ~IORING_ACCEPT_MULTISHOT)
1361		return -EINVAL;
1362
1363	accept->file_slot = READ_ONCE(sqe->file_index);
1364	if (accept->file_slot) {
1365		if (accept->flags & SOCK_CLOEXEC)
1366			return -EINVAL;
1367		if (flags & IORING_ACCEPT_MULTISHOT &&
1368		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1369			return -EINVAL;
1370	}
1371	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1372		return -EINVAL;
1373	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1374		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1375	if (flags & IORING_ACCEPT_MULTISHOT)
1376		req->flags |= REQ_F_APOLL_MULTISHOT;
1377	return 0;
1378}
1379
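/*
 * Accept a connection, installing the new file either as a normal fd or
 * into a fixed file slot. In multishot mode each accepted connection
 * posts a CQE with IORING_CQE_F_MORE and the request stays active until
 * it fails or the CQE can't be posted.
 */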
1380int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1381{
1382	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1383	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1384	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
1385	bool fixed = !!accept->file_slot;
1386	struct file *file;
1387	int ret, fd;
1388
1389	if (!io_check_multishot(req, issue_flags))
1390		return -EAGAIN;
1391retry:
1392	if (!fixed) {
1393		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1394		if (unlikely(fd < 0))
1395			return fd;
1396	}
1397	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
1398			 accept->flags);
1399	if (IS_ERR(file)) {
1400		if (!fixed)
1401			put_unused_fd(fd);
1402		ret = PTR_ERR(file);
1403		if (ret == -EAGAIN && force_nonblock) {
1404			/*
1405			 * If it's multishot and polled, we don't need to
1406			 * return -EAGAIN to arm the poll infra since that
1407			 * has already been done.
1408			 */
1409			if (issue_flags & IO_URING_F_MULTISHOT)
1410				return IOU_ISSUE_SKIP_COMPLETE;
1411			return ret;
1412		}
1413		if (ret == -ERESTARTSYS)
1414			ret = -EINTR;
1415		req_set_fail(req);
1416	} else if (!fixed) {
1417		fd_install(fd, file);
1418		ret = fd;
1419	} else {
1420		ret = io_fixed_fd_install(req, issue_flags, file,
1421						accept->file_slot);
1422	}
1423
1424	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
1425		io_req_set_res(req, ret, 0);
1426		return IOU_OK;
1427	}
1428
1429	if (ret < 0)
1430		return ret;
1431	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
1432				ret, IORING_CQE_F_MORE))
1433		goto retry;
1434
1435	io_req_set_res(req, ret, 0);
1436	return IOU_STOP_MULTISHOT;
1437}
1438
1439int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1440{
1441	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1442
1443	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1444		return -EINVAL;
1445
1446	sock->domain = READ_ONCE(sqe->fd);
1447	sock->type = READ_ONCE(sqe->off);
1448	sock->protocol = READ_ONCE(sqe->len);
1449	sock->file_slot = READ_ONCE(sqe->file_index);
1450	sock->nofile = rlimit(RLIMIT_NOFILE);
1451
1452	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1453	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1454		return -EINVAL;
1455	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1456		return -EINVAL;
1457	return 0;
1458}
1459
1460int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1461{
1462	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1463	bool fixed = !!sock->file_slot;
1464	struct file *file;
1465	int ret, fd;
1466
1467	if (!fixed) {
1468		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1469		if (unlikely(fd < 0))
1470			return fd;
1471	}
1472	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1473	if (IS_ERR(file)) {
1474		if (!fixed)
1475			put_unused_fd(fd);
1476		ret = PTR_ERR(file);
1477		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1478			return -EAGAIN;
1479		if (ret == -ERESTARTSYS)
1480			ret = -EINTR;
1481		req_set_fail(req);
1482	} else if (!fixed) {
1483		fd_install(fd, file);
1484		ret = fd;
1485	} else {
1486		ret = io_fixed_fd_install(req, issue_flags, file,
1487					    sock->file_slot);
1488	}
1489	io_req_set_res(req, ret, 0);
1490	return IOU_OK;
1491}
1492
1493int io_connect_prep_async(struct io_kiocb *req)
1494{
1495	struct io_async_connect *io = req->async_data;
1496	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1497
1498	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
1499}
1500
1501int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1502{
1503	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1504
1505	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1506		return -EINVAL;
1507
1508	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1509	conn->addr_len =  READ_ONCE(sqe->addr2);
1510	conn->in_progress = conn->seen_econnaborted = false;
1511	return 0;
1512}
1513
1514int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1515{
1516	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1517	struct io_async_connect __io, *io;
1518	unsigned file_flags;
1519	int ret;
1520	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1521
1522	if (req_has_async_data(req)) {
1523		io = req->async_data;
1524	} else {
1525		ret = move_addr_to_kernel(connect->addr,
1526						connect->addr_len,
1527						&__io.address);
1528		if (ret)
1529			goto out;
1530		io = &__io;
1531	}
1532
1533	file_flags = force_nonblock ? O_NONBLOCK : 0;
1534
1535	ret = __sys_connect_file(req->file, &io->address,
1536					connect->addr_len, file_flags);
1537	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1538	    && force_nonblock) {
1539		if (ret == -EINPROGRESS) {
1540			connect->in_progress = true;
1541		} else if (ret == -ECONNABORTED) {
1542			if (connect->seen_econnaborted)
1543				goto out;
1544			connect->seen_econnaborted = true;
1545		}
1546		if (req_has_async_data(req))
1547			return -EAGAIN;
1548		if (io_alloc_async_data(req)) {
1549			ret = -ENOMEM;
1550			goto out;
1551		}
1552		memcpy(req->async_data, &__io, sizeof(__io));
1553		return -EAGAIN;
1554	}
1555	if (connect->in_progress) {
1556		/*
1557		 * At least Bluetooth will return -EBADFD on a re-connect
1558		 * attempt, and it's (supposedly) also valid to get -EISCONN,
1559		 * which means the previous result is good. For both of these,
1560		 * grab the sock_error() and use that for the completion.
1561		 */
1562		if (ret == -EBADFD || ret == -EISCONN)
1563			ret = sock_error(sock_from_file(req->file)->sk);
1564	}
1565	if (ret == -ERESTARTSYS)
1566		ret = -EINTR;
1567out:
1568	if (ret < 0)
1569		req_set_fail(req);
1570	io_req_set_res(req, ret, 0);
1571	return IOU_OK;
1572}
1573
1574void io_netmsg_cache_free(struct io_cache_entry *entry)
1575{
1576	kfree(container_of(entry, struct io_async_msghdr, cache));
1577}
1578#endif
1579