// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
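 *
 * As a minimal sketch of the CQ-side rules above (illustrative only; the
 * names are made up and this is not lifted from liburing), a userspace
 * consumer might do:
 *
 *	head = *cq_ring_head;			// app owns head, plain load is fine
 *	tail = smp_load_acquire(cq_ring_tail);	// pairs with the kernel's release store
 *	while (head != tail) {
 *		process(&cqes[head & cq_ring_mask]);	// CQE reads ordered after the acquire
 *		head++;
 *	}
 *	smp_store_release(cq_ring_head, head);	// orders the CQE loads before the head store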
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"

#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

#define IO_COMPL_BATCH			32
#define IO_REQ_ALLOC_BATCH		8

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

enum {
	IO_EVENTFD_OP_SIGNAL_BIT,
	IO_EVENTFD_OP_FREE_BIT,
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_queue_sqe(struct io_kiocb *req);

struct kmem_cache *req_cachep;

static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;

#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] = {
	{
		.procname	= "io_uring_disabled",
		.data		= &sysctl_io_uring_disabled,
		.maxlen		= sizeof(sysctl_io_uring_disabled),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "io_uring_group",
		.data		= &sysctl_io_uring_group,
		.maxlen		= sizeof(gid_t),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{},
};
#endif

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
	    ctx->submit_state.cqes_count)
		__io_submit_flush_completions(ctx);
}

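/* number of CQEs in the ring, as seen from the kernel's cached CQ tail */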
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

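/* same, but based on the CQ tail value that userspace can actually observe */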
static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{
	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
}

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	io_req_set_res(req, res, 0);
}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

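/*
 * Fallback path for task_work that could not be run in the context of the
 * originating task: run it from a workqueue with the ring mutex held.
 */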
static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	struct io_tw_state ts = { .locked = true, };

	percpu_ref_get(&ctx->refs);
	mutex_lock(&ctx->uring_lock);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
		req->io_task_work.func(req, &ts);
	if (WARN_ON_ONCE(!ts.locked))
		return;
	io_submit_flush_completions(ctx);
	mutex_unlock(&ctx->uring_lock);
	percpu_ref_put(&ctx->refs);
}

static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{
	unsigned hash_buckets = 1U << bits;
	size_t hash_size = hash_buckets * sizeof(table->hbs[0]);

	table->hbs = kmalloc(hash_size, GFP_KERNEL);
	if (!table->hbs)
		return -ENOMEM;

	table->hash_bits = bits;
	init_hash_table(table, hash_buckets);
	return 0;
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	xa_init(&ctx->io_bl_xa);

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread, but
	 * don't keep too many buckets to not overconsume memory.
	 */
	hash_bits = ilog2(p->cq_entries) - 5;
	hash_bits = clamp(hash_bits, 1, 8);
	if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
		goto err;
	if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
		goto err;
	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    0, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_HLIST_HEAD(&ctx->io_buf_list);
	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
			    sizeof(struct io_rsrc_node));
	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct async_poll));
	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_async_msghdr));
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	init_waitqueue_head(&ctx->poll_wq);
	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	init_llist_head(&ctx->work_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

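/*
 * Release any per-request resources flagged in IO_REQ_CLEAN_FLAGS: selected
 * buffers, opcode-private cleanup, armed poll data, creds and async data.
 */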
static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
		io_put_kbuf_comp(req);
		spin_unlock(&req->ctx->completion_lock);
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		const struct io_cold_def *def = &io_cold_defs[req->opcode];

		if (def->cleanup)
			def->cleanup(req);
	}
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
	if (req->flags & REQ_F_CREDS)
		put_cred(req->creds);
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&req->task->io_uring->inflight_tracked);
	}
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

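/*
 * Set up the io-wq work item for a request: grab the submitter's creds if
 * none are attached yet, and decide whether the work is hashed (serialized
 * per inode) or may run on an unbound worker.
 */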
static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(req->file);

	if (req->file && (req->flags & REQ_F_ISREG)) {
		bool should_hash = def->hash_reg_file;

		/* don't serialize this request if the fs doesn't need it */
		if (should_hash && (req->file->f_flags & O_DIRECT) &&
		    (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
			should_hash = false;
		if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

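/*
 * Move requests off the drain (defer) list and queue them for execution once
 * their drain sequence has been satisfied. Called with ->completion_lock held.
 */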
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}


static void io_eventfd_ops(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
	int ops = atomic_xchg(&ev_fd->ops, 0);

	if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);

	/*
	 * IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
	 * ordering in a race, but if the reference count is down to zero we
	 * know we have to free it regardless.
	 */
	if (atomic_dec_and_test(&ev_fd->refs)) {
		eventfd_ctx_put(ev_fd->cq_ev_fd);
		kfree(ev_fd);
	}
}

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd = NULL;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it for both the check
	 * and the eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;
	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
		goto out;

	if (likely(eventfd_signal_allowed())) {
		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
	} else {
		atomic_inc(&ev_fd->refs);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
			call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
		else
			atomic_dec(&ev_fd->refs);
	}

out:
	rcu_read_unlock();
}

static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
	bool skip;

	spin_lock(&ctx->completion_lock);

	/*
	 * Eventfd should only get triggered when at least one event has been
	 * posted. Some applications rely on the eventfd notification count
	 * only changing IFF a new CQE has been added to the CQ ring. There's
	 * no dependency on a 1:1 relationship between how many times this
	 * function is called (and hence the eventfd count) and the number of
	 * CQEs posted to the CQ ring.
	 */
	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);
	if (skip)
		return;

	io_eventfd_signal(ctx);
}

void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->poll_activated)
		io_poll_wq_wake(ctx);
	if (ctx->off_timeout_used)
		io_flush_timeouts(ctx);
	if (ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		io_queue_deferred(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_flush_signal(ctx);
}

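/*
 * For rings where CQE posting is already serialized (lockless_cq, e.g.
 * DEFER_TASKRUN single-task completions or IOPOLL under uring_lock), the
 * completion_lock can be skipped.
 */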
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
	if (!ctx->lockless_cq)
		spin_lock(&ctx->completion_lock);
}

static inline void io_cq_lock(struct io_ring_ctx *ctx)
	__acquires(ctx->completion_lock)
{
	spin_lock(&ctx->completion_lock);
}

static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	if (!ctx->task_complete) {
		if (!ctx->lockless_cq)
			spin_unlock(&ctx->completion_lock);
		/* IOPOLL rings only need to wake up if it's also SQPOLL */
		if (!ctx->syscall_iopoll)
			io_cqring_wake(ctx);
	}
	io_commit_cqring_flush(ctx);
}

static void io_cq_unlock_post(struct io_ring_ctx *ctx)
	__releases(ctx->completion_lock)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_wake(ctx);
	io_commit_cqring_flush(ctx);
}

/* Drop all pending overflow CQEs without flushing them to the CQ ring */
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
	struct io_overflow_cqe *ocqe;
	LIST_HEAD(list);

	spin_lock(&ctx->completion_lock);
	list_splice_init(&ctx->cq_overflow_list, &list);
	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
	spin_unlock(&ctx->completion_lock);

	while (!list_empty(&list)) {
		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
		list_del(&ocqe->list);
		kfree(ocqe);
	}
}

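/*
 * Copy as many overflowed CQEs as will fit back into the CQ ring, and clear
 * the overflow state once the list has been fully drained.
 */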
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	size_t cqe_size = sizeof(struct io_uring_cqe);

	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return;

	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;

	io_cq_lock(ctx);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe;
		struct io_overflow_cqe *ocqe;

		if (!io_get_cqe_overflow(ctx, &cqe, true))
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		memcpy(cqe, &ocqe->cqe, cqe_size);
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	if (list_empty(&ctx->cq_overflow_list)) {
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}
	io_cq_unlock_post(ctx);
}

static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
	/* iopoll syncs against uring_lock, not completion_lock */
	if (ctx->flags & IORING_SETUP_IOPOLL)
		mutex_lock(&ctx->uring_lock);
	__io_cqring_overflow_flush(ctx);
	if (ctx->flags & IORING_SETUP_IOPOLL)
		mutex_unlock(&ctx->uring_lock);
}

static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		io_cqring_do_overflow_flush(ctx);
}

/* can be called by any task */
static void io_put_task_remote(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;

	percpu_counter_sub(&tctx->inflight, 1);
	if (unlikely(atomic_read(&tctx->in_cancel)))
		wake_up(&tctx->wait);
	put_task_struct(task);
}

/* used by a task to put its own references */
static void io_put_task_local(struct task_struct *task)
{
	task->io_uring->cached_refs++;
}

/* must be called shortly after putting a request */
static inline void io_put_task(struct task_struct *task)
{
	if (likely(task == current))
		io_put_task_local(task);
	else
		io_put_task_remote(task);
}

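/*
 * Top up the task's cached block of inflight/task references so that request
 * issue doesn't have to take a reference per request.
 */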
void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}

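/*
 * The CQ ring is full (or posting is disallowed right now), so stash the CQE
 * on the overflow list and flag the ring as overflown. On allocation failure
 * the completion is dropped and only accounted in the cq_overflow counter.
 */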
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags, u64 extra1, u64 extra2)
{
	struct io_overflow_cqe *ocqe;
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);

	lockdep_assert_held(&ctx->completion_lock);

	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);

	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
		io_account_cq_overflow(ctx);
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	if (is_cqe32) {
		ocqe->cqe.big_cqe[0] = extra1;
		ocqe->cqe.big_cqe[1] = extra2;
	}
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}

void io_req_cqe_overflow(struct io_kiocb *req)
{
	io_cqring_event_overflow(req->ctx, req->cqe.user_data,
				req->cqe.res, req->cqe.flags,
				req->big_cqe.extra1, req->big_cqe.extra2);
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

/*
 * writes to the cq entry need to come after reading head; the
 * control dependency is enough as we're using WRITE_ONCE to
 * fill the cq entry
 */
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{
	struct io_rings *rings = ctx->rings;
	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
	unsigned int free, queued, len;

	/*
	 * Posting into the CQ when there are pending overflowed CQEs may break
	 * ordering guarantees, which will affect links, F_MORE users and more.
	 * Force overflow the completion.
	 */
	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
		return false;

	/* userspace may cheat modifying the tail, be safe and do min */
	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
	free = ctx->cq_entries - queued;
	/* we need a contiguous range, limit based on the current array offset */
	len = min(free, ctx->cq_entries - off);
	if (!len)
		return false;

	if (ctx->flags & IORING_SETUP_CQE32) {
		off <<= 1;
		len <<= 1;
	}

	ctx->cqe_cached = &rings->cqes[off];
	ctx->cqe_sentinel = ctx->cqe_cached + len;
	return true;
}

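/*
 * Post an auxiliary CQE (one not tied to a request) directly into the CQ
 * ring. Returns false if no CQE could be cached, i.e. the ring is full.
 */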
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags)
{
	struct io_uring_cqe *cqe;

	ctx->cq_extra++;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	if (likely(io_get_cqe(ctx, &cqe))) {
		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);

		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);

		if (ctx->flags & IORING_SETUP_CQE32) {
			WRITE_ONCE(cqe->big_cqe[0], 0);
			WRITE_ONCE(cqe->big_cqe[1], 0);
		}
		return true;
	}
	return false;
}

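/*
 * Flush the CQEs that were deferred into the submit state's completion_cqes
 * array; anything that no longer fits in the CQ ring goes to the overflow
 * list instead.
 */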
static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	unsigned int i;

	lockdep_assert_held(&ctx->uring_lock);
	for (i = 0; i < state->cqes_count; i++) {
		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];

		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
			if (ctx->lockless_cq) {
				spin_lock(&ctx->completion_lock);
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
				spin_unlock(&ctx->completion_lock);
			} else {
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
			}
		}
	}
	state->cqes_count = 0;
}

static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
			      bool allow_overflow)
{
	bool filled;

	io_cq_lock(ctx);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
	if (!filled && allow_overflow)
		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

	io_cq_unlock_post(ctx);
	return filled;
}

bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
	return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
}

/*
 * A helper for multishot requests posting additional CQEs.
 * Should only be used from a task_work including IO_URING_F_MULTISHOT.
 */
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;
	u64 user_data = req->cqe.user_data;
	struct io_uring_cqe *cqe;

	if (!defer)
		return __io_post_aux_cqe(ctx, user_data, res, cflags, false);

	lockdep_assert_held(&ctx->uring_lock);

	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
		__io_cq_lock(ctx);
		__io_flush_post_cqes(ctx);
		/* no need to flush - flush is deferred */
		__io_cq_unlock_post(ctx);
	}

	/*
	 * For deferred completions this is not as strict as it is otherwise,
	 * however its main job is to prevent unbounded posted completions,
	 * and in that it works just as well.
	 */
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		return false;

	cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
	cqe->user_data = user_data;
	cqe->res = res;
	cqe->flags = cflags;
	return true;
}

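/*
 * Post the request's CQE (unless it opted out via REQ_F_CQE_SKIP) and, if we
 * hold the last reference, disarm/queue any linked requests and recycle the
 * request onto the locked free list.
 */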
static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *rsrc_node = NULL;

	io_cq_lock(ctx);
	if (!(req->flags & REQ_F_CQE_SKIP)) {
		if (!io_fill_cqe_req(ctx, req))
			io_req_cqe_overflow(req);
	}

	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_put_kbuf_comp(req);
		if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
			io_clean_op(req);
		io_put_file(req);

		rsrc_node = req->rsrc_node;
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_task_remote(req->task);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
	io_cq_unlock_post(ctx);

	if (rsrc_node) {
		io_ring_submit_lock(ctx, issue_flags);
		io_put_rsrc_node(ctx, rsrc_node);
		io_ring_submit_unlock(ctx, issue_flags);
	}
}

void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	if (req->ctx->task_complete && req->ctx->submitter_task != current) {
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
		   !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
		__io_req_complete_post(req, issue_flags);
	} else {
		struct io_ring_ctx *ctx = req->ctx;

		mutex_lock(&ctx->uring_lock);
		__io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
		mutex_unlock(&ctx->uring_lock);
	}
}

void io_req_defer_failed(struct io_kiocb *req, s32 res)
	__must_hold(&ctx->uring_lock)
{
	const struct io_cold_def *def = &io_cold_defs[req->opcode];

	lockdep_assert_held(&req->ctx->uring_lock);

	req_set_fail(req);
	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
	if (def->fail)
		def->fail(req);
	io_req_complete_defer(req);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	memset(&req->cqe, 0, sizeof(req->cqe));
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	int ret, i;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
		if (!io_req_cache_empty(ctx))
			return true;
	}

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
	 */
	if (unlikely(ret <= 0)) {
		reqs[0] = kmem_cache_alloc(req_cachep, gfp);
		if (!reqs[0])
			return false;
		ret = 1;
	}

	percpu_ref_get_many(&ctx->refs, ret);
	for (i = 0; i < ret; i++) {
		struct io_kiocb *req = reqs[i];

		io_preinit_req(req, ctx);
		io_req_add_to_cache(req, ctx);
	}
	return true;
}

111862306a36Sopenharmony_ci__cold void io_free_req(struct io_kiocb *req)
111962306a36Sopenharmony_ci{
112062306a36Sopenharmony_ci	/* refs were already put, restore them for io_req_task_complete() */
112162306a36Sopenharmony_ci	req->flags &= ~REQ_F_REFCOUNT;
112262306a36Sopenharmony_ci	/* we only want to free it, don't post CQEs */
112362306a36Sopenharmony_ci	req->flags |= REQ_F_CQE_SKIP;
112462306a36Sopenharmony_ci	req->io_task_work.func = io_req_task_complete;
112562306a36Sopenharmony_ci	io_req_task_work_add(req);
112662306a36Sopenharmony_ci}
112762306a36Sopenharmony_ci
112862306a36Sopenharmony_cistatic void __io_req_find_next_prep(struct io_kiocb *req)
112962306a36Sopenharmony_ci{
113062306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
113162306a36Sopenharmony_ci
113262306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
113362306a36Sopenharmony_ci	io_disarm_next(req);
113462306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
113562306a36Sopenharmony_ci}
113662306a36Sopenharmony_ci
113762306a36Sopenharmony_cistatic inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
113862306a36Sopenharmony_ci{
113962306a36Sopenharmony_ci	struct io_kiocb *nxt;
114062306a36Sopenharmony_ci
114162306a36Sopenharmony_ci	/*
114262306a36Sopenharmony_ci	 * If LINK is set, we have dependent requests in this chain. If we
114362306a36Sopenharmony_ci	 * didn't fail this request, queue the first one up, moving any other
114462306a36Sopenharmony_ci	 * dependencies to the next request. In case of failure, fail the rest
114562306a36Sopenharmony_ci	 * of the chain.
114662306a36Sopenharmony_ci	 */
114762306a36Sopenharmony_ci	if (unlikely(req->flags & IO_DISARM_MASK))
114862306a36Sopenharmony_ci		__io_req_find_next_prep(req);
114962306a36Sopenharmony_ci	nxt = req->link;
115062306a36Sopenharmony_ci	req->link = NULL;
115162306a36Sopenharmony_ci	return nxt;
115262306a36Sopenharmony_ci}
115362306a36Sopenharmony_ci
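/*
 * Done with a ctx while walking task_work: flush any completions deferred
 * under ->uring_lock, drop the lock, and put the ctx reference that was
 * taken when we started batching work for this ring.
 */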
115462306a36Sopenharmony_cistatic void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
115562306a36Sopenharmony_ci{
115662306a36Sopenharmony_ci	if (!ctx)
115762306a36Sopenharmony_ci		return;
115862306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
115962306a36Sopenharmony_ci		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
116062306a36Sopenharmony_ci	if (ts->locked) {
116162306a36Sopenharmony_ci		io_submit_flush_completions(ctx);
116262306a36Sopenharmony_ci		mutex_unlock(&ctx->uring_lock);
116362306a36Sopenharmony_ci		ts->locked = false;
116462306a36Sopenharmony_ci	}
116562306a36Sopenharmony_ci	percpu_ref_put(&ctx->refs);
116662306a36Sopenharmony_ci}
116762306a36Sopenharmony_ci
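/*
 * Run a list of task_work items, batching by ring: work for the same ctx
 * is flushed together, ->uring_lock is only trylock'ed, and everything is
 * flushed and dropped whenever the scheduler wants the CPU back.
 */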
116862306a36Sopenharmony_cistatic unsigned int handle_tw_list(struct llist_node *node,
116962306a36Sopenharmony_ci				   struct io_ring_ctx **ctx,
117062306a36Sopenharmony_ci				   struct io_tw_state *ts)
117162306a36Sopenharmony_ci{
117262306a36Sopenharmony_ci	unsigned int count = 0;
117362306a36Sopenharmony_ci
117462306a36Sopenharmony_ci	do {
117562306a36Sopenharmony_ci		struct llist_node *next = node->next;
117662306a36Sopenharmony_ci		struct io_kiocb *req = container_of(node, struct io_kiocb,
117762306a36Sopenharmony_ci						    io_task_work.node);
117862306a36Sopenharmony_ci
117962306a36Sopenharmony_ci		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
118062306a36Sopenharmony_ci
118162306a36Sopenharmony_ci		if (req->ctx != *ctx) {
118262306a36Sopenharmony_ci			ctx_flush_and_put(*ctx, ts);
118362306a36Sopenharmony_ci			*ctx = req->ctx;
118462306a36Sopenharmony_ci			/* if not contended, grab the lock to improve batching */
118562306a36Sopenharmony_ci			ts->locked = mutex_trylock(&(*ctx)->uring_lock);
118662306a36Sopenharmony_ci			percpu_ref_get(&(*ctx)->refs);
118762306a36Sopenharmony_ci		}
118862306a36Sopenharmony_ci		INDIRECT_CALL_2(req->io_task_work.func,
118962306a36Sopenharmony_ci				io_poll_task_func, io_req_rw_complete,
119062306a36Sopenharmony_ci				req, ts);
119162306a36Sopenharmony_ci		node = next;
119262306a36Sopenharmony_ci		count++;
119362306a36Sopenharmony_ci		if (unlikely(need_resched())) {
119462306a36Sopenharmony_ci			ctx_flush_and_put(*ctx, ts);
119562306a36Sopenharmony_ci			*ctx = NULL;
119662306a36Sopenharmony_ci			cond_resched();
119762306a36Sopenharmony_ci		}
119862306a36Sopenharmony_ci	} while (node);
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci	return count;
120162306a36Sopenharmony_ci}
120262306a36Sopenharmony_ci
120362306a36Sopenharmony_ci/**
120462306a36Sopenharmony_ci * io_llist_xchg - swap all entries in a lock-less list
120562306a36Sopenharmony_ci * @head:	the head of the lock-less list whose entries are taken over
120662306a36Sopenharmony_ci * @new:	new entry to install as the head of the list
120762306a36Sopenharmony_ci *
120862306a36Sopenharmony_ci * If the list is empty, return NULL; otherwise, return a pointer to the first
120962306a36Sopenharmony_ci * entry. The entries returned are ordered from the newest to the oldest added one.
121062306a36Sopenharmony_ci */
121162306a36Sopenharmony_cistatic inline struct llist_node *io_llist_xchg(struct llist_head *head,
121262306a36Sopenharmony_ci					       struct llist_node *new)
121362306a36Sopenharmony_ci{
121462306a36Sopenharmony_ci	return xchg(&head->first, new);
121562306a36Sopenharmony_ci}
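/*
 * io_llist_xchg() hands entries back newest-first; callers wanting
 * submission (FIFO) order pair it with llist_reverse_order(), as in
 * __io_run_local_work() below:
 *
 *	node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
 */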
121662306a36Sopenharmony_ci
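/*
 * task_work could not (or must not) run in task context, e.g. the task is
 * exiting. Shunt every pending item to its ring's fallback workqueue; with
 * @sync set, also wait for each ring's fallback work to complete while
 * holding a ctx reference.
 */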
121762306a36Sopenharmony_cistatic __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
121862306a36Sopenharmony_ci{
121962306a36Sopenharmony_ci	struct llist_node *node = llist_del_all(&tctx->task_list);
122062306a36Sopenharmony_ci	struct io_ring_ctx *last_ctx = NULL;
122162306a36Sopenharmony_ci	struct io_kiocb *req;
122262306a36Sopenharmony_ci
122362306a36Sopenharmony_ci	while (node) {
122462306a36Sopenharmony_ci		req = container_of(node, struct io_kiocb, io_task_work.node);
122562306a36Sopenharmony_ci		node = node->next;
122662306a36Sopenharmony_ci		if (sync && last_ctx != req->ctx) {
122762306a36Sopenharmony_ci			if (last_ctx) {
122862306a36Sopenharmony_ci				flush_delayed_work(&last_ctx->fallback_work);
122962306a36Sopenharmony_ci				percpu_ref_put(&last_ctx->refs);
123062306a36Sopenharmony_ci			}
123162306a36Sopenharmony_ci			last_ctx = req->ctx;
123262306a36Sopenharmony_ci			percpu_ref_get(&last_ctx->refs);
123362306a36Sopenharmony_ci		}
123462306a36Sopenharmony_ci		if (llist_add(&req->io_task_work.node,
123562306a36Sopenharmony_ci			      &req->ctx->fallback_llist))
123662306a36Sopenharmony_ci			schedule_delayed_work(&req->ctx->fallback_work, 1);
123762306a36Sopenharmony_ci	}
123862306a36Sopenharmony_ci
123962306a36Sopenharmony_ci	if (last_ctx) {
124062306a36Sopenharmony_ci		flush_delayed_work(&last_ctx->fallback_work);
124162306a36Sopenharmony_ci		percpu_ref_put(&last_ctx->refs);
124262306a36Sopenharmony_ci	}
124362306a36Sopenharmony_ci}
124462306a36Sopenharmony_ci
124562306a36Sopenharmony_civoid tctx_task_work(struct callback_head *cb)
124662306a36Sopenharmony_ci{
124762306a36Sopenharmony_ci	struct io_tw_state ts = {};
124862306a36Sopenharmony_ci	struct io_ring_ctx *ctx = NULL;
124962306a36Sopenharmony_ci	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
125062306a36Sopenharmony_ci						  task_work);
125162306a36Sopenharmony_ci	struct llist_node *node;
125262306a36Sopenharmony_ci	unsigned int count = 0;
125362306a36Sopenharmony_ci
125462306a36Sopenharmony_ci	if (unlikely(current->flags & PF_EXITING)) {
125562306a36Sopenharmony_ci		io_fallback_tw(tctx, true);
125662306a36Sopenharmony_ci		return;
125762306a36Sopenharmony_ci	}
125862306a36Sopenharmony_ci
125962306a36Sopenharmony_ci	node = llist_del_all(&tctx->task_list);
126062306a36Sopenharmony_ci	if (node)
126162306a36Sopenharmony_ci		count = handle_tw_list(node, &ctx, &ts);
126262306a36Sopenharmony_ci
126362306a36Sopenharmony_ci	ctx_flush_and_put(ctx, &ts);
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_ci	/* relaxed read is enough as only the task itself sets ->in_cancel */
126662306a36Sopenharmony_ci	if (unlikely(atomic_read(&tctx->in_cancel)))
126762306a36Sopenharmony_ci		io_uring_drop_tctx_refs(current);
126862306a36Sopenharmony_ci
126962306a36Sopenharmony_ci	trace_io_uring_task_work_run(tctx, count, 1);
127062306a36Sopenharmony_ci}
127162306a36Sopenharmony_ci
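/*
 * Queue a request on the ring-local work list (IORING_SETUP_DEFER_TASKRUN).
 * Each queued request records the number of pending items in ->nr_tw so
 * that lazy wakeups (IOU_F_TWQ_LAZY_WAKE) only wake the submitter task once
 * enough work has accumulated for the current waiter.
 */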
127262306a36Sopenharmony_cistatic inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
127362306a36Sopenharmony_ci{
127462306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
127562306a36Sopenharmony_ci	unsigned nr_wait, nr_tw, nr_tw_prev;
127662306a36Sopenharmony_ci	struct llist_node *first;
127762306a36Sopenharmony_ci
127862306a36Sopenharmony_ci	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
127962306a36Sopenharmony_ci		flags &= ~IOU_F_TWQ_LAZY_WAKE;
128062306a36Sopenharmony_ci
128162306a36Sopenharmony_ci	first = READ_ONCE(ctx->work_llist.first);
128262306a36Sopenharmony_ci	do {
128362306a36Sopenharmony_ci		nr_tw_prev = 0;
128462306a36Sopenharmony_ci		if (first) {
128562306a36Sopenharmony_ci			struct io_kiocb *first_req = container_of(first,
128662306a36Sopenharmony_ci							struct io_kiocb,
128762306a36Sopenharmony_ci							io_task_work.node);
128862306a36Sopenharmony_ci			/*
128962306a36Sopenharmony_ci			 * The first request might be executed at any moment;
129062306a36Sopenharmony_ci			 * rely on SLAB_TYPESAFE_BY_RCU to keep it alive.
129162306a36Sopenharmony_ci			 */
129262306a36Sopenharmony_ci			nr_tw_prev = READ_ONCE(first_req->nr_tw);
129362306a36Sopenharmony_ci		}
129462306a36Sopenharmony_ci		nr_tw = nr_tw_prev + 1;
129562306a36Sopenharmony_ci		/* Large enough to fail the nr_wait comparison below */
129662306a36Sopenharmony_ci		if (!(flags & IOU_F_TWQ_LAZY_WAKE))
129762306a36Sopenharmony_ci			nr_tw = INT_MAX;
129862306a36Sopenharmony_ci
129962306a36Sopenharmony_ci		req->nr_tw = nr_tw;
130062306a36Sopenharmony_ci		req->io_task_work.node.next = first;
130162306a36Sopenharmony_ci	} while (!try_cmpxchg(&ctx->work_llist.first, &first,
130262306a36Sopenharmony_ci			      &req->io_task_work.node));
130362306a36Sopenharmony_ci
130462306a36Sopenharmony_ci	if (!first) {
130562306a36Sopenharmony_ci		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
130662306a36Sopenharmony_ci			atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
130762306a36Sopenharmony_ci		if (ctx->has_evfd)
130862306a36Sopenharmony_ci			io_eventfd_signal(ctx);
130962306a36Sopenharmony_ci	}
131062306a36Sopenharmony_ci
131162306a36Sopenharmony_ci	nr_wait = atomic_read(&ctx->cq_wait_nr);
131262306a36Sopenharmony_ci	/* no one is waiting */
131362306a36Sopenharmony_ci	if (!nr_wait)
131462306a36Sopenharmony_ci		return;
131562306a36Sopenharmony_ci	/* either not enough or the previous add has already woken it up */
131662306a36Sopenharmony_ci	if (nr_wait > nr_tw || nr_tw_prev >= nr_wait)
131762306a36Sopenharmony_ci		return;
131862306a36Sopenharmony_ci	/* pairs with set_current_state() in io_cqring_wait() */
131962306a36Sopenharmony_ci	smp_mb__after_atomic();
132062306a36Sopenharmony_ci	wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
132162306a36Sopenharmony_ci}
132262306a36Sopenharmony_ci
132362306a36Sopenharmony_cistatic void io_req_normal_work_add(struct io_kiocb *req)
132462306a36Sopenharmony_ci{
132562306a36Sopenharmony_ci	struct io_uring_task *tctx = req->task->io_uring;
132662306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
132762306a36Sopenharmony_ci
132862306a36Sopenharmony_ci	/* task_work already pending, we're done */
132962306a36Sopenharmony_ci	if (!llist_add(&req->io_task_work.node, &tctx->task_list))
133062306a36Sopenharmony_ci		return;
133162306a36Sopenharmony_ci
133262306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
133362306a36Sopenharmony_ci		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
133462306a36Sopenharmony_ci
133562306a36Sopenharmony_ci	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
133662306a36Sopenharmony_ci		return;
133762306a36Sopenharmony_ci
133862306a36Sopenharmony_ci	io_fallback_tw(tctx, false);
133962306a36Sopenharmony_ci}
134062306a36Sopenharmony_ci
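/*
 * DEFER_TASKRUN rings add work to the ring-local list above (under RCU, see
 * the SLAB_TYPESAFE_BY_RCU note); all other rings go through regular
 * task_work on the request's task.
 */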
134162306a36Sopenharmony_civoid __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
134262306a36Sopenharmony_ci{
134362306a36Sopenharmony_ci	if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
134462306a36Sopenharmony_ci		rcu_read_lock();
134562306a36Sopenharmony_ci		io_req_local_work_add(req, flags);
134662306a36Sopenharmony_ci		rcu_read_unlock();
134762306a36Sopenharmony_ci	} else {
134862306a36Sopenharmony_ci		io_req_normal_work_add(req);
134962306a36Sopenharmony_ci	}
135062306a36Sopenharmony_ci}
135162306a36Sopenharmony_ci
135262306a36Sopenharmony_cistatic void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
135362306a36Sopenharmony_ci{
135462306a36Sopenharmony_ci	struct llist_node *node;
135562306a36Sopenharmony_ci
135662306a36Sopenharmony_ci	node = llist_del_all(&ctx->work_llist);
135762306a36Sopenharmony_ci	while (node) {
135862306a36Sopenharmony_ci		struct io_kiocb *req = container_of(node, struct io_kiocb,
135962306a36Sopenharmony_ci						    io_task_work.node);
136062306a36Sopenharmony_ci
136162306a36Sopenharmony_ci		node = node->next;
136262306a36Sopenharmony_ci		io_req_normal_work_add(req);
136362306a36Sopenharmony_ci	}
136462306a36Sopenharmony_ci}
136562306a36Sopenharmony_ci
136662306a36Sopenharmony_cistatic bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
136762306a36Sopenharmony_ci				       int min_events)
136862306a36Sopenharmony_ci{
136962306a36Sopenharmony_ci	if (llist_empty(&ctx->work_llist))
137062306a36Sopenharmony_ci		return false;
137162306a36Sopenharmony_ci	if (events < min_events)
137262306a36Sopenharmony_ci		return true;
137362306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
137462306a36Sopenharmony_ci		atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
137562306a36Sopenharmony_ci	return false;
137662306a36Sopenharmony_ci}
137762306a36Sopenharmony_ci
137862306a36Sopenharmony_cistatic int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
137962306a36Sopenharmony_ci			       int min_events)
138062306a36Sopenharmony_ci{
138162306a36Sopenharmony_ci	struct llist_node *node;
138262306a36Sopenharmony_ci	unsigned int loops = 0;
138362306a36Sopenharmony_ci	int ret = 0;
138462306a36Sopenharmony_ci
138562306a36Sopenharmony_ci	if (WARN_ON_ONCE(ctx->submitter_task != current))
138662306a36Sopenharmony_ci		return -EEXIST;
138762306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
138862306a36Sopenharmony_ci		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
138962306a36Sopenharmony_ciagain:
139062306a36Sopenharmony_ci	/*
139162306a36Sopenharmony_ci	 * llists are in reverse (LIFO) order; flip the list back the right
139262306a36Sopenharmony_ci	 * way before running the pending items.
139362306a36Sopenharmony_ci	 */
139462306a36Sopenharmony_ci	node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL));
139562306a36Sopenharmony_ci	while (node) {
139662306a36Sopenharmony_ci		struct llist_node *next = node->next;
139762306a36Sopenharmony_ci		struct io_kiocb *req = container_of(node, struct io_kiocb,
139862306a36Sopenharmony_ci						    io_task_work.node);
139962306a36Sopenharmony_ci		prefetch(container_of(next, struct io_kiocb, io_task_work.node));
140062306a36Sopenharmony_ci		INDIRECT_CALL_2(req->io_task_work.func,
140162306a36Sopenharmony_ci				io_poll_task_func, io_req_rw_complete,
140262306a36Sopenharmony_ci				req, ts);
140362306a36Sopenharmony_ci		ret++;
140462306a36Sopenharmony_ci		node = next;
140562306a36Sopenharmony_ci	}
140662306a36Sopenharmony_ci	loops++;
140762306a36Sopenharmony_ci
140862306a36Sopenharmony_ci	if (io_run_local_work_continue(ctx, ret, min_events))
140962306a36Sopenharmony_ci		goto again;
141062306a36Sopenharmony_ci	if (ts->locked) {
141162306a36Sopenharmony_ci		io_submit_flush_completions(ctx);
141262306a36Sopenharmony_ci		if (io_run_local_work_continue(ctx, ret, min_events))
141362306a36Sopenharmony_ci			goto again;
141462306a36Sopenharmony_ci	}
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci	trace_io_uring_local_work_run(ctx, ret, loops);
141762306a36Sopenharmony_ci	return ret;
141862306a36Sopenharmony_ci}
141962306a36Sopenharmony_ci
142062306a36Sopenharmony_cistatic inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
142162306a36Sopenharmony_ci					   int min_events)
142262306a36Sopenharmony_ci{
142362306a36Sopenharmony_ci	struct io_tw_state ts = { .locked = true, };
142462306a36Sopenharmony_ci	int ret;
142562306a36Sopenharmony_ci
142662306a36Sopenharmony_ci	if (llist_empty(&ctx->work_llist))
142762306a36Sopenharmony_ci		return 0;
142862306a36Sopenharmony_ci
142962306a36Sopenharmony_ci	ret = __io_run_local_work(ctx, &ts, min_events);
143062306a36Sopenharmony_ci	/* shouldn't happen! */
143162306a36Sopenharmony_ci	if (WARN_ON_ONCE(!ts.locked))
143262306a36Sopenharmony_ci		mutex_lock(&ctx->uring_lock);
143362306a36Sopenharmony_ci	return ret;
143462306a36Sopenharmony_ci}
143562306a36Sopenharmony_ci
143662306a36Sopenharmony_cistatic int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
143762306a36Sopenharmony_ci{
143862306a36Sopenharmony_ci	struct io_tw_state ts = {};
143962306a36Sopenharmony_ci	int ret;
144062306a36Sopenharmony_ci
144162306a36Sopenharmony_ci	ts.locked = mutex_trylock(&ctx->uring_lock);
144262306a36Sopenharmony_ci	ret = __io_run_local_work(ctx, &ts, min_events);
144362306a36Sopenharmony_ci	if (ts.locked)
144462306a36Sopenharmony_ci		mutex_unlock(&ctx->uring_lock);
144562306a36Sopenharmony_ci
144662306a36Sopenharmony_ci	return ret;
144762306a36Sopenharmony_ci}
144862306a36Sopenharmony_ci
144962306a36Sopenharmony_cistatic void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
145062306a36Sopenharmony_ci{
145162306a36Sopenharmony_ci	io_tw_lock(req->ctx, ts);
145262306a36Sopenharmony_ci	io_req_defer_failed(req, req->cqe.res);
145362306a36Sopenharmony_ci}
145462306a36Sopenharmony_ci
145562306a36Sopenharmony_civoid io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
145662306a36Sopenharmony_ci{
145762306a36Sopenharmony_ci	io_tw_lock(req->ctx, ts);
145862306a36Sopenharmony_ci	/* req->task == current here, checking PF_EXITING is safe */
145962306a36Sopenharmony_ci	if (unlikely(req->task->flags & PF_EXITING))
146062306a36Sopenharmony_ci		io_req_defer_failed(req, -EFAULT);
146162306a36Sopenharmony_ci	else if (req->flags & REQ_F_FORCE_ASYNC)
146262306a36Sopenharmony_ci		io_queue_iowq(req, ts);
146362306a36Sopenharmony_ci	else
146462306a36Sopenharmony_ci		io_queue_sqe(req);
146562306a36Sopenharmony_ci}
146662306a36Sopenharmony_ci
146762306a36Sopenharmony_civoid io_req_task_queue_fail(struct io_kiocb *req, int ret)
146862306a36Sopenharmony_ci{
146962306a36Sopenharmony_ci	io_req_set_res(req, ret, 0);
147062306a36Sopenharmony_ci	req->io_task_work.func = io_req_task_cancel;
147162306a36Sopenharmony_ci	io_req_task_work_add(req);
147262306a36Sopenharmony_ci}
147362306a36Sopenharmony_ci
147462306a36Sopenharmony_civoid io_req_task_queue(struct io_kiocb *req)
147562306a36Sopenharmony_ci{
147662306a36Sopenharmony_ci	req->io_task_work.func = io_req_task_submit;
147762306a36Sopenharmony_ci	io_req_task_work_add(req);
147862306a36Sopenharmony_ci}
147962306a36Sopenharmony_ci
148062306a36Sopenharmony_civoid io_queue_next(struct io_kiocb *req)
148162306a36Sopenharmony_ci{
148262306a36Sopenharmony_ci	struct io_kiocb *nxt = io_req_find_next(req);
148362306a36Sopenharmony_ci
148462306a36Sopenharmony_ci	if (nxt)
148562306a36Sopenharmony_ci		io_req_task_queue(nxt);
148662306a36Sopenharmony_ci}
148762306a36Sopenharmony_ci
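/*
 * Retire a batch of completed requests: handle the slow-path flags
 * (refcounts, armed poll entries, links, op-specific cleanup), drop the
 * file, rsrc node and task references, and recycle each request into the
 * allocation cache.
 */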
148862306a36Sopenharmony_cistatic void io_free_batch_list(struct io_ring_ctx *ctx,
148962306a36Sopenharmony_ci			       struct io_wq_work_node *node)
149062306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
149162306a36Sopenharmony_ci{
149262306a36Sopenharmony_ci	do {
149362306a36Sopenharmony_ci		struct io_kiocb *req = container_of(node, struct io_kiocb,
149462306a36Sopenharmony_ci						    comp_list);
149562306a36Sopenharmony_ci
149662306a36Sopenharmony_ci		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
149762306a36Sopenharmony_ci			if (req->flags & REQ_F_REFCOUNT) {
149862306a36Sopenharmony_ci				node = req->comp_list.next;
149962306a36Sopenharmony_ci				if (!req_ref_put_and_test(req))
150062306a36Sopenharmony_ci					continue;
150162306a36Sopenharmony_ci			}
150262306a36Sopenharmony_ci			if ((req->flags & REQ_F_POLLED) && req->apoll) {
150362306a36Sopenharmony_ci				struct async_poll *apoll = req->apoll;
150462306a36Sopenharmony_ci
150562306a36Sopenharmony_ci				if (apoll->double_poll)
150662306a36Sopenharmony_ci					kfree(apoll->double_poll);
150762306a36Sopenharmony_ci				if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
150862306a36Sopenharmony_ci					kfree(apoll);
150962306a36Sopenharmony_ci				req->flags &= ~REQ_F_POLLED;
151062306a36Sopenharmony_ci			}
151162306a36Sopenharmony_ci			if (req->flags & IO_REQ_LINK_FLAGS)
151262306a36Sopenharmony_ci				io_queue_next(req);
151362306a36Sopenharmony_ci			if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
151462306a36Sopenharmony_ci				io_clean_op(req);
151562306a36Sopenharmony_ci		}
151662306a36Sopenharmony_ci		io_put_file(req);
151762306a36Sopenharmony_ci
151862306a36Sopenharmony_ci		io_req_put_rsrc_locked(req, ctx);
151962306a36Sopenharmony_ci
152062306a36Sopenharmony_ci		io_put_task(req->task);
152162306a36Sopenharmony_ci		node = req->comp_list.next;
152262306a36Sopenharmony_ci		io_req_add_to_cache(req, ctx);
152362306a36Sopenharmony_ci	} while (node);
152462306a36Sopenharmony_ci}
152562306a36Sopenharmony_ci
152662306a36Sopenharmony_civoid __io_submit_flush_completions(struct io_ring_ctx *ctx)
152762306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
152862306a36Sopenharmony_ci{
152962306a36Sopenharmony_ci	struct io_submit_state *state = &ctx->submit_state;
153062306a36Sopenharmony_ci	struct io_wq_work_node *node;
153162306a36Sopenharmony_ci
153262306a36Sopenharmony_ci	__io_cq_lock(ctx);
153362306a36Sopenharmony_ci	/* must come first to preserve CQE ordering in failure cases */
153462306a36Sopenharmony_ci	if (state->cqes_count)
153562306a36Sopenharmony_ci		__io_flush_post_cqes(ctx);
153662306a36Sopenharmony_ci	__wq_list_for_each(node, &state->compl_reqs) {
153762306a36Sopenharmony_ci		struct io_kiocb *req = container_of(node, struct io_kiocb,
153862306a36Sopenharmony_ci					    comp_list);
153962306a36Sopenharmony_ci
154062306a36Sopenharmony_ci		if (!(req->flags & REQ_F_CQE_SKIP) &&
154162306a36Sopenharmony_ci		    unlikely(!io_fill_cqe_req(ctx, req))) {
154262306a36Sopenharmony_ci			if (ctx->lockless_cq) {
154362306a36Sopenharmony_ci				spin_lock(&ctx->completion_lock);
154462306a36Sopenharmony_ci				io_req_cqe_overflow(req);
154562306a36Sopenharmony_ci				spin_unlock(&ctx->completion_lock);
154662306a36Sopenharmony_ci			} else {
154762306a36Sopenharmony_ci				io_req_cqe_overflow(req);
154862306a36Sopenharmony_ci			}
154962306a36Sopenharmony_ci		}
155062306a36Sopenharmony_ci	}
155162306a36Sopenharmony_ci	__io_cq_unlock_post(ctx);
155262306a36Sopenharmony_ci
155362306a36Sopenharmony_ci	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
155462306a36Sopenharmony_ci		io_free_batch_list(ctx, state->compl_reqs.first);
155562306a36Sopenharmony_ci		INIT_WQ_LIST(&state->compl_reqs);
155662306a36Sopenharmony_ci	}
155762306a36Sopenharmony_ci}
155862306a36Sopenharmony_ci
155962306a36Sopenharmony_cistatic unsigned io_cqring_events(struct io_ring_ctx *ctx)
156062306a36Sopenharmony_ci{
156162306a36Sopenharmony_ci	/* See comment at the top of this file */
156262306a36Sopenharmony_ci	smp_rmb();
156362306a36Sopenharmony_ci	return __io_cqring_events(ctx);
156462306a36Sopenharmony_ci}
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci/*
156762306a36Sopenharmony_ci * We can't just wait for polled events to come to us, we have to actively
156862306a36Sopenharmony_ci * find and complete them.
156962306a36Sopenharmony_ci */
157062306a36Sopenharmony_cistatic __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
157162306a36Sopenharmony_ci{
157262306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_IOPOLL))
157362306a36Sopenharmony_ci		return;
157462306a36Sopenharmony_ci
157562306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
157662306a36Sopenharmony_ci	while (!wq_list_empty(&ctx->iopoll_list)) {
157762306a36Sopenharmony_ci		/* let it sleep and repeat later if we can't complete a request */
157862306a36Sopenharmony_ci		if (io_do_iopoll(ctx, true) == 0)
157962306a36Sopenharmony_ci			break;
158062306a36Sopenharmony_ci		/*
158162306a36Sopenharmony_ci		 * Ensure we allow local-to-the-cpu processing to take place;
158262306a36Sopenharmony_ci		 * in this case we need to ensure that we reap all events.
158362306a36Sopenharmony_ci		 * Also let task_work, etc. make progress by releasing the mutex.
158462306a36Sopenharmony_ci		 */
158562306a36Sopenharmony_ci		if (need_resched()) {
158662306a36Sopenharmony_ci			mutex_unlock(&ctx->uring_lock);
158762306a36Sopenharmony_ci			cond_resched();
158862306a36Sopenharmony_ci			mutex_lock(&ctx->uring_lock);
158962306a36Sopenharmony_ci		}
159062306a36Sopenharmony_ci	}
159162306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
159262306a36Sopenharmony_ci}
159362306a36Sopenharmony_ci
159462306a36Sopenharmony_cistatic int io_iopoll_check(struct io_ring_ctx *ctx, long min)
159562306a36Sopenharmony_ci{
159662306a36Sopenharmony_ci	unsigned int nr_events = 0;
159762306a36Sopenharmony_ci	unsigned long check_cq;
159862306a36Sopenharmony_ci
159962306a36Sopenharmony_ci	if (!io_allowed_run_tw(ctx))
160062306a36Sopenharmony_ci		return -EEXIST;
160162306a36Sopenharmony_ci
160262306a36Sopenharmony_ci	check_cq = READ_ONCE(ctx->check_cq);
160362306a36Sopenharmony_ci	if (unlikely(check_cq)) {
160462306a36Sopenharmony_ci		if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
160562306a36Sopenharmony_ci			__io_cqring_overflow_flush(ctx);
160662306a36Sopenharmony_ci		/*
160762306a36Sopenharmony_ci		 * Similarly do not spin if we have not informed the user of any
160862306a36Sopenharmony_ci		 * dropped CQE.
160962306a36Sopenharmony_ci		 */
161062306a36Sopenharmony_ci		if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))
161162306a36Sopenharmony_ci			return -EBADR;
161262306a36Sopenharmony_ci	}
161362306a36Sopenharmony_ci	/*
161462306a36Sopenharmony_ci	 * Don't enter poll loop if we already have events pending.
161562306a36Sopenharmony_ci	 * If we do, we can potentially be spinning for commands that
161662306a36Sopenharmony_ci	 * already triggered a CQE (eg in error).
161762306a36Sopenharmony_ci	 */
161862306a36Sopenharmony_ci	if (io_cqring_events(ctx))
161962306a36Sopenharmony_ci		return 0;
162062306a36Sopenharmony_ci
162162306a36Sopenharmony_ci	do {
162262306a36Sopenharmony_ci		int ret = 0;
162362306a36Sopenharmony_ci
162462306a36Sopenharmony_ci		/*
162562306a36Sopenharmony_ci		 * If a submit got punted to a workqueue, we can have the
162662306a36Sopenharmony_ci		 * application entering polling for a command before it gets
162762306a36Sopenharmony_ci		 * issued. That app will hold the uring_lock for the duration
162862306a36Sopenharmony_ci		 * of the poll right here, so we need to take a breather every
162962306a36Sopenharmony_ci		 * now and then to ensure that the issue has a chance to add
163062306a36Sopenharmony_ci		 * the poll to the issued list. Otherwise we can spin here
163162306a36Sopenharmony_ci		 * forever, while the workqueue is stuck trying to acquire the
163262306a36Sopenharmony_ci		 * very same mutex.
163362306a36Sopenharmony_ci		 */
163462306a36Sopenharmony_ci		if (wq_list_empty(&ctx->iopoll_list) ||
163562306a36Sopenharmony_ci		    io_task_work_pending(ctx)) {
163662306a36Sopenharmony_ci			u32 tail = ctx->cached_cq_tail;
163762306a36Sopenharmony_ci
163862306a36Sopenharmony_ci			(void) io_run_local_work_locked(ctx, min);
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci			if (task_work_pending(current) ||
164162306a36Sopenharmony_ci			    wq_list_empty(&ctx->iopoll_list)) {
164262306a36Sopenharmony_ci				mutex_unlock(&ctx->uring_lock);
164362306a36Sopenharmony_ci				io_run_task_work();
164462306a36Sopenharmony_ci				mutex_lock(&ctx->uring_lock);
164562306a36Sopenharmony_ci			}
164662306a36Sopenharmony_ci			/* some requests don't go through iopoll_list */
164762306a36Sopenharmony_ci			if (tail != ctx->cached_cq_tail ||
164862306a36Sopenharmony_ci			    wq_list_empty(&ctx->iopoll_list))
164962306a36Sopenharmony_ci				break;
165062306a36Sopenharmony_ci		}
165162306a36Sopenharmony_ci		ret = io_do_iopoll(ctx, !min);
165262306a36Sopenharmony_ci		if (unlikely(ret < 0))
165362306a36Sopenharmony_ci			return ret;
165462306a36Sopenharmony_ci
165562306a36Sopenharmony_ci		if (task_sigpending(current))
165662306a36Sopenharmony_ci			return -EINTR;
165762306a36Sopenharmony_ci		if (need_resched())
165862306a36Sopenharmony_ci			break;
165962306a36Sopenharmony_ci
166062306a36Sopenharmony_ci		nr_events += ret;
166162306a36Sopenharmony_ci	} while (nr_events < min);
166262306a36Sopenharmony_ci
166362306a36Sopenharmony_ci	return 0;
166462306a36Sopenharmony_ci}
166562306a36Sopenharmony_ci
166662306a36Sopenharmony_civoid io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
166762306a36Sopenharmony_ci{
166862306a36Sopenharmony_ci	if (ts->locked)
166962306a36Sopenharmony_ci		io_req_complete_defer(req);
167062306a36Sopenharmony_ci	else
167162306a36Sopenharmony_ci		io_req_complete_post(req, IO_URING_F_UNLOCKED);
167262306a36Sopenharmony_ci}
167362306a36Sopenharmony_ci
167462306a36Sopenharmony_ci/*
167562306a36Sopenharmony_ci * After the iocb has been issued, it's safe for it to be found on the poll
167662306a36Sopenharmony_ci * list. Adding the kiocb to the list AFTER submission ensures that we don't
167762306a36Sopenharmony_ci * find it from an io_do_iopoll() thread before the issuer is done
167862306a36Sopenharmony_ci * accessing the kiocb cookie.
167962306a36Sopenharmony_ci */
168062306a36Sopenharmony_cistatic void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
168162306a36Sopenharmony_ci{
168262306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
168362306a36Sopenharmony_ci	const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_ci	/* workqueue context doesn't hold uring_lock, grab it now */
168662306a36Sopenharmony_ci	if (unlikely(needs_lock))
168762306a36Sopenharmony_ci		mutex_lock(&ctx->uring_lock);
168862306a36Sopenharmony_ci
168962306a36Sopenharmony_ci	/*
169062306a36Sopenharmony_ci	 * Track whether we have multiple files in our lists. This impacts how
169162306a36Sopenharmony_ci	 * we poll: avoid spinning if the requests may be spread across
169262306a36Sopenharmony_ci	 * different devices.
169362306a36Sopenharmony_ci	 */
169462306a36Sopenharmony_ci	if (wq_list_empty(&ctx->iopoll_list)) {
169562306a36Sopenharmony_ci		ctx->poll_multi_queue = false;
169662306a36Sopenharmony_ci	} else if (!ctx->poll_multi_queue) {
169762306a36Sopenharmony_ci		struct io_kiocb *list_req;
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_ci		list_req = container_of(ctx->iopoll_list.first, struct io_kiocb,
170062306a36Sopenharmony_ci					comp_list);
170162306a36Sopenharmony_ci		if (list_req->file != req->file)
170262306a36Sopenharmony_ci			ctx->poll_multi_queue = true;
170362306a36Sopenharmony_ci	}
170462306a36Sopenharmony_ci
170562306a36Sopenharmony_ci	/*
170662306a36Sopenharmony_ci	 * For fast devices, IO may have already completed. If it has, add
170762306a36Sopenharmony_ci	 * it to the front so we find it first.
170862306a36Sopenharmony_ci	 */
170962306a36Sopenharmony_ci	if (READ_ONCE(req->iopoll_completed))
171062306a36Sopenharmony_ci		wq_list_add_head(&req->comp_list, &ctx->iopoll_list);
171162306a36Sopenharmony_ci	else
171262306a36Sopenharmony_ci		wq_list_add_tail(&req->comp_list, &ctx->iopoll_list);
171362306a36Sopenharmony_ci
171462306a36Sopenharmony_ci	if (unlikely(needs_lock)) {
171562306a36Sopenharmony_ci		/*
171662306a36Sopenharmony_ci		 * If IORING_SETUP_SQPOLL is enabled, sqes are either handled
171762306a36Sopenharmony_ci		 * in sq thread task context or in io worker task context. If
171862306a36Sopenharmony_ci		 * the current task context is the sq thread, we don't need to
171962306a36Sopenharmony_ci		 * check whether we should wake up the sq thread.
172062306a36Sopenharmony_ci		 */
172162306a36Sopenharmony_ci		if ((ctx->flags & IORING_SETUP_SQPOLL) &&
172262306a36Sopenharmony_ci		    wq_has_sleeper(&ctx->sq_data->wait))
172362306a36Sopenharmony_ci			wake_up(&ctx->sq_data->wait);
172462306a36Sopenharmony_ci
172562306a36Sopenharmony_ci		mutex_unlock(&ctx->uring_lock);
172662306a36Sopenharmony_ci	}
172762306a36Sopenharmony_ci}
172862306a36Sopenharmony_ci
172962306a36Sopenharmony_ciunsigned int io_file_get_flags(struct file *file)
173062306a36Sopenharmony_ci{
173162306a36Sopenharmony_ci	unsigned int res = 0;
173262306a36Sopenharmony_ci
173362306a36Sopenharmony_ci	if (S_ISREG(file_inode(file)->i_mode))
173462306a36Sopenharmony_ci		res |= REQ_F_ISREG;
173562306a36Sopenharmony_ci	if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
173662306a36Sopenharmony_ci		res |= REQ_F_SUPPORT_NOWAIT;
173762306a36Sopenharmony_ci	return res;
173862306a36Sopenharmony_ci}
173962306a36Sopenharmony_ci
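/*
 * Allocate ->async_data for a request. Note the inverted return value:
 * false means the allocation succeeded, true means it failed.
 */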
174062306a36Sopenharmony_cibool io_alloc_async_data(struct io_kiocb *req)
174162306a36Sopenharmony_ci{
174262306a36Sopenharmony_ci	WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size);
174362306a36Sopenharmony_ci	req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL);
174462306a36Sopenharmony_ci	if (req->async_data) {
174562306a36Sopenharmony_ci		req->flags |= REQ_F_ASYNC_DATA;
174662306a36Sopenharmony_ci		return false;
174762306a36Sopenharmony_ci	}
174862306a36Sopenharmony_ci	return true;
174962306a36Sopenharmony_ci}
175062306a36Sopenharmony_ci
175162306a36Sopenharmony_ciint io_req_prep_async(struct io_kiocb *req)
175262306a36Sopenharmony_ci{
175362306a36Sopenharmony_ci	const struct io_cold_def *cdef = &io_cold_defs[req->opcode];
175462306a36Sopenharmony_ci	const struct io_issue_def *def = &io_issue_defs[req->opcode];
175562306a36Sopenharmony_ci
175662306a36Sopenharmony_ci	/* assign early for deferred execution for non-fixed file */
175762306a36Sopenharmony_ci	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
175862306a36Sopenharmony_ci		req->file = io_file_get_normal(req, req->cqe.fd);
175962306a36Sopenharmony_ci	if (!cdef->prep_async)
176062306a36Sopenharmony_ci		return 0;
176162306a36Sopenharmony_ci	if (WARN_ON_ONCE(req_has_async_data(req)))
176262306a36Sopenharmony_ci		return -EFAULT;
176362306a36Sopenharmony_ci	if (!def->manual_alloc) {
176462306a36Sopenharmony_ci		if (io_alloc_async_data(req))
176562306a36Sopenharmony_ci			return -EAGAIN;
176662306a36Sopenharmony_ci	}
176762306a36Sopenharmony_ci	return cdef->prep_async(req);
176862306a36Sopenharmony_ci}
176962306a36Sopenharmony_ci
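/*
 * Recover the SQ position the head of a (possibly linked) request was
 * submitted at. ->cached_sq_head was bumped once per consumed sqe, so walk
 * it back by one per request in the chain: e.g. if three linked sqes were
 * consumed and cached_sq_head is now 13, this returns 10.
 */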
177062306a36Sopenharmony_cistatic u32 io_get_sequence(struct io_kiocb *req)
177162306a36Sopenharmony_ci{
177262306a36Sopenharmony_ci	u32 seq = req->ctx->cached_sq_head;
177362306a36Sopenharmony_ci	struct io_kiocb *cur;
177462306a36Sopenharmony_ci
177562306a36Sopenharmony_ci	/* need original cached_sq_head, but it was increased for each req */
177662306a36Sopenharmony_ci	io_for_each_link(cur, req)
177762306a36Sopenharmony_ci		seq--;
177862306a36Sopenharmony_ci	return seq;
177962306a36Sopenharmony_ci}
178062306a36Sopenharmony_ci
178162306a36Sopenharmony_cistatic __cold void io_drain_req(struct io_kiocb *req)
178262306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
178362306a36Sopenharmony_ci{
178462306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
178562306a36Sopenharmony_ci	struct io_defer_entry *de;
178662306a36Sopenharmony_ci	int ret;
178762306a36Sopenharmony_ci	u32 seq = io_get_sequence(req);
178862306a36Sopenharmony_ci
178962306a36Sopenharmony_ci	/* Still need to defer if there are pending requests in the defer list. */
179062306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
179162306a36Sopenharmony_ci	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
179262306a36Sopenharmony_ci		spin_unlock(&ctx->completion_lock);
179362306a36Sopenharmony_ciqueue:
179462306a36Sopenharmony_ci		ctx->drain_active = false;
179562306a36Sopenharmony_ci		io_req_task_queue(req);
179662306a36Sopenharmony_ci		return;
179762306a36Sopenharmony_ci	}
179862306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
179962306a36Sopenharmony_ci
180062306a36Sopenharmony_ci	io_prep_async_link(req);
180162306a36Sopenharmony_ci	de = kmalloc(sizeof(*de), GFP_KERNEL);
180262306a36Sopenharmony_ci	if (!de) {
180362306a36Sopenharmony_ci		ret = -ENOMEM;
180462306a36Sopenharmony_ci		io_req_defer_failed(req, ret);
180562306a36Sopenharmony_ci		return;
180662306a36Sopenharmony_ci	}
180762306a36Sopenharmony_ci
180862306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
180962306a36Sopenharmony_ci	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
181062306a36Sopenharmony_ci		spin_unlock(&ctx->completion_lock);
181162306a36Sopenharmony_ci		kfree(de);
181262306a36Sopenharmony_ci		goto queue;
181362306a36Sopenharmony_ci	}
181462306a36Sopenharmony_ci
181562306a36Sopenharmony_ci	trace_io_uring_defer(req);
181662306a36Sopenharmony_ci	de->req = req;
181762306a36Sopenharmony_ci	de->seq = seq;
181862306a36Sopenharmony_ci	list_add_tail(&de->list, &ctx->defer_list);
181962306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
182062306a36Sopenharmony_ci}
182162306a36Sopenharmony_ci
182262306a36Sopenharmony_cistatic bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
182362306a36Sopenharmony_ci			   unsigned int issue_flags)
182462306a36Sopenharmony_ci{
182562306a36Sopenharmony_ci	if (req->file || !def->needs_file)
182662306a36Sopenharmony_ci		return true;
182762306a36Sopenharmony_ci
182862306a36Sopenharmony_ci	if (req->flags & REQ_F_FIXED_FILE)
182962306a36Sopenharmony_ci		req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags);
183062306a36Sopenharmony_ci	else
183162306a36Sopenharmony_ci		req->file = io_file_get_normal(req, req->cqe.fd);
183262306a36Sopenharmony_ci
183362306a36Sopenharmony_ci	return !!req->file;
183462306a36Sopenharmony_ci}
183562306a36Sopenharmony_ci
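/*
 * Central issue path: make sure the request has its file, apply per-request
 * credentials and audit hooks around ->issue(), then either complete the
 * request here (IOU_OK), leave completion to the handler
 * (IOU_ISSUE_SKIP_COMPLETE), or hand the error back to the caller.
 */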
183662306a36Sopenharmony_cistatic int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
183762306a36Sopenharmony_ci{
183862306a36Sopenharmony_ci	const struct io_issue_def *def = &io_issue_defs[req->opcode];
183962306a36Sopenharmony_ci	const struct cred *creds = NULL;
184062306a36Sopenharmony_ci	int ret;
184162306a36Sopenharmony_ci
184262306a36Sopenharmony_ci	if (unlikely(!io_assign_file(req, def, issue_flags)))
184362306a36Sopenharmony_ci		return -EBADF;
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_ci	if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
184662306a36Sopenharmony_ci		creds = override_creds(req->creds);
184762306a36Sopenharmony_ci
184862306a36Sopenharmony_ci	if (!def->audit_skip)
184962306a36Sopenharmony_ci		audit_uring_entry(req->opcode);
185062306a36Sopenharmony_ci
185162306a36Sopenharmony_ci	ret = def->issue(req, issue_flags);
185262306a36Sopenharmony_ci
185362306a36Sopenharmony_ci	if (!def->audit_skip)
185462306a36Sopenharmony_ci		audit_uring_exit(!ret, ret);
185562306a36Sopenharmony_ci
185662306a36Sopenharmony_ci	if (creds)
185762306a36Sopenharmony_ci		revert_creds(creds);
185862306a36Sopenharmony_ci
185962306a36Sopenharmony_ci	if (ret == IOU_OK) {
186062306a36Sopenharmony_ci		if (issue_flags & IO_URING_F_COMPLETE_DEFER)
186162306a36Sopenharmony_ci			io_req_complete_defer(req);
186262306a36Sopenharmony_ci		else
186362306a36Sopenharmony_ci			io_req_complete_post(req, issue_flags);
186462306a36Sopenharmony_ci
186562306a36Sopenharmony_ci		return 0;
186662306a36Sopenharmony_ci	}
186762306a36Sopenharmony_ci
186862306a36Sopenharmony_ci	if (ret != IOU_ISSUE_SKIP_COMPLETE)
186962306a36Sopenharmony_ci		return ret;
187062306a36Sopenharmony_ci
187162306a36Sopenharmony_ci	/* If the op doesn't have a file, we're not polling for it */
187262306a36Sopenharmony_ci	if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
187362306a36Sopenharmony_ci		io_iopoll_req_issued(req, issue_flags);
187462306a36Sopenharmony_ci
187562306a36Sopenharmony_ci	return 0;
187662306a36Sopenharmony_ci}
187762306a36Sopenharmony_ci
187862306a36Sopenharmony_ciint io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
187962306a36Sopenharmony_ci{
188062306a36Sopenharmony_ci	io_tw_lock(req->ctx, ts);
188162306a36Sopenharmony_ci	return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
188262306a36Sopenharmony_ci				 IO_URING_F_COMPLETE_DEFER);
188362306a36Sopenharmony_ci}
188462306a36Sopenharmony_ci
188562306a36Sopenharmony_cistruct io_wq_work *io_wq_free_work(struct io_wq_work *work)
188662306a36Sopenharmony_ci{
188762306a36Sopenharmony_ci	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
188862306a36Sopenharmony_ci	struct io_kiocb *nxt = NULL;
188962306a36Sopenharmony_ci
189062306a36Sopenharmony_ci	if (req_ref_put_and_test(req)) {
189162306a36Sopenharmony_ci		if (req->flags & IO_REQ_LINK_FLAGS)
189262306a36Sopenharmony_ci			nxt = io_req_find_next(req);
189362306a36Sopenharmony_ci		io_free_req(req);
189462306a36Sopenharmony_ci	}
189562306a36Sopenharmony_ci	return nxt ? &nxt->work : NULL;
189662306a36Sopenharmony_ci}
189762306a36Sopenharmony_ci
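/*
 * Entry point for requests punted to io-wq. An extra reference is held for
 * the duration (dropped via io_wq_free_work()), then the request is issued
 * in blocking mode, retrying -EAGAIN and arming a poll handler where that
 * is possible. Failures are pushed back to task context to avoid locking
 * problems.
 */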
189862306a36Sopenharmony_civoid io_wq_submit_work(struct io_wq_work *work)
189962306a36Sopenharmony_ci{
190062306a36Sopenharmony_ci	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
190162306a36Sopenharmony_ci	const struct io_issue_def *def = &io_issue_defs[req->opcode];
190262306a36Sopenharmony_ci	unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ;
190362306a36Sopenharmony_ci	bool needs_poll = false;
190462306a36Sopenharmony_ci	int ret = 0, err = -ECANCELED;
190562306a36Sopenharmony_ci
190662306a36Sopenharmony_ci	/* one will be dropped by ->io_wq_free_work() after returning to io-wq */
190762306a36Sopenharmony_ci	if (!(req->flags & REQ_F_REFCOUNT))
190862306a36Sopenharmony_ci		__io_req_set_refcount(req, 2);
190962306a36Sopenharmony_ci	else
191062306a36Sopenharmony_ci		req_ref_get(req);
191162306a36Sopenharmony_ci
191262306a36Sopenharmony_ci	io_arm_ltimeout(req);
191362306a36Sopenharmony_ci
191462306a36Sopenharmony_ci	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
191562306a36Sopenharmony_ci	if (work->flags & IO_WQ_WORK_CANCEL) {
191662306a36Sopenharmony_cifail:
191762306a36Sopenharmony_ci		io_req_task_queue_fail(req, err);
191862306a36Sopenharmony_ci		return;
191962306a36Sopenharmony_ci	}
192062306a36Sopenharmony_ci	if (!io_assign_file(req, def, issue_flags)) {
192162306a36Sopenharmony_ci		err = -EBADF;
192262306a36Sopenharmony_ci		work->flags |= IO_WQ_WORK_CANCEL;
192362306a36Sopenharmony_ci		goto fail;
192462306a36Sopenharmony_ci	}
192562306a36Sopenharmony_ci
192662306a36Sopenharmony_ci	if (req->flags & REQ_F_FORCE_ASYNC) {
192762306a36Sopenharmony_ci		bool opcode_poll = def->pollin || def->pollout;
192862306a36Sopenharmony_ci
192962306a36Sopenharmony_ci		if (opcode_poll && file_can_poll(req->file)) {
193062306a36Sopenharmony_ci			needs_poll = true;
193162306a36Sopenharmony_ci			issue_flags |= IO_URING_F_NONBLOCK;
193262306a36Sopenharmony_ci		}
193362306a36Sopenharmony_ci	}
193462306a36Sopenharmony_ci
193562306a36Sopenharmony_ci	do {
193662306a36Sopenharmony_ci		ret = io_issue_sqe(req, issue_flags);
193762306a36Sopenharmony_ci		if (ret != -EAGAIN)
193862306a36Sopenharmony_ci			break;
193962306a36Sopenharmony_ci
194062306a36Sopenharmony_ci		/*
194162306a36Sopenharmony_ci		 * If REQ_F_NOWAIT is set, then don't wait or retry with
194262306a36Sopenharmony_ci		 * poll. -EAGAIN is final for that case.
194362306a36Sopenharmony_ci		 */
194462306a36Sopenharmony_ci		if (req->flags & REQ_F_NOWAIT)
194562306a36Sopenharmony_ci			break;
194662306a36Sopenharmony_ci
194762306a36Sopenharmony_ci		/*
194862306a36Sopenharmony_ci		 * We can get EAGAIN for iopolled IO even though we're
194962306a36Sopenharmony_ci		 * forcing a sync submission from here, since we can't
195062306a36Sopenharmony_ci		 * wait for request slots on the block side.
195162306a36Sopenharmony_ci		 */
195262306a36Sopenharmony_ci		if (!needs_poll) {
195362306a36Sopenharmony_ci			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
195462306a36Sopenharmony_ci				break;
195562306a36Sopenharmony_ci			if (io_wq_worker_stopped())
195662306a36Sopenharmony_ci				break;
195762306a36Sopenharmony_ci			cond_resched();
195862306a36Sopenharmony_ci			continue;
195962306a36Sopenharmony_ci		}
196062306a36Sopenharmony_ci
196162306a36Sopenharmony_ci		if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
196262306a36Sopenharmony_ci			return;
196362306a36Sopenharmony_ci		/* aborted or ready, in either case retry blocking */
196462306a36Sopenharmony_ci		needs_poll = false;
196562306a36Sopenharmony_ci		issue_flags &= ~IO_URING_F_NONBLOCK;
196662306a36Sopenharmony_ci	} while (1);
196762306a36Sopenharmony_ci
196862306a36Sopenharmony_ci	/* avoid locking problems by failing it from a clean context */
196962306a36Sopenharmony_ci	if (ret < 0)
197062306a36Sopenharmony_ci		io_req_task_queue_fail(req, ret);
197162306a36Sopenharmony_ci}
197262306a36Sopenharmony_ci
197362306a36Sopenharmony_ciinline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
197462306a36Sopenharmony_ci				      unsigned int issue_flags)
197562306a36Sopenharmony_ci{
197662306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
197762306a36Sopenharmony_ci	struct io_fixed_file *slot;
197862306a36Sopenharmony_ci	struct file *file = NULL;
197962306a36Sopenharmony_ci
198062306a36Sopenharmony_ci	io_ring_submit_lock(ctx, issue_flags);
198162306a36Sopenharmony_ci
198262306a36Sopenharmony_ci	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
198362306a36Sopenharmony_ci		goto out;
198462306a36Sopenharmony_ci	fd = array_index_nospec(fd, ctx->nr_user_files);
198562306a36Sopenharmony_ci	slot = io_fixed_file_slot(&ctx->file_table, fd);
198662306a36Sopenharmony_ci	file = io_slot_file(slot);
198762306a36Sopenharmony_ci	req->flags |= io_slot_flags(slot);
198862306a36Sopenharmony_ci	io_req_set_rsrc_node(req, ctx, 0);
198962306a36Sopenharmony_ciout:
199062306a36Sopenharmony_ci	io_ring_submit_unlock(ctx, issue_flags);
199162306a36Sopenharmony_ci	return file;
199262306a36Sopenharmony_ci}
199362306a36Sopenharmony_ci
199462306a36Sopenharmony_cistruct file *io_file_get_normal(struct io_kiocb *req, int fd)
199562306a36Sopenharmony_ci{
199662306a36Sopenharmony_ci	struct file *file = fget(fd);
199762306a36Sopenharmony_ci
199862306a36Sopenharmony_ci	trace_io_uring_file_get(req, fd);
199962306a36Sopenharmony_ci
200062306a36Sopenharmony_ci	/* we don't allow fixed io_uring files */
200162306a36Sopenharmony_ci	if (file && io_is_uring_fops(file))
200262306a36Sopenharmony_ci		io_req_track_inflight(req);
200362306a36Sopenharmony_ci	return file;
200462306a36Sopenharmony_ci}
200562306a36Sopenharmony_ci
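/*
 * Inline issue didn't complete the request. For a retryable -EAGAIN, try to
 * arm a poll handler; if the file is already ready the request is requeued
 * for issue, and if arming isn't possible it is punted to io-wq. Any other
 * error fails the request.
 */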
200662306a36Sopenharmony_cistatic void io_queue_async(struct io_kiocb *req, int ret)
200762306a36Sopenharmony_ci	__must_hold(&req->ctx->uring_lock)
200862306a36Sopenharmony_ci{
200962306a36Sopenharmony_ci	struct io_kiocb *linked_timeout;
201062306a36Sopenharmony_ci
201162306a36Sopenharmony_ci	if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) {
201262306a36Sopenharmony_ci		io_req_defer_failed(req, ret);
201362306a36Sopenharmony_ci		return;
201462306a36Sopenharmony_ci	}
201562306a36Sopenharmony_ci
201662306a36Sopenharmony_ci	linked_timeout = io_prep_linked_timeout(req);
201762306a36Sopenharmony_ci
201862306a36Sopenharmony_ci	switch (io_arm_poll_handler(req, 0)) {
201962306a36Sopenharmony_ci	case IO_APOLL_READY:
202062306a36Sopenharmony_ci		io_kbuf_recycle(req, 0);
202162306a36Sopenharmony_ci		io_req_task_queue(req);
202262306a36Sopenharmony_ci		break;
202362306a36Sopenharmony_ci	case IO_APOLL_ABORTED:
202462306a36Sopenharmony_ci		io_kbuf_recycle(req, 0);
202562306a36Sopenharmony_ci		io_queue_iowq(req, NULL);
202662306a36Sopenharmony_ci		break;
202762306a36Sopenharmony_ci	case IO_APOLL_OK:
202862306a36Sopenharmony_ci		break;
202962306a36Sopenharmony_ci	}
203062306a36Sopenharmony_ci
203162306a36Sopenharmony_ci	if (linked_timeout)
203262306a36Sopenharmony_ci		io_queue_linked_timeout(linked_timeout);
203362306a36Sopenharmony_ci}
203462306a36Sopenharmony_ci
203562306a36Sopenharmony_cistatic inline void io_queue_sqe(struct io_kiocb *req)
203662306a36Sopenharmony_ci	__must_hold(&req->ctx->uring_lock)
203762306a36Sopenharmony_ci{
203862306a36Sopenharmony_ci	int ret;
203962306a36Sopenharmony_ci
204062306a36Sopenharmony_ci	ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
204162306a36Sopenharmony_ci
204262306a36Sopenharmony_ci	/*
204362306a36Sopenharmony_ci	 * We async punt it if the file wasn't marked NOWAIT, or if the file
204462306a36Sopenharmony_ci	 * doesn't support non-blocking read/write attempts
204562306a36Sopenharmony_ci	 */
204662306a36Sopenharmony_ci	if (likely(!ret))
204762306a36Sopenharmony_ci		io_arm_ltimeout(req);
204862306a36Sopenharmony_ci	else
204962306a36Sopenharmony_ci		io_queue_async(req, ret);
205062306a36Sopenharmony_ci}
205162306a36Sopenharmony_ci
205262306a36Sopenharmony_cistatic void io_queue_sqe_fallback(struct io_kiocb *req)
205362306a36Sopenharmony_ci	__must_hold(&req->ctx->uring_lock)
205462306a36Sopenharmony_ci{
205562306a36Sopenharmony_ci	if (unlikely(req->flags & REQ_F_FAIL)) {
205662306a36Sopenharmony_ci		/*
205762306a36Sopenharmony_ci		 * We don't submit; fail them all. For that, replace hardlinks
205862306a36Sopenharmony_ci		 * with normal links. An extra REQ_F_LINK is tolerated.
205962306a36Sopenharmony_ci		 */
206062306a36Sopenharmony_ci		req->flags &= ~REQ_F_HARDLINK;
206162306a36Sopenharmony_ci		req->flags |= REQ_F_LINK;
206262306a36Sopenharmony_ci		io_req_defer_failed(req, req->cqe.res);
206362306a36Sopenharmony_ci	} else {
206462306a36Sopenharmony_ci		int ret = io_req_prep_async(req);
206562306a36Sopenharmony_ci
206662306a36Sopenharmony_ci		if (unlikely(ret)) {
206762306a36Sopenharmony_ci			io_req_defer_failed(req, ret);
206862306a36Sopenharmony_ci			return;
206962306a36Sopenharmony_ci		}
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_ci		if (unlikely(req->ctx->drain_active))
207262306a36Sopenharmony_ci			io_drain_req(req);
207362306a36Sopenharmony_ci		else
207462306a36Sopenharmony_ci			io_queue_iowq(req, NULL);
207562306a36Sopenharmony_ci	}
207662306a36Sopenharmony_ci}
207762306a36Sopenharmony_ci
207862306a36Sopenharmony_ci/*
207962306a36Sopenharmony_ci * Check SQE restrictions (opcode and flags).
208062306a36Sopenharmony_ci *
208162306a36Sopenharmony_ci * Returns 'true' if SQE is allowed, 'false' otherwise.
208262306a36Sopenharmony_ci */
208362306a36Sopenharmony_cistatic inline bool io_check_restriction(struct io_ring_ctx *ctx,
208462306a36Sopenharmony_ci					struct io_kiocb *req,
208562306a36Sopenharmony_ci					unsigned int sqe_flags)
208662306a36Sopenharmony_ci{
208762306a36Sopenharmony_ci	if (!test_bit(req->opcode, ctx->restrictions.sqe_op))
208862306a36Sopenharmony_ci		return false;
208962306a36Sopenharmony_ci
209062306a36Sopenharmony_ci	if ((sqe_flags & ctx->restrictions.sqe_flags_required) !=
209162306a36Sopenharmony_ci	    ctx->restrictions.sqe_flags_required)
209262306a36Sopenharmony_ci		return false;
209362306a36Sopenharmony_ci
209462306a36Sopenharmony_ci	if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed |
209562306a36Sopenharmony_ci			  ctx->restrictions.sqe_flags_required))
209662306a36Sopenharmony_ci		return false;
209762306a36Sopenharmony_ci
209862306a36Sopenharmony_ci	return true;
209962306a36Sopenharmony_ci}
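/*
 * Note: the restriction masks consulted above are assumed to be the ones
 * the application registered through io_uring_register() while the ring
 * was still disabled; with no opcode bits set, every sqe is rejected.
 */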
210062306a36Sopenharmony_ci
210162306a36Sopenharmony_cistatic void io_init_req_drain(struct io_kiocb *req)
210262306a36Sopenharmony_ci{
210362306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
210462306a36Sopenharmony_ci	struct io_kiocb *head = ctx->submit_state.link.head;
210562306a36Sopenharmony_ci
210662306a36Sopenharmony_ci	ctx->drain_active = true;
210762306a36Sopenharmony_ci	if (head) {
210862306a36Sopenharmony_ci		/*
210962306a36Sopenharmony_ci		 * If we need to drain a request in the middle of a link, drain
211062306a36Sopenharmony_ci		 * the head request and the next request/link after the current
211162306a36Sopenharmony_ci		 * link. Considering sequential execution of links,
211262306a36Sopenharmony_ci		 * REQ_F_IO_DRAIN will be maintained for every request of our
211362306a36Sopenharmony_ci		 * link.
211462306a36Sopenharmony_ci		 */
211562306a36Sopenharmony_ci		head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
211662306a36Sopenharmony_ci		ctx->drain_next = true;
211762306a36Sopenharmony_ci	}
211862306a36Sopenharmony_ci}
211962306a36Sopenharmony_ci
212062306a36Sopenharmony_cistatic int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
212162306a36Sopenharmony_ci		       const struct io_uring_sqe *sqe)
212262306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
212362306a36Sopenharmony_ci{
212462306a36Sopenharmony_ci	const struct io_issue_def *def;
212562306a36Sopenharmony_ci	unsigned int sqe_flags;
212662306a36Sopenharmony_ci	int personality;
212762306a36Sopenharmony_ci	u8 opcode;
212862306a36Sopenharmony_ci
212962306a36Sopenharmony_ci	/* req is partially pre-initialised, see io_preinit_req() */
213062306a36Sopenharmony_ci	req->opcode = opcode = READ_ONCE(sqe->opcode);
213162306a36Sopenharmony_ci	/* same numerical values as the corresponding REQ_F_*, safe to copy */
213262306a36Sopenharmony_ci	req->flags = sqe_flags = READ_ONCE(sqe->flags);
213362306a36Sopenharmony_ci	req->cqe.user_data = READ_ONCE(sqe->user_data);
213462306a36Sopenharmony_ci	req->file = NULL;
213562306a36Sopenharmony_ci	req->rsrc_node = NULL;
213662306a36Sopenharmony_ci	req->task = current;
213762306a36Sopenharmony_ci
213862306a36Sopenharmony_ci	if (unlikely(opcode >= IORING_OP_LAST)) {
213962306a36Sopenharmony_ci		req->opcode = 0;
214062306a36Sopenharmony_ci		return -EINVAL;
214162306a36Sopenharmony_ci	}
214262306a36Sopenharmony_ci	def = &io_issue_defs[opcode];
214362306a36Sopenharmony_ci	if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
214462306a36Sopenharmony_ci		/* enforce forwards compatibility on users */
214562306a36Sopenharmony_ci		if (sqe_flags & ~SQE_VALID_FLAGS)
214662306a36Sopenharmony_ci			return -EINVAL;
214762306a36Sopenharmony_ci		if (sqe_flags & IOSQE_BUFFER_SELECT) {
214862306a36Sopenharmony_ci			if (!def->buffer_select)
214962306a36Sopenharmony_ci				return -EOPNOTSUPP;
215062306a36Sopenharmony_ci			req->buf_index = READ_ONCE(sqe->buf_group);
215162306a36Sopenharmony_ci		}
215262306a36Sopenharmony_ci		if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
215362306a36Sopenharmony_ci			ctx->drain_disabled = true;
215462306a36Sopenharmony_ci		if (sqe_flags & IOSQE_IO_DRAIN) {
215562306a36Sopenharmony_ci			if (ctx->drain_disabled)
215662306a36Sopenharmony_ci				return -EOPNOTSUPP;
215762306a36Sopenharmony_ci			io_init_req_drain(req);
215862306a36Sopenharmony_ci		}
215962306a36Sopenharmony_ci	}
216062306a36Sopenharmony_ci	if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) {
216162306a36Sopenharmony_ci		if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags))
216262306a36Sopenharmony_ci			return -EACCES;
216362306a36Sopenharmony_ci		/* knock it to the slow queue path, will be drained there */
216462306a36Sopenharmony_ci		if (ctx->drain_active)
216562306a36Sopenharmony_ci			req->flags |= REQ_F_FORCE_ASYNC;
216662306a36Sopenharmony_ci		/* if there is no link, we're at "next" request and need to drain */
216762306a36Sopenharmony_ci		if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) {
216862306a36Sopenharmony_ci			ctx->drain_next = false;
216962306a36Sopenharmony_ci			ctx->drain_active = true;
217062306a36Sopenharmony_ci			req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC;
217162306a36Sopenharmony_ci		}
217262306a36Sopenharmony_ci	}
217362306a36Sopenharmony_ci
217462306a36Sopenharmony_ci	if (!def->ioprio && sqe->ioprio)
217562306a36Sopenharmony_ci		return -EINVAL;
217662306a36Sopenharmony_ci	if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
217762306a36Sopenharmony_ci		return -EINVAL;
217862306a36Sopenharmony_ci
217962306a36Sopenharmony_ci	if (def->needs_file) {
218062306a36Sopenharmony_ci		struct io_submit_state *state = &ctx->submit_state;
218162306a36Sopenharmony_ci
218262306a36Sopenharmony_ci		req->cqe.fd = READ_ONCE(sqe->fd);
218362306a36Sopenharmony_ci
218462306a36Sopenharmony_ci		/*
218562306a36Sopenharmony_ci		 * Plug now if we have more than 2 IOs left after this, and the
218662306a36Sopenharmony_ci		 * target is potentially a read/write to block-based storage.
218762306a36Sopenharmony_ci		 */
218862306a36Sopenharmony_ci		if (state->need_plug && def->plug) {
218962306a36Sopenharmony_ci			state->plug_started = true;
219062306a36Sopenharmony_ci			state->need_plug = false;
219162306a36Sopenharmony_ci			blk_start_plug_nr_ios(&state->plug, state->submit_nr);
219262306a36Sopenharmony_ci		}
219362306a36Sopenharmony_ci	}
219462306a36Sopenharmony_ci
219562306a36Sopenharmony_ci	personality = READ_ONCE(sqe->personality);
219662306a36Sopenharmony_ci	if (personality) {
219762306a36Sopenharmony_ci		int ret;
219862306a36Sopenharmony_ci
219962306a36Sopenharmony_ci		req->creds = xa_load(&ctx->personalities, personality);
220062306a36Sopenharmony_ci		if (!req->creds)
220162306a36Sopenharmony_ci			return -EINVAL;
220262306a36Sopenharmony_ci		get_cred(req->creds);
220362306a36Sopenharmony_ci		ret = security_uring_override_creds(req->creds);
220462306a36Sopenharmony_ci		if (ret) {
220562306a36Sopenharmony_ci			put_cred(req->creds);
220662306a36Sopenharmony_ci			return ret;
220762306a36Sopenharmony_ci		}
220862306a36Sopenharmony_ci		req->flags |= REQ_F_CREDS;
220962306a36Sopenharmony_ci	}
221062306a36Sopenharmony_ci
221162306a36Sopenharmony_ci	return def->prep(req, sqe);
221262306a36Sopenharmony_ci}
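
/*
 * Illustrative userspace sketch (assumes liburing; error handling
 * omitted): a registered personality captures the caller's credentials
 * at registration time; setting sqe->personality makes io_init_req()
 * above issue the request with those credentials instead of the
 * submitter's current ones.
 *
 *	int pers = io_uring_register_personality(&ring);
 *
 *	// ... possibly drop privileges here ...
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_openat(sqe, AT_FDCWD, "data.txt", O_RDONLY, 0);
 *	sqe->personality = pers;
 *	io_uring_submit(&ring);
 */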
221362306a36Sopenharmony_ci
221462306a36Sopenharmony_cistatic __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
221562306a36Sopenharmony_ci				      struct io_kiocb *req, int ret)
221662306a36Sopenharmony_ci{
221762306a36Sopenharmony_ci	struct io_ring_ctx *ctx = req->ctx;
221862306a36Sopenharmony_ci	struct io_submit_link *link = &ctx->submit_state.link;
221962306a36Sopenharmony_ci	struct io_kiocb *head = link->head;
222062306a36Sopenharmony_ci
222162306a36Sopenharmony_ci	trace_io_uring_req_failed(sqe, req, ret);
222262306a36Sopenharmony_ci
222362306a36Sopenharmony_ci	/*
222462306a36Sopenharmony_ci	 * Avoid breaking links in the middle as it renders links with SQPOLL
222562306a36Sopenharmony_ci	 * unusable. Instead of failing eagerly, continue assembling the link if
222662306a36Sopenharmony_ci	 * applicable and mark the head with REQ_F_FAIL. The link flushing code
222762306a36Sopenharmony_ci	 * should find the flag and handle the rest.
222862306a36Sopenharmony_ci	 */
222962306a36Sopenharmony_ci	req_fail_link_node(req, ret);
223062306a36Sopenharmony_ci	if (head && !(head->flags & REQ_F_FAIL))
223162306a36Sopenharmony_ci		req_fail_link_node(head, -ECANCELED);
223262306a36Sopenharmony_ci
223362306a36Sopenharmony_ci	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
223462306a36Sopenharmony_ci		if (head) {
223562306a36Sopenharmony_ci			link->last->link = req;
223662306a36Sopenharmony_ci			link->head = NULL;
223762306a36Sopenharmony_ci			req = head;
223862306a36Sopenharmony_ci		}
223962306a36Sopenharmony_ci		io_queue_sqe_fallback(req);
224062306a36Sopenharmony_ci		return ret;
224162306a36Sopenharmony_ci	}
224262306a36Sopenharmony_ci
224362306a36Sopenharmony_ci	if (head)
224462306a36Sopenharmony_ci		link->last->link = req;
224562306a36Sopenharmony_ci	else
224662306a36Sopenharmony_ci		link->head = req;
224762306a36Sopenharmony_ci	link->last = req;
224862306a36Sopenharmony_ci	return 0;
224962306a36Sopenharmony_ci}
225062306a36Sopenharmony_ci
225162306a36Sopenharmony_cistatic inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
225262306a36Sopenharmony_ci			 const struct io_uring_sqe *sqe)
225362306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
225462306a36Sopenharmony_ci{
225562306a36Sopenharmony_ci	struct io_submit_link *link = &ctx->submit_state.link;
225662306a36Sopenharmony_ci	int ret;
225762306a36Sopenharmony_ci
225862306a36Sopenharmony_ci	ret = io_init_req(ctx, req, sqe);
225962306a36Sopenharmony_ci	if (unlikely(ret))
226062306a36Sopenharmony_ci		return io_submit_fail_init(sqe, req, ret);
226162306a36Sopenharmony_ci
226262306a36Sopenharmony_ci	trace_io_uring_submit_req(req);
226362306a36Sopenharmony_ci
226462306a36Sopenharmony_ci	/*
226562306a36Sopenharmony_ci	 * If we already have a head request, queue this one for async
226662306a36Sopenharmony_ci	 * submittal once the head completes. If we don't have a head but
226762306a36Sopenharmony_ci	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
226862306a36Sopenharmony_ci	 * submitted sync once the chain is complete. If none of those
226962306a36Sopenharmony_ci	 * conditions are true (normal request), then just queue it.
227062306a36Sopenharmony_ci	 */
227162306a36Sopenharmony_ci	if (unlikely(link->head)) {
227262306a36Sopenharmony_ci		ret = io_req_prep_async(req);
227362306a36Sopenharmony_ci		if (unlikely(ret))
227462306a36Sopenharmony_ci			return io_submit_fail_init(sqe, req, ret);
227562306a36Sopenharmony_ci
227662306a36Sopenharmony_ci		trace_io_uring_link(req, link->head);
227762306a36Sopenharmony_ci		link->last->link = req;
227862306a36Sopenharmony_ci		link->last = req;
227962306a36Sopenharmony_ci
228062306a36Sopenharmony_ci		if (req->flags & IO_REQ_LINK_FLAGS)
228162306a36Sopenharmony_ci			return 0;
228262306a36Sopenharmony_ci		/* last request of the link, flush it */
228362306a36Sopenharmony_ci		req = link->head;
228462306a36Sopenharmony_ci		link->head = NULL;
228562306a36Sopenharmony_ci		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
228662306a36Sopenharmony_ci			goto fallback;
228762306a36Sopenharmony_ci
228862306a36Sopenharmony_ci	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
228962306a36Sopenharmony_ci					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
229062306a36Sopenharmony_ci		if (req->flags & IO_REQ_LINK_FLAGS) {
229162306a36Sopenharmony_ci			link->head = req;
229262306a36Sopenharmony_ci			link->last = req;
229362306a36Sopenharmony_ci		} else {
229462306a36Sopenharmony_cifallback:
229562306a36Sopenharmony_ci			io_queue_sqe_fallback(req);
229662306a36Sopenharmony_ci		}
229762306a36Sopenharmony_ci		return 0;
229862306a36Sopenharmony_ci	}
229962306a36Sopenharmony_ci
230062306a36Sopenharmony_ci	io_queue_sqe(req);
230162306a36Sopenharmony_ci	return 0;
230262306a36Sopenharmony_ci}
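
/*
 * Illustrative userspace sketch (assumes liburing; error handling
 * omitted): setting IOSQE_IO_LINK on every SQE except the last builds
 * the chain that the link->head/link->last handling above assembles.
 * The second request is only started once the first one completed
 * successfully.
 *
 *	struct io_uring_sqe *sqe;
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read(sqe, in_fd, buf, len, 0);
 *	io_uring_sqe_set_flags(sqe, IOSQE_IO_LINK);
 *
 *	sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_write(sqe, out_fd, buf, len, 0);	// runs after the read
 *
 *	io_uring_submit(&ring);
 */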
230362306a36Sopenharmony_ci
230462306a36Sopenharmony_ci/*
230562306a36Sopenharmony_ci * Batched submission is done, ensure local IO is flushed out.
230662306a36Sopenharmony_ci */
230762306a36Sopenharmony_cistatic void io_submit_state_end(struct io_ring_ctx *ctx)
230862306a36Sopenharmony_ci{
230962306a36Sopenharmony_ci	struct io_submit_state *state = &ctx->submit_state;
231062306a36Sopenharmony_ci
231162306a36Sopenharmony_ci	if (unlikely(state->link.head))
231262306a36Sopenharmony_ci		io_queue_sqe_fallback(state->link.head);
231362306a36Sopenharmony_ci	/* flush only after queuing links as they can generate completions */
231462306a36Sopenharmony_ci	io_submit_flush_completions(ctx);
231562306a36Sopenharmony_ci	if (state->plug_started)
231662306a36Sopenharmony_ci		blk_finish_plug(&state->plug);
231762306a36Sopenharmony_ci}
231862306a36Sopenharmony_ci
231962306a36Sopenharmony_ci/*
232062306a36Sopenharmony_ci * Start submission side cache.
232162306a36Sopenharmony_ci */
232262306a36Sopenharmony_cistatic void io_submit_state_start(struct io_submit_state *state,
232362306a36Sopenharmony_ci				  unsigned int max_ios)
232462306a36Sopenharmony_ci{
232562306a36Sopenharmony_ci	state->plug_started = false;
232662306a36Sopenharmony_ci	state->need_plug = max_ios > 2;
232762306a36Sopenharmony_ci	state->submit_nr = max_ios;
232862306a36Sopenharmony_ci	/* set only head, no need to init link_last in advance */
232962306a36Sopenharmony_ci	state->link.head = NULL;
233062306a36Sopenharmony_ci}
233162306a36Sopenharmony_ci
233262306a36Sopenharmony_cistatic void io_commit_sqring(struct io_ring_ctx *ctx)
233362306a36Sopenharmony_ci{
233462306a36Sopenharmony_ci	struct io_rings *rings = ctx->rings;
233562306a36Sopenharmony_ci
233662306a36Sopenharmony_ci	/*
233762306a36Sopenharmony_ci	 * Ensure any loads from the SQEs are done at this point,
233862306a36Sopenharmony_ci	 * since once we write the new head, the application could
233962306a36Sopenharmony_ci	 * write new data to them.
234062306a36Sopenharmony_ci	 */
234162306a36Sopenharmony_ci	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
234262306a36Sopenharmony_ci}
234362306a36Sopenharmony_ci
234462306a36Sopenharmony_ci/*
234562306a36Sopenharmony_ci * Fetch an sqe, if one is available. Note this returns a pointer to memory
234662306a36Sopenharmony_ci * that is mapped by userspace. This means that care needs to be taken to
234762306a36Sopenharmony_ci * ensure that reads are stable, as we cannot rely on userspace always
234862306a36Sopenharmony_ci * being a good citizen. If members of the sqe are validated and then later
234962306a36Sopenharmony_ci * used, it's important that those reads are done through READ_ONCE() to
235062306a36Sopenharmony_ci * prevent a re-load down the line.
235162306a36Sopenharmony_ci */
235262306a36Sopenharmony_cistatic bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
235362306a36Sopenharmony_ci{
235462306a36Sopenharmony_ci	unsigned mask = ctx->sq_entries - 1;
235562306a36Sopenharmony_ci	unsigned head = ctx->cached_sq_head++ & mask;
235662306a36Sopenharmony_ci
235762306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
235862306a36Sopenharmony_ci		head = READ_ONCE(ctx->sq_array[head]);
235962306a36Sopenharmony_ci		if (unlikely(head >= ctx->sq_entries)) {
236062306a36Sopenharmony_ci			/* drop invalid entries */
236162306a36Sopenharmony_ci			spin_lock(&ctx->completion_lock);
236262306a36Sopenharmony_ci			ctx->cq_extra--;
236362306a36Sopenharmony_ci			spin_unlock(&ctx->completion_lock);
236462306a36Sopenharmony_ci			WRITE_ONCE(ctx->rings->sq_dropped,
236562306a36Sopenharmony_ci				   READ_ONCE(ctx->rings->sq_dropped) + 1);
236662306a36Sopenharmony_ci			return false;
236762306a36Sopenharmony_ci		}
236862306a36Sopenharmony_ci	}
236962306a36Sopenharmony_ci
237062306a36Sopenharmony_ci	/*
237162306a36Sopenharmony_ci	 * The cached sq head (or cq tail) serves two purposes:
237262306a36Sopenharmony_ci	 *
237362306a36Sopenharmony_ci	 * 1) allows us to batch the cost of updating the user visible
237462306a36Sopenharmony_ci	 *    head.
237562306a36Sopenharmony_ci	 * 2) allows the kernel side to track the head on its own, even
237662306a36Sopenharmony_ci	 *    though the application is the one updating it.
237762306a36Sopenharmony_ci	 */
237862306a36Sopenharmony_ci
237962306a36Sopenharmony_ci	/* double index for 128-byte SQEs, twice as long */
238062306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_SQE128)
238162306a36Sopenharmony_ci		head <<= 1;
238262306a36Sopenharmony_ci	*sqe = &ctx->sq_sqes[head];
238362306a36Sopenharmony_ci	return true;
238462306a36Sopenharmony_ci}
238562306a36Sopenharmony_ci
238662306a36Sopenharmony_ciint io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
238762306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
238862306a36Sopenharmony_ci{
238962306a36Sopenharmony_ci	unsigned int entries = io_sqring_entries(ctx);
239062306a36Sopenharmony_ci	unsigned int left;
239162306a36Sopenharmony_ci	int ret;
239262306a36Sopenharmony_ci
239362306a36Sopenharmony_ci	if (unlikely(!entries))
239462306a36Sopenharmony_ci		return 0;
239562306a36Sopenharmony_ci	/* make sure SQ entry isn't read before tail */
239662306a36Sopenharmony_ci	ret = left = min(nr, entries);
239762306a36Sopenharmony_ci	io_get_task_refs(left);
239862306a36Sopenharmony_ci	io_submit_state_start(&ctx->submit_state, left);
239962306a36Sopenharmony_ci
240062306a36Sopenharmony_ci	do {
240162306a36Sopenharmony_ci		const struct io_uring_sqe *sqe;
240262306a36Sopenharmony_ci		struct io_kiocb *req;
240362306a36Sopenharmony_ci
240462306a36Sopenharmony_ci		if (unlikely(!io_alloc_req(ctx, &req)))
240562306a36Sopenharmony_ci			break;
240662306a36Sopenharmony_ci		if (unlikely(!io_get_sqe(ctx, &sqe))) {
240762306a36Sopenharmony_ci			io_req_add_to_cache(req, ctx);
240862306a36Sopenharmony_ci			break;
240962306a36Sopenharmony_ci		}
241062306a36Sopenharmony_ci
241162306a36Sopenharmony_ci		/*
241262306a36Sopenharmony_ci		 * Continue submitting even for sqe failure if the
241362306a36Sopenharmony_ci		 * ring was set up with IORING_SETUP_SUBMIT_ALL
241462306a36Sopenharmony_ci		 */
241562306a36Sopenharmony_ci		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
241662306a36Sopenharmony_ci		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
241762306a36Sopenharmony_ci			left--;
241862306a36Sopenharmony_ci			break;
241962306a36Sopenharmony_ci		}
242062306a36Sopenharmony_ci	} while (--left);
242162306a36Sopenharmony_ci
242262306a36Sopenharmony_ci	if (unlikely(left)) {
242362306a36Sopenharmony_ci		ret -= left;
242462306a36Sopenharmony_ci		/* try again if it submitted nothing and can't allocate a req */
242562306a36Sopenharmony_ci		if (!ret && io_req_cache_empty(ctx))
242662306a36Sopenharmony_ci			ret = -EAGAIN;
242762306a36Sopenharmony_ci		current->io_uring->cached_refs += left;
242862306a36Sopenharmony_ci	}
242962306a36Sopenharmony_ci
243062306a36Sopenharmony_ci	io_submit_state_end(ctx);
243162306a36Sopenharmony_ci	/* Commit SQ ring head once we've consumed and submitted all SQEs */
243262306a36Sopenharmony_ci	io_commit_sqring(ctx);
243362306a36Sopenharmony_ci	return ret;
243462306a36Sopenharmony_ci}
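
/*
 * Illustrative userspace sketch (assumes liburing; error handling
 * omitted): io_uring_submit() reaches this function via io_uring_enter().
 * The return value is the number of SQEs consumed; by default submission
 * stops at the first SQE that fails io_init_req(), while a ring created
 * with IORING_SETUP_SUBMIT_ALL keeps consuming the rest, as the loop
 * above shows.
 *
 *	struct io_uring ring;
 *	int i, submitted;
 *
 *	io_uring_queue_init(64, &ring, IORING_SETUP_SUBMIT_ALL);
 *	for (i = 0; i < 8; i++)
 *		io_uring_prep_nop(io_uring_get_sqe(&ring));
 *	submitted = io_uring_submit(&ring);	// 8 on success
 */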
243562306a36Sopenharmony_ci
243662306a36Sopenharmony_cistruct io_wait_queue {
243762306a36Sopenharmony_ci	struct wait_queue_entry wq;
243862306a36Sopenharmony_ci	struct io_ring_ctx *ctx;
243962306a36Sopenharmony_ci	unsigned cq_tail;
244062306a36Sopenharmony_ci	unsigned nr_timeouts;
244162306a36Sopenharmony_ci	ktime_t timeout;
244262306a36Sopenharmony_ci};
244362306a36Sopenharmony_ci
244462306a36Sopenharmony_cistatic inline bool io_has_work(struct io_ring_ctx *ctx)
244562306a36Sopenharmony_ci{
244662306a36Sopenharmony_ci	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
244762306a36Sopenharmony_ci	       !llist_empty(&ctx->work_llist);
244862306a36Sopenharmony_ci}
244962306a36Sopenharmony_ci
245062306a36Sopenharmony_cistatic inline bool io_should_wake(struct io_wait_queue *iowq)
245162306a36Sopenharmony_ci{
245262306a36Sopenharmony_ci	struct io_ring_ctx *ctx = iowq->ctx;
245362306a36Sopenharmony_ci	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
245462306a36Sopenharmony_ci
245562306a36Sopenharmony_ci	/*
245662306a36Sopenharmony_ci	 * Wake up if we have enough events, or if a timeout occurred since we
245762306a36Sopenharmony_ci	 * started waiting. For timeouts, we always want to return to userspace,
245862306a36Sopenharmony_ci	 * regardless of event count.
245962306a36Sopenharmony_ci	 */
246062306a36Sopenharmony_ci	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
246162306a36Sopenharmony_ci}
246262306a36Sopenharmony_ci
246362306a36Sopenharmony_cistatic int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
246462306a36Sopenharmony_ci			    int wake_flags, void *key)
246562306a36Sopenharmony_ci{
246662306a36Sopenharmony_ci	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);
246762306a36Sopenharmony_ci
246862306a36Sopenharmony_ci	/*
246962306a36Sopenharmony_ci	 * Cannot safely flush overflowed CQEs from here; ensure we wake up
247062306a36Sopenharmony_ci	 * the task, and the next invocation will do it.
247162306a36Sopenharmony_ci	 */
247262306a36Sopenharmony_ci	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
247362306a36Sopenharmony_ci		return autoremove_wake_function(curr, mode, wake_flags, key);
247462306a36Sopenharmony_ci	return -1;
247562306a36Sopenharmony_ci}
247662306a36Sopenharmony_ci
247762306a36Sopenharmony_ciint io_run_task_work_sig(struct io_ring_ctx *ctx)
247862306a36Sopenharmony_ci{
247962306a36Sopenharmony_ci	if (!llist_empty(&ctx->work_llist)) {
248062306a36Sopenharmony_ci		__set_current_state(TASK_RUNNING);
248162306a36Sopenharmony_ci		if (io_run_local_work(ctx, INT_MAX) > 0)
248262306a36Sopenharmony_ci			return 0;
248362306a36Sopenharmony_ci	}
248462306a36Sopenharmony_ci	if (io_run_task_work() > 0)
248562306a36Sopenharmony_ci		return 0;
248662306a36Sopenharmony_ci	if (task_sigpending(current))
248762306a36Sopenharmony_ci		return -EINTR;
248862306a36Sopenharmony_ci	return 0;
248962306a36Sopenharmony_ci}
249062306a36Sopenharmony_ci
249162306a36Sopenharmony_cistatic bool current_pending_io(void)
249262306a36Sopenharmony_ci{
249362306a36Sopenharmony_ci	struct io_uring_task *tctx = current->io_uring;
249462306a36Sopenharmony_ci
249562306a36Sopenharmony_ci	if (!tctx)
249662306a36Sopenharmony_ci		return false;
249762306a36Sopenharmony_ci	return percpu_counter_read_positive(&tctx->inflight);
249862306a36Sopenharmony_ci}
249962306a36Sopenharmony_ci
250062306a36Sopenharmony_ci/* when returns >0, the caller should retry */
250162306a36Sopenharmony_cistatic inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
250262306a36Sopenharmony_ci					  struct io_wait_queue *iowq)
250362306a36Sopenharmony_ci{
250462306a36Sopenharmony_ci	int ret;
250562306a36Sopenharmony_ci
250662306a36Sopenharmony_ci	if (unlikely(READ_ONCE(ctx->check_cq)))
250762306a36Sopenharmony_ci		return 1;
250862306a36Sopenharmony_ci	if (unlikely(!llist_empty(&ctx->work_llist)))
250962306a36Sopenharmony_ci		return 1;
251062306a36Sopenharmony_ci	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
251162306a36Sopenharmony_ci		return 1;
251262306a36Sopenharmony_ci	if (unlikely(task_sigpending(current)))
251362306a36Sopenharmony_ci		return -EINTR;
251462306a36Sopenharmony_ci	if (unlikely(io_should_wake(iowq)))
251562306a36Sopenharmony_ci		return 0;
251662306a36Sopenharmony_ci
251762306a36Sopenharmony_ci	/*
251862306a36Sopenharmony_ci	 * Mark us as being in io_wait if we have pending requests, so cpufreq
251962306a36Sopenharmony_ci	 * can take into account that the task is waiting for IO; this turns
252062306a36Sopenharmony_ci	 * out to be important for low QD IO.
252162306a36Sopenharmony_ci	 */
252262306a36Sopenharmony_ci	if (current_pending_io())
252362306a36Sopenharmony_ci		current->in_iowait = 1;
252462306a36Sopenharmony_ci	ret = 0;
252562306a36Sopenharmony_ci	if (iowq->timeout == KTIME_MAX)
252662306a36Sopenharmony_ci		schedule();
252762306a36Sopenharmony_ci	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
252862306a36Sopenharmony_ci		ret = -ETIME;
252962306a36Sopenharmony_ci	current->in_iowait = 0;
253062306a36Sopenharmony_ci	return ret;
253162306a36Sopenharmony_ci}
253262306a36Sopenharmony_ci
253362306a36Sopenharmony_ci/*
253462306a36Sopenharmony_ci * Wait until events become available, if we don't already have some. The
253562306a36Sopenharmony_ci * application must reap them itself, as they reside on the shared cq ring.
253662306a36Sopenharmony_ci */
253762306a36Sopenharmony_cistatic int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
253862306a36Sopenharmony_ci			  const sigset_t __user *sig, size_t sigsz,
253962306a36Sopenharmony_ci			  struct __kernel_timespec __user *uts)
254062306a36Sopenharmony_ci{
254162306a36Sopenharmony_ci	struct io_wait_queue iowq;
254262306a36Sopenharmony_ci	struct io_rings *rings = ctx->rings;
254362306a36Sopenharmony_ci	int ret;
254462306a36Sopenharmony_ci
254562306a36Sopenharmony_ci	if (!io_allowed_run_tw(ctx))
254662306a36Sopenharmony_ci		return -EEXIST;
254762306a36Sopenharmony_ci	if (!llist_empty(&ctx->work_llist))
254862306a36Sopenharmony_ci		io_run_local_work(ctx, min_events);
254962306a36Sopenharmony_ci	io_run_task_work();
255062306a36Sopenharmony_ci	io_cqring_overflow_flush(ctx);
255162306a36Sopenharmony_ci	/* if user messes with these they will just get an early return */
255262306a36Sopenharmony_ci	if (__io_cqring_events_user(ctx) >= min_events)
255362306a36Sopenharmony_ci		return 0;
255462306a36Sopenharmony_ci
255562306a36Sopenharmony_ci	if (sig) {
255662306a36Sopenharmony_ci#ifdef CONFIG_COMPAT
255762306a36Sopenharmony_ci		if (in_compat_syscall())
255862306a36Sopenharmony_ci			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
255962306a36Sopenharmony_ci						      sigsz);
256062306a36Sopenharmony_ci		else
256162306a36Sopenharmony_ci#endif
256262306a36Sopenharmony_ci			ret = set_user_sigmask(sig, sigsz);
256362306a36Sopenharmony_ci
256462306a36Sopenharmony_ci		if (ret)
256562306a36Sopenharmony_ci			return ret;
256662306a36Sopenharmony_ci	}
256762306a36Sopenharmony_ci
256862306a36Sopenharmony_ci	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
256962306a36Sopenharmony_ci	iowq.wq.private = current;
257062306a36Sopenharmony_ci	INIT_LIST_HEAD(&iowq.wq.entry);
257162306a36Sopenharmony_ci	iowq.ctx = ctx;
257262306a36Sopenharmony_ci	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
257362306a36Sopenharmony_ci	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
257462306a36Sopenharmony_ci	iowq.timeout = KTIME_MAX;
257562306a36Sopenharmony_ci
257662306a36Sopenharmony_ci	if (uts) {
257762306a36Sopenharmony_ci		struct timespec64 ts;
257862306a36Sopenharmony_ci
257962306a36Sopenharmony_ci		if (get_timespec64(&ts, uts))
258062306a36Sopenharmony_ci			return -EFAULT;
258162306a36Sopenharmony_ci		iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
258262306a36Sopenharmony_ci	}
258362306a36Sopenharmony_ci
258462306a36Sopenharmony_ci	trace_io_uring_cqring_wait(ctx, min_events);
258562306a36Sopenharmony_ci	do {
258662306a36Sopenharmony_ci		int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
258762306a36Sopenharmony_ci		unsigned long check_cq;
258862306a36Sopenharmony_ci
258962306a36Sopenharmony_ci		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
259062306a36Sopenharmony_ci			atomic_set(&ctx->cq_wait_nr, nr_wait);
259162306a36Sopenharmony_ci			set_current_state(TASK_INTERRUPTIBLE);
259262306a36Sopenharmony_ci		} else {
259362306a36Sopenharmony_ci			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
259462306a36Sopenharmony_ci							TASK_INTERRUPTIBLE);
259562306a36Sopenharmony_ci		}
259662306a36Sopenharmony_ci
259762306a36Sopenharmony_ci		ret = io_cqring_wait_schedule(ctx, &iowq);
259862306a36Sopenharmony_ci		__set_current_state(TASK_RUNNING);
259962306a36Sopenharmony_ci		atomic_set(&ctx->cq_wait_nr, 0);
260062306a36Sopenharmony_ci
260162306a36Sopenharmony_ci		/*
260262306a36Sopenharmony_ci		 * Run task_work after scheduling and before io_should_wake().
260362306a36Sopenharmony_ci		 * If we got woken because of task_work being processed, run it
260462306a36Sopenharmony_ci		 * now rather than let the caller do another wait loop.
260562306a36Sopenharmony_ci		 */
260662306a36Sopenharmony_ci		io_run_task_work();
260762306a36Sopenharmony_ci		if (!llist_empty(&ctx->work_llist))
260862306a36Sopenharmony_ci			io_run_local_work(ctx, nr_wait);
260962306a36Sopenharmony_ci
261062306a36Sopenharmony_ci		/*
261162306a36Sopenharmony_ci		 * Non-local task_work will be run on exit to userspace, but
261262306a36Sopenharmony_ci		 * if we're using DEFER_TASKRUN, then we could have waited
261362306a36Sopenharmony_ci		 * with a timeout for a number of requests. If the timeout
261462306a36Sopenharmony_ci		 * hits, we could have some requests ready to process. Ensure
261562306a36Sopenharmony_ci		 * this break is _after_ we have run task_work, to avoid
261662306a36Sopenharmony_ci		 * deferring running potentially pending requests until the
261762306a36Sopenharmony_ci		 * next time we wait for events.
261862306a36Sopenharmony_ci		 */
261962306a36Sopenharmony_ci		if (ret < 0)
262062306a36Sopenharmony_ci			break;
262162306a36Sopenharmony_ci
262262306a36Sopenharmony_ci		check_cq = READ_ONCE(ctx->check_cq);
262362306a36Sopenharmony_ci		if (unlikely(check_cq)) {
262462306a36Sopenharmony_ci			/* let the caller flush overflows, retry */
262562306a36Sopenharmony_ci			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
262662306a36Sopenharmony_ci				io_cqring_do_overflow_flush(ctx);
262762306a36Sopenharmony_ci			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
262862306a36Sopenharmony_ci				ret = -EBADR;
262962306a36Sopenharmony_ci				break;
263062306a36Sopenharmony_ci			}
263162306a36Sopenharmony_ci		}
263262306a36Sopenharmony_ci
263362306a36Sopenharmony_ci		if (io_should_wake(&iowq)) {
263462306a36Sopenharmony_ci			ret = 0;
263562306a36Sopenharmony_ci			break;
263662306a36Sopenharmony_ci		}
263762306a36Sopenharmony_ci		cond_resched();
263862306a36Sopenharmony_ci	} while (1);
263962306a36Sopenharmony_ci
264062306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
264162306a36Sopenharmony_ci		finish_wait(&ctx->cq_wait, &iowq.wq);
264262306a36Sopenharmony_ci	restore_saved_sigmask_unless(ret == -EINTR);
264362306a36Sopenharmony_ci
264462306a36Sopenharmony_ci	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
264562306a36Sopenharmony_ci}
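
/*
 * Illustrative userspace sketch (assumes liburing; error handling
 * omitted): waiting for a minimum number of completions, with an
 * optional timeout and signal mask, maps onto the min_events/uts/sig
 * arguments handled above.
 *
 *	struct io_uring_cqe *cqe;
 *	struct __kernel_timespec ts = { .tv_sec = 1 };
 *	int ret;
 *
 *	ret = io_uring_wait_cqes(&ring, &cqe, 8, &ts, NULL);
 *	if (ret == -ETIME)
 *		;	// timed out before 8 completions were posted
 */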
264662306a36Sopenharmony_ci
264762306a36Sopenharmony_civoid io_mem_free(void *ptr)
264862306a36Sopenharmony_ci{
264962306a36Sopenharmony_ci	if (!ptr)
265062306a36Sopenharmony_ci		return;
265162306a36Sopenharmony_ci
265262306a36Sopenharmony_ci	folio_put(virt_to_folio(ptr));
265362306a36Sopenharmony_ci}
265462306a36Sopenharmony_ci
265562306a36Sopenharmony_cistatic void io_pages_free(struct page ***pages, int npages)
265662306a36Sopenharmony_ci{
265762306a36Sopenharmony_ci	struct page **page_array;
265862306a36Sopenharmony_ci	int i;
265962306a36Sopenharmony_ci
266062306a36Sopenharmony_ci	if (!pages)
266162306a36Sopenharmony_ci		return;
266262306a36Sopenharmony_ci
266362306a36Sopenharmony_ci	page_array = *pages;
266462306a36Sopenharmony_ci	if (!page_array)
266562306a36Sopenharmony_ci		return;
266662306a36Sopenharmony_ci
266762306a36Sopenharmony_ci	for (i = 0; i < npages; i++)
266862306a36Sopenharmony_ci		unpin_user_page(page_array[i]);
266962306a36Sopenharmony_ci	kvfree(page_array);
267062306a36Sopenharmony_ci	*pages = NULL;
267162306a36Sopenharmony_ci}
267262306a36Sopenharmony_ci
267362306a36Sopenharmony_cistatic void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
267462306a36Sopenharmony_ci			    unsigned long uaddr, size_t size)
267562306a36Sopenharmony_ci{
267662306a36Sopenharmony_ci	struct page **page_array;
267762306a36Sopenharmony_ci	unsigned int nr_pages;
267862306a36Sopenharmony_ci	void *page_addr;
267962306a36Sopenharmony_ci	int ret, i, pinned;
268062306a36Sopenharmony_ci
268162306a36Sopenharmony_ci	*npages = 0;
268262306a36Sopenharmony_ci
268362306a36Sopenharmony_ci	if (uaddr & (PAGE_SIZE - 1) || !size)
268462306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
268562306a36Sopenharmony_ci
268662306a36Sopenharmony_ci	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
268762306a36Sopenharmony_ci	if (nr_pages > USHRT_MAX)
268862306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
268962306a36Sopenharmony_ci	page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
269062306a36Sopenharmony_ci	if (!page_array)
269162306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
269262306a36Sopenharmony_ci
269462306a36Sopenharmony_ci	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
269562306a36Sopenharmony_ci				     page_array);
269662306a36Sopenharmony_ci	if (pinned != nr_pages) {
269762306a36Sopenharmony_ci		ret = (pinned < 0) ? pinned : -EFAULT;
269862306a36Sopenharmony_ci		goto free_pages;
269962306a36Sopenharmony_ci	}
270062306a36Sopenharmony_ci
270162306a36Sopenharmony_ci	page_addr = page_address(page_array[0]);
270262306a36Sopenharmony_ci	for (i = 0; i < nr_pages; i++) {
270362306a36Sopenharmony_ci		ret = -EINVAL;
270462306a36Sopenharmony_ci
270562306a36Sopenharmony_ci		/*
270662306a36Sopenharmony_ci		 * Can't support mapping user allocated ring memory on 32-bit
270762306a36Sopenharmony_ci		 * archs where it could potentially reside in highmem. Fail
270862306a36Sopenharmony_ci		 * those with -EINVAL, just like we did on kernels that
270962306a36Sopenharmony_ci		 * didn't support this feature.
271062306a36Sopenharmony_ci		 */
271162306a36Sopenharmony_ci		if (PageHighMem(page_array[i]))
271262306a36Sopenharmony_ci			goto free_pages;
271362306a36Sopenharmony_ci
271462306a36Sopenharmony_ci		/*
271562306a36Sopenharmony_ci		 * No support for discontig pages for now; this should either be
271662306a36Sopenharmony_ci		 * a single normal page or a huge page. Later on we can add
271762306a36Sopenharmony_ci		 * support for remapping discontig pages; for now we will
271862306a36Sopenharmony_ci		 * just fail them with -EINVAL.
271962306a36Sopenharmony_ci		 */
272062306a36Sopenharmony_ci		if (page_address(page_array[i]) != page_addr)
272162306a36Sopenharmony_ci			goto free_pages;
272262306a36Sopenharmony_ci		page_addr += PAGE_SIZE;
272362306a36Sopenharmony_ci	}
272462306a36Sopenharmony_ci
272562306a36Sopenharmony_ci	*pages = page_array;
272662306a36Sopenharmony_ci	*npages = nr_pages;
272762306a36Sopenharmony_ci	return page_to_virt(page_array[0]);
272862306a36Sopenharmony_ci
272962306a36Sopenharmony_cifree_pages:
273062306a36Sopenharmony_ci	io_pages_free(&page_array, pinned > 0 ? pinned : 0);
273162306a36Sopenharmony_ci	return ERR_PTR(ret);
273262306a36Sopenharmony_ci}
273362306a36Sopenharmony_ci
273462306a36Sopenharmony_cistatic void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
273562306a36Sopenharmony_ci			  size_t size)
273662306a36Sopenharmony_ci{
273762306a36Sopenharmony_ci	return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
273862306a36Sopenharmony_ci				size);
273962306a36Sopenharmony_ci}
274062306a36Sopenharmony_ci
274162306a36Sopenharmony_cistatic void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
274262306a36Sopenharmony_ci			 size_t size)
274362306a36Sopenharmony_ci{
274462306a36Sopenharmony_ci	return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
274562306a36Sopenharmony_ci				size);
274662306a36Sopenharmony_ci}
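
/*
 * Illustrative userspace sketch (raw syscalls; assumes the user_addr
 * fields of the IORING_SETUP_NO_MMAP uAPI; error handling omitted): the
 * helpers above pin application-provided ring and SQE memory. Per the
 * checks in __io_uaddr_map() it must be page aligned and physically
 * contiguous, so a single normal page or a huge page mapping is the
 * straightforward way to back larger rings.
 *
 *	struct io_uring_params p = { .flags = IORING_SETUP_NO_MMAP };
 *	size_t sz = 2 * 1024 * 1024;
 *	void *cq_mem = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *	void *sq_mem = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *
 *	p.cq_off.user_addr = (unsigned long long)cq_mem;
 *	p.sq_off.user_addr = (unsigned long long)sq_mem;
 *	int ring_fd = syscall(__NR_io_uring_setup, 64, &p);
 */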
274762306a36Sopenharmony_ci
274862306a36Sopenharmony_cistatic void io_rings_free(struct io_ring_ctx *ctx)
274962306a36Sopenharmony_ci{
275062306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
275162306a36Sopenharmony_ci		io_mem_free(ctx->rings);
275262306a36Sopenharmony_ci		io_mem_free(ctx->sq_sqes);
275362306a36Sopenharmony_ci		ctx->rings = NULL;
275462306a36Sopenharmony_ci		ctx->sq_sqes = NULL;
275562306a36Sopenharmony_ci	} else {
275662306a36Sopenharmony_ci		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
275762306a36Sopenharmony_ci		ctx->n_ring_pages = 0;
275862306a36Sopenharmony_ci		io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
275962306a36Sopenharmony_ci		ctx->n_sqe_pages = 0;
276062306a36Sopenharmony_ci	}
276162306a36Sopenharmony_ci}
276262306a36Sopenharmony_ci
276362306a36Sopenharmony_civoid *io_mem_alloc(size_t size)
276462306a36Sopenharmony_ci{
276562306a36Sopenharmony_ci	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
276662306a36Sopenharmony_ci	void *ret;
276762306a36Sopenharmony_ci
276862306a36Sopenharmony_ci	ret = (void *) __get_free_pages(gfp, get_order(size));
276962306a36Sopenharmony_ci	if (ret)
277062306a36Sopenharmony_ci		return ret;
277162306a36Sopenharmony_ci	return ERR_PTR(-ENOMEM);
277262306a36Sopenharmony_ci}
277362306a36Sopenharmony_ci
277462306a36Sopenharmony_cistatic unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
277562306a36Sopenharmony_ci				unsigned int cq_entries, size_t *sq_offset)
277662306a36Sopenharmony_ci{
277762306a36Sopenharmony_ci	struct io_rings *rings;
277862306a36Sopenharmony_ci	size_t off, sq_array_size;
277962306a36Sopenharmony_ci
278062306a36Sopenharmony_ci	off = struct_size(rings, cqes, cq_entries);
278162306a36Sopenharmony_ci	if (off == SIZE_MAX)
278262306a36Sopenharmony_ci		return SIZE_MAX;
278362306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_CQE32) {
278462306a36Sopenharmony_ci		if (check_shl_overflow(off, 1, &off))
278562306a36Sopenharmony_ci			return SIZE_MAX;
278662306a36Sopenharmony_ci	}
278762306a36Sopenharmony_ci
278862306a36Sopenharmony_ci#ifdef CONFIG_SMP
278962306a36Sopenharmony_ci	off = ALIGN(off, SMP_CACHE_BYTES);
279062306a36Sopenharmony_ci	if (off == 0)
279162306a36Sopenharmony_ci		return SIZE_MAX;
279262306a36Sopenharmony_ci#endif
279362306a36Sopenharmony_ci
279462306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
279562306a36Sopenharmony_ci		if (sq_offset)
279662306a36Sopenharmony_ci			*sq_offset = SIZE_MAX;
279762306a36Sopenharmony_ci		return off;
279862306a36Sopenharmony_ci	}
279962306a36Sopenharmony_ci
280062306a36Sopenharmony_ci	if (sq_offset)
280162306a36Sopenharmony_ci		*sq_offset = off;
280262306a36Sopenharmony_ci
280362306a36Sopenharmony_ci	sq_array_size = array_size(sizeof(u32), sq_entries);
280462306a36Sopenharmony_ci	if (sq_array_size == SIZE_MAX)
280562306a36Sopenharmony_ci		return SIZE_MAX;
280662306a36Sopenharmony_ci
280762306a36Sopenharmony_ci	if (check_add_overflow(off, sq_array_size, &off))
280862306a36Sopenharmony_ci		return SIZE_MAX;
280962306a36Sopenharmony_ci
281062306a36Sopenharmony_ci	return off;
281162306a36Sopenharmony_ci}
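
/*
 * Worked example of the layout rings_size() computes, under assumed
 * defaults (64-bit SMP build, 16 byte CQEs, neither IORING_SETUP_CQE32
 * nor IORING_SETUP_NO_SQARRAY): for cq_entries = 8 and sq_entries = 4
 * the region is
 *
 *	struct_size(rings, cqes, 8)	the CQ ring header plus 8 CQEs,
 *	rounded up to SMP_CACHE_BYTES	*sq_offset points here,
 *	+ 4 * sizeof(u32)		the trailing SQ indirection array.
 *
 * With IORING_SETUP_CQE32 the struct_size() result is doubled (CQEs are
 * twice as big), and with IORING_SETUP_NO_SQARRAY the trailing array is
 * omitted and *sq_offset is set to SIZE_MAX.
 */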
281262306a36Sopenharmony_ci
281362306a36Sopenharmony_cistatic int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
281462306a36Sopenharmony_ci			       unsigned int eventfd_async)
281562306a36Sopenharmony_ci{
281662306a36Sopenharmony_ci	struct io_ev_fd *ev_fd;
281762306a36Sopenharmony_ci	__s32 __user *fds = arg;
281862306a36Sopenharmony_ci	int fd;
281962306a36Sopenharmony_ci
282062306a36Sopenharmony_ci	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
282162306a36Sopenharmony_ci					lockdep_is_held(&ctx->uring_lock));
282262306a36Sopenharmony_ci	if (ev_fd)
282362306a36Sopenharmony_ci		return -EBUSY;
282462306a36Sopenharmony_ci
282562306a36Sopenharmony_ci	if (copy_from_user(&fd, fds, sizeof(*fds)))
282662306a36Sopenharmony_ci		return -EFAULT;
282762306a36Sopenharmony_ci
282862306a36Sopenharmony_ci	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
282962306a36Sopenharmony_ci	if (!ev_fd)
283062306a36Sopenharmony_ci		return -ENOMEM;
283162306a36Sopenharmony_ci
283262306a36Sopenharmony_ci	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
283362306a36Sopenharmony_ci	if (IS_ERR(ev_fd->cq_ev_fd)) {
283462306a36Sopenharmony_ci		int ret = PTR_ERR(ev_fd->cq_ev_fd);
283562306a36Sopenharmony_ci		kfree(ev_fd);
283662306a36Sopenharmony_ci		return ret;
283762306a36Sopenharmony_ci	}
283862306a36Sopenharmony_ci
283962306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
284062306a36Sopenharmony_ci	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
284162306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
284262306a36Sopenharmony_ci
284362306a36Sopenharmony_ci	ev_fd->eventfd_async = eventfd_async;
284462306a36Sopenharmony_ci	ctx->has_evfd = true;
284562306a36Sopenharmony_ci	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
284662306a36Sopenharmony_ci	atomic_set(&ev_fd->refs, 1);
284762306a36Sopenharmony_ci	atomic_set(&ev_fd->ops, 0);
284862306a36Sopenharmony_ci	return 0;
284962306a36Sopenharmony_ci}
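
/*
 * Illustrative userspace sketch (assumes liburing and <sys/eventfd.h>;
 * error handling omitted): an eventfd registered here gets signalled
 * when completions are posted, so ring readiness can be fed into an
 * existing event loop.
 *
 *	int efd = eventfd(0, EFD_CLOEXEC);
 *
 *	io_uring_register_eventfd(&ring, efd);
 *	// or io_uring_register_eventfd_async(&ring, efd) to only be
 *	// notified for requests that completed out of line
 *
 *	uint64_t n;
 *	read(efd, &n, sizeof(n));	// wait, then reap CQEs as usual
 */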
285062306a36Sopenharmony_ci
285162306a36Sopenharmony_cistatic int io_eventfd_unregister(struct io_ring_ctx *ctx)
285262306a36Sopenharmony_ci{
285362306a36Sopenharmony_ci	struct io_ev_fd *ev_fd;
285462306a36Sopenharmony_ci
285562306a36Sopenharmony_ci	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
285662306a36Sopenharmony_ci					lockdep_is_held(&ctx->uring_lock));
285762306a36Sopenharmony_ci	if (ev_fd) {
285862306a36Sopenharmony_ci		ctx->has_evfd = false;
285962306a36Sopenharmony_ci		rcu_assign_pointer(ctx->io_ev_fd, NULL);
286062306a36Sopenharmony_ci		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
286162306a36Sopenharmony_ci			call_rcu(&ev_fd->rcu, io_eventfd_ops);
286262306a36Sopenharmony_ci		return 0;
286362306a36Sopenharmony_ci	}
286462306a36Sopenharmony_ci
286562306a36Sopenharmony_ci	return -ENXIO;
286662306a36Sopenharmony_ci}
286762306a36Sopenharmony_ci
286862306a36Sopenharmony_cistatic void io_req_caches_free(struct io_ring_ctx *ctx)
286962306a36Sopenharmony_ci{
287062306a36Sopenharmony_ci	struct io_kiocb *req;
287162306a36Sopenharmony_ci	int nr = 0;
287262306a36Sopenharmony_ci
287362306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
287462306a36Sopenharmony_ci	io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
287562306a36Sopenharmony_ci
287662306a36Sopenharmony_ci	while (!io_req_cache_empty(ctx)) {
287762306a36Sopenharmony_ci		req = io_extract_req(ctx);
287862306a36Sopenharmony_ci		kmem_cache_free(req_cachep, req);
287962306a36Sopenharmony_ci		nr++;
288062306a36Sopenharmony_ci	}
288162306a36Sopenharmony_ci	if (nr)
288262306a36Sopenharmony_ci		percpu_ref_put_many(&ctx->refs, nr);
288362306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
288462306a36Sopenharmony_ci}
288562306a36Sopenharmony_ci
288662306a36Sopenharmony_cistatic void io_rsrc_node_cache_free(struct io_cache_entry *entry)
288762306a36Sopenharmony_ci{
288862306a36Sopenharmony_ci	kfree(container_of(entry, struct io_rsrc_node, cache));
288962306a36Sopenharmony_ci}
289062306a36Sopenharmony_ci
289162306a36Sopenharmony_cistatic __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
289262306a36Sopenharmony_ci{
289362306a36Sopenharmony_ci	io_sq_thread_finish(ctx);
289462306a36Sopenharmony_ci	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
289562306a36Sopenharmony_ci	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
289662306a36Sopenharmony_ci		return;
289762306a36Sopenharmony_ci
289862306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
289962306a36Sopenharmony_ci	if (ctx->buf_data)
290062306a36Sopenharmony_ci		__io_sqe_buffers_unregister(ctx);
290162306a36Sopenharmony_ci	if (ctx->file_data)
290262306a36Sopenharmony_ci		__io_sqe_files_unregister(ctx);
290362306a36Sopenharmony_ci	io_cqring_overflow_kill(ctx);
290462306a36Sopenharmony_ci	io_eventfd_unregister(ctx);
290562306a36Sopenharmony_ci	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
290662306a36Sopenharmony_ci	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
290762306a36Sopenharmony_ci	io_destroy_buffers(ctx);
290862306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
290962306a36Sopenharmony_ci	if (ctx->sq_creds)
291062306a36Sopenharmony_ci		put_cred(ctx->sq_creds);
291162306a36Sopenharmony_ci	if (ctx->submitter_task)
291262306a36Sopenharmony_ci		put_task_struct(ctx->submitter_task);
291362306a36Sopenharmony_ci
291462306a36Sopenharmony_ci	/* there are no registered resources left, nobody uses it */
291562306a36Sopenharmony_ci	if (ctx->rsrc_node)
291662306a36Sopenharmony_ci		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
291762306a36Sopenharmony_ci
291862306a36Sopenharmony_ci	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
291962306a36Sopenharmony_ci	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
292062306a36Sopenharmony_ci
292162306a36Sopenharmony_ci	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
292262306a36Sopenharmony_ci	if (ctx->mm_account) {
292362306a36Sopenharmony_ci		mmdrop(ctx->mm_account);
292462306a36Sopenharmony_ci		ctx->mm_account = NULL;
292562306a36Sopenharmony_ci	}
292662306a36Sopenharmony_ci	io_rings_free(ctx);
292762306a36Sopenharmony_ci	io_kbuf_mmap_list_free(ctx);
292862306a36Sopenharmony_ci
292962306a36Sopenharmony_ci	percpu_ref_exit(&ctx->refs);
293062306a36Sopenharmony_ci	free_uid(ctx->user);
293162306a36Sopenharmony_ci	io_req_caches_free(ctx);
293262306a36Sopenharmony_ci	if (ctx->hash_map)
293362306a36Sopenharmony_ci		io_wq_put_hash(ctx->hash_map);
293462306a36Sopenharmony_ci	kfree(ctx->cancel_table.hbs);
293562306a36Sopenharmony_ci	kfree(ctx->cancel_table_locked.hbs);
293662306a36Sopenharmony_ci	kfree(ctx->io_bl);
293762306a36Sopenharmony_ci	xa_destroy(&ctx->io_bl_xa);
293862306a36Sopenharmony_ci	kfree(ctx);
293962306a36Sopenharmony_ci}
294062306a36Sopenharmony_ci
294162306a36Sopenharmony_cistatic __cold void io_activate_pollwq_cb(struct callback_head *cb)
294262306a36Sopenharmony_ci{
294362306a36Sopenharmony_ci	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
294462306a36Sopenharmony_ci					       poll_wq_task_work);
294562306a36Sopenharmony_ci
294662306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
294762306a36Sopenharmony_ci	ctx->poll_activated = true;
294862306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
294962306a36Sopenharmony_ci
295062306a36Sopenharmony_ci	/*
295162306a36Sopenharmony_ci	 * Wake ups for some events between start of polling and activation
295262306a36Sopenharmony_ci	 * might've been lost due to loose synchronisation.
295362306a36Sopenharmony_ci	 */
295462306a36Sopenharmony_ci	wake_up_all(&ctx->poll_wq);
295562306a36Sopenharmony_ci	percpu_ref_put(&ctx->refs);
295662306a36Sopenharmony_ci}
295762306a36Sopenharmony_ci
295862306a36Sopenharmony_cistatic __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
295962306a36Sopenharmony_ci{
296062306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
296162306a36Sopenharmony_ci	/* already activated or in progress */
296262306a36Sopenharmony_ci	if (ctx->poll_activated || ctx->poll_wq_task_work.func)
296362306a36Sopenharmony_ci		goto out;
296462306a36Sopenharmony_ci	if (WARN_ON_ONCE(!ctx->task_complete))
296562306a36Sopenharmony_ci		goto out;
296662306a36Sopenharmony_ci	if (!ctx->submitter_task)
296762306a36Sopenharmony_ci		goto out;
296862306a36Sopenharmony_ci	/*
296962306a36Sopenharmony_ci	 * With ->submitter_task, only the submitter task completes requests, so
297062306a36Sopenharmony_ci	 * we only need to sync with it, which is done by injecting a tw.
297162306a36Sopenharmony_ci	 */
297262306a36Sopenharmony_ci	init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
297362306a36Sopenharmony_ci	percpu_ref_get(&ctx->refs);
297462306a36Sopenharmony_ci	if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
297562306a36Sopenharmony_ci		percpu_ref_put(&ctx->refs);
297662306a36Sopenharmony_ciout:
297762306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
297862306a36Sopenharmony_ci}
297962306a36Sopenharmony_ci
298062306a36Sopenharmony_cistatic __poll_t io_uring_poll(struct file *file, poll_table *wait)
298162306a36Sopenharmony_ci{
298262306a36Sopenharmony_ci	struct io_ring_ctx *ctx = file->private_data;
298362306a36Sopenharmony_ci	__poll_t mask = 0;
298462306a36Sopenharmony_ci
298562306a36Sopenharmony_ci	if (unlikely(!ctx->poll_activated))
298662306a36Sopenharmony_ci		io_activate_pollwq(ctx);
298762306a36Sopenharmony_ci
298862306a36Sopenharmony_ci	poll_wait(file, &ctx->poll_wq, wait);
298962306a36Sopenharmony_ci	/*
299062306a36Sopenharmony_ci	 * synchronizes with barrier from wq_has_sleeper call in
299162306a36Sopenharmony_ci	 * io_commit_cqring
299262306a36Sopenharmony_ci	 */
299362306a36Sopenharmony_ci	smp_rmb();
299462306a36Sopenharmony_ci	if (!io_sqring_full(ctx))
299562306a36Sopenharmony_ci		mask |= EPOLLOUT | EPOLLWRNORM;
299662306a36Sopenharmony_ci
299762306a36Sopenharmony_ci	/*
299862306a36Sopenharmony_ci	 * Don't flush cqring overflow list here, just do a simple check.
299962306a36Sopenharmony_ci	 * Otherwise there could possibly be an ABBA deadlock:
300062306a36Sopenharmony_ci	 *      CPU0                    CPU1
300162306a36Sopenharmony_ci	 *      ----                    ----
300262306a36Sopenharmony_ci	 * lock(&ctx->uring_lock);
300362306a36Sopenharmony_ci	 *                              lock(&ep->mtx);
300462306a36Sopenharmony_ci	 *                              lock(&ctx->uring_lock);
300562306a36Sopenharmony_ci	 * lock(&ep->mtx);
300662306a36Sopenharmony_ci	 *
300762306a36Sopenharmony_ci	 * Users may get EPOLLIN while seeing nothing in the cqring; this
300862306a36Sopenharmony_ci	 * pushes them to do the flush.
300962306a36Sopenharmony_ci	 */
301062306a36Sopenharmony_ci
301162306a36Sopenharmony_ci	if (__io_cqring_events_user(ctx) || io_has_work(ctx))
301262306a36Sopenharmony_ci		mask |= EPOLLIN | EPOLLRDNORM;
301362306a36Sopenharmony_ci
301462306a36Sopenharmony_ci	return mask;
301562306a36Sopenharmony_ci}
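
/*
 * Illustrative userspace sketch (assumes liburing's ring_fd field and
 * <sys/epoll.h>; error handling omitted): the ring fd itself can be
 * polled, which lands in io_uring_poll() above. EPOLLIN means CQEs or
 * pending work may be available, EPOLLOUT means there is SQ space; per
 * the comment above, seeing EPOLLIN with an empty CQ is the hint to
 * enter the kernel so the overflow list gets flushed.
 *
 *	struct epoll_event ev = { .events = EPOLLIN };
 *
 *	ev.data.fd = ring.ring_fd;
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, ring.ring_fd, &ev);
 *	// on EPOLLIN: reap CQEs, e.g. with io_uring_peek_cqe()
 */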
301662306a36Sopenharmony_ci
301762306a36Sopenharmony_cistatic int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
301862306a36Sopenharmony_ci{
301962306a36Sopenharmony_ci	const struct cred *creds;
302062306a36Sopenharmony_ci
302162306a36Sopenharmony_ci	creds = xa_erase(&ctx->personalities, id);
302262306a36Sopenharmony_ci	if (creds) {
302362306a36Sopenharmony_ci		put_cred(creds);
302462306a36Sopenharmony_ci		return 0;
302562306a36Sopenharmony_ci	}
302662306a36Sopenharmony_ci
302762306a36Sopenharmony_ci	return -EINVAL;
302862306a36Sopenharmony_ci}
302962306a36Sopenharmony_ci
303062306a36Sopenharmony_cistruct io_tctx_exit {
303162306a36Sopenharmony_ci	struct callback_head		task_work;
303262306a36Sopenharmony_ci	struct completion		completion;
303362306a36Sopenharmony_ci	struct io_ring_ctx		*ctx;
303462306a36Sopenharmony_ci};
303562306a36Sopenharmony_ci
303662306a36Sopenharmony_cistatic __cold void io_tctx_exit_cb(struct callback_head *cb)
303762306a36Sopenharmony_ci{
303862306a36Sopenharmony_ci	struct io_uring_task *tctx = current->io_uring;
303962306a36Sopenharmony_ci	struct io_tctx_exit *work;
304062306a36Sopenharmony_ci
304162306a36Sopenharmony_ci	work = container_of(cb, struct io_tctx_exit, task_work);
304262306a36Sopenharmony_ci	/*
304362306a36Sopenharmony_ci	 * When @in_cancel, we're in cancellation and it's racy to remove the
304462306a36Sopenharmony_ci	 * node. It'll be removed by the end of cancellation, just ignore it.
304562306a36Sopenharmony_ci	 * tctx can be NULL if the queueing of this task_work raced with
304662306a36Sopenharmony_ci	 * work cancellation off the exec path.
304762306a36Sopenharmony_ci	 */
304862306a36Sopenharmony_ci	if (tctx && !atomic_read(&tctx->in_cancel))
304962306a36Sopenharmony_ci		io_uring_del_tctx_node((unsigned long)work->ctx);
305062306a36Sopenharmony_ci	complete(&work->completion);
305162306a36Sopenharmony_ci}
305262306a36Sopenharmony_ci
305362306a36Sopenharmony_cistatic __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
305462306a36Sopenharmony_ci{
305562306a36Sopenharmony_ci	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
305662306a36Sopenharmony_ci
305762306a36Sopenharmony_ci	return req->ctx == data;
305862306a36Sopenharmony_ci}
305962306a36Sopenharmony_ci
306062306a36Sopenharmony_cistatic __cold void io_ring_exit_work(struct work_struct *work)
306162306a36Sopenharmony_ci{
306262306a36Sopenharmony_ci	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
306362306a36Sopenharmony_ci	unsigned long timeout = jiffies + HZ * 60 * 5;
306462306a36Sopenharmony_ci	unsigned long interval = HZ / 20;
306562306a36Sopenharmony_ci	struct io_tctx_exit exit;
306662306a36Sopenharmony_ci	struct io_tctx_node *node;
306762306a36Sopenharmony_ci	int ret;
306862306a36Sopenharmony_ci
306962306a36Sopenharmony_ci	/*
307062306a36Sopenharmony_ci	 * If we're doing polled IO and end up having requests being
307162306a36Sopenharmony_ci	 * submitted async (out-of-line), then completions can come in while
307262306a36Sopenharmony_ci	 * we're waiting for refs to drop. We need to reap these manually,
307362306a36Sopenharmony_ci	 * as nobody else will be looking for them.
307462306a36Sopenharmony_ci	 */
307562306a36Sopenharmony_ci	do {
307662306a36Sopenharmony_ci		if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
307762306a36Sopenharmony_ci			mutex_lock(&ctx->uring_lock);
307862306a36Sopenharmony_ci			io_cqring_overflow_kill(ctx);
307962306a36Sopenharmony_ci			mutex_unlock(&ctx->uring_lock);
308062306a36Sopenharmony_ci		}
308162306a36Sopenharmony_ci
308262306a36Sopenharmony_ci		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
308362306a36Sopenharmony_ci			io_move_task_work_from_local(ctx);
308462306a36Sopenharmony_ci
308562306a36Sopenharmony_ci		while (io_uring_try_cancel_requests(ctx, NULL, true))
308662306a36Sopenharmony_ci			cond_resched();
308762306a36Sopenharmony_ci
308862306a36Sopenharmony_ci		if (ctx->sq_data) {
308962306a36Sopenharmony_ci			struct io_sq_data *sqd = ctx->sq_data;
309062306a36Sopenharmony_ci			struct task_struct *tsk;
309162306a36Sopenharmony_ci
309262306a36Sopenharmony_ci			io_sq_thread_park(sqd);
309362306a36Sopenharmony_ci			tsk = sqd->thread;
309462306a36Sopenharmony_ci			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
309562306a36Sopenharmony_ci				io_wq_cancel_cb(tsk->io_uring->io_wq,
309662306a36Sopenharmony_ci						io_cancel_ctx_cb, ctx, true);
309762306a36Sopenharmony_ci			io_sq_thread_unpark(sqd);
309862306a36Sopenharmony_ci		}
309962306a36Sopenharmony_ci
310062306a36Sopenharmony_ci		io_req_caches_free(ctx);
310162306a36Sopenharmony_ci
310262306a36Sopenharmony_ci		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
310362306a36Sopenharmony_ci			/* there is little hope left, don't run it too often */
310462306a36Sopenharmony_ci			interval = HZ * 60;
310562306a36Sopenharmony_ci		}
310662306a36Sopenharmony_ci		/*
310762306a36Sopenharmony_ci		 * This is really an uninterruptible wait, as it has to run to
310862306a36Sopenharmony_ci		 * completion. But it's also run from a kworker, which doesn't
310962306a36Sopenharmony_ci		 * take signals, so it's fine to make it interruptible. This
311062306a36Sopenharmony_ci		 * avoids scenarios where we knowingly can wait much longer
311162306a36Sopenharmony_ci		 * on completions, for example if someone does a SIGSTOP on
311262306a36Sopenharmony_ci		 * a task that needs to finish task_work to make this loop
311362306a36Sopenharmony_ci		 * complete. That's a synthetic situation that should not
311462306a36Sopenharmony_ci		 * cause a stuck task backtrace, and hence a potential panic
311562306a36Sopenharmony_ci		 * on stuck tasks if that is enabled.
311662306a36Sopenharmony_ci		 */
311762306a36Sopenharmony_ci	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));
311862306a36Sopenharmony_ci
311962306a36Sopenharmony_ci	init_completion(&exit.completion);
312062306a36Sopenharmony_ci	init_task_work(&exit.task_work, io_tctx_exit_cb);
312162306a36Sopenharmony_ci	exit.ctx = ctx;
312262306a36Sopenharmony_ci
312362306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
312462306a36Sopenharmony_ci	while (!list_empty(&ctx->tctx_list)) {
312562306a36Sopenharmony_ci		WARN_ON_ONCE(time_after(jiffies, timeout));
312662306a36Sopenharmony_ci
312762306a36Sopenharmony_ci		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
312862306a36Sopenharmony_ci					ctx_node);
312962306a36Sopenharmony_ci		/* don't spin on a single task if cancellation failed */
313062306a36Sopenharmony_ci		list_rotate_left(&ctx->tctx_list);
313162306a36Sopenharmony_ci		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
313262306a36Sopenharmony_ci		if (WARN_ON_ONCE(ret))
313362306a36Sopenharmony_ci			continue;
313462306a36Sopenharmony_ci
313562306a36Sopenharmony_ci		mutex_unlock(&ctx->uring_lock);
313662306a36Sopenharmony_ci		/*
313762306a36Sopenharmony_ci		 * See comment above for
313862306a36Sopenharmony_ci		 * wait_for_completion_interruptible_timeout() on why this
313962306a36Sopenharmony_ci		 * wait is marked as interruptible.
314062306a36Sopenharmony_ci		 */
314162306a36Sopenharmony_ci		wait_for_completion_interruptible(&exit.completion);
314262306a36Sopenharmony_ci		mutex_lock(&ctx->uring_lock);
314362306a36Sopenharmony_ci	}
314462306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
314562306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
314662306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
314762306a36Sopenharmony_ci
314862306a36Sopenharmony_ci	/* pairs with RCU read section in io_req_local_work_add() */
314962306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
315062306a36Sopenharmony_ci		synchronize_rcu();
315162306a36Sopenharmony_ci
315262306a36Sopenharmony_ci	io_ring_ctx_free(ctx);
315362306a36Sopenharmony_ci}
315462306a36Sopenharmony_ci
315562306a36Sopenharmony_cistatic __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
315662306a36Sopenharmony_ci{
315762306a36Sopenharmony_ci	unsigned long index;
315862306a36Sopenharmony_ci	struct creds *creds;
315962306a36Sopenharmony_ci
316062306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
316162306a36Sopenharmony_ci	percpu_ref_kill(&ctx->refs);
316262306a36Sopenharmony_ci	xa_for_each(&ctx->personalities, index, creds)
316362306a36Sopenharmony_ci		io_unregister_personality(ctx, index);
316462306a36Sopenharmony_ci	if (ctx->rings)
316562306a36Sopenharmony_ci		io_poll_remove_all(ctx, NULL, true);
316662306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
316762306a36Sopenharmony_ci
316862306a36Sopenharmony_ci	/*
316962306a36Sopenharmony_ci	 * If we failed setting up the ctx, we might not have any rings
317062306a36Sopenharmony_ci	 * and therefore did not submit any requests
317162306a36Sopenharmony_ci	 */
317262306a36Sopenharmony_ci	if (ctx->rings)
317362306a36Sopenharmony_ci		io_kill_timeouts(ctx, NULL, true);
317462306a36Sopenharmony_ci
317562306a36Sopenharmony_ci	flush_delayed_work(&ctx->fallback_work);
317662306a36Sopenharmony_ci
317762306a36Sopenharmony_ci	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
317862306a36Sopenharmony_ci	/*
317962306a36Sopenharmony_ci	 * Use system_unbound_wq to avoid spawning tons of event kworkers
318062306a36Sopenharmony_ci	 * if we're exiting a ton of rings at the same time. It just adds
318162306a36Sopenharmony_ci	 * noise and overhead, there's no discernible change in runtime
318262306a36Sopenharmony_ci	 * over using system_wq.
318362306a36Sopenharmony_ci	 */
318462306a36Sopenharmony_ci	queue_work(system_unbound_wq, &ctx->exit_work);
318562306a36Sopenharmony_ci}
318662306a36Sopenharmony_ci
318762306a36Sopenharmony_cistatic int io_uring_release(struct inode *inode, struct file *file)
318862306a36Sopenharmony_ci{
318962306a36Sopenharmony_ci	struct io_ring_ctx *ctx = file->private_data;
319062306a36Sopenharmony_ci
319162306a36Sopenharmony_ci	file->private_data = NULL;
319262306a36Sopenharmony_ci	io_ring_ctx_wait_and_kill(ctx);
319362306a36Sopenharmony_ci	return 0;
319462306a36Sopenharmony_ci}
319562306a36Sopenharmony_ci
319662306a36Sopenharmony_cistruct io_task_cancel {
319762306a36Sopenharmony_ci	struct task_struct *task;
319862306a36Sopenharmony_ci	bool all;
319962306a36Sopenharmony_ci};
320062306a36Sopenharmony_ci
320162306a36Sopenharmony_cistatic bool io_cancel_task_cb(struct io_wq_work *work, void *data)
320262306a36Sopenharmony_ci{
320362306a36Sopenharmony_ci	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
320462306a36Sopenharmony_ci	struct io_task_cancel *cancel = data;
320562306a36Sopenharmony_ci
320662306a36Sopenharmony_ci	return io_match_task_safe(req, cancel->task, cancel->all);
320762306a36Sopenharmony_ci}
320862306a36Sopenharmony_ci
320962306a36Sopenharmony_cistatic __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
321062306a36Sopenharmony_ci					 struct task_struct *task,
321162306a36Sopenharmony_ci					 bool cancel_all)
321262306a36Sopenharmony_ci{
321362306a36Sopenharmony_ci	struct io_defer_entry *de;
321462306a36Sopenharmony_ci	LIST_HEAD(list);
321562306a36Sopenharmony_ci
321662306a36Sopenharmony_ci	spin_lock(&ctx->completion_lock);
321762306a36Sopenharmony_ci	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
321862306a36Sopenharmony_ci		if (io_match_task_safe(de->req, task, cancel_all)) {
321962306a36Sopenharmony_ci			list_cut_position(&list, &ctx->defer_list, &de->list);
322062306a36Sopenharmony_ci			break;
322162306a36Sopenharmony_ci		}
322262306a36Sopenharmony_ci	}
322362306a36Sopenharmony_ci	spin_unlock(&ctx->completion_lock);
322462306a36Sopenharmony_ci	if (list_empty(&list))
322562306a36Sopenharmony_ci		return false;
322662306a36Sopenharmony_ci
322762306a36Sopenharmony_ci	while (!list_empty(&list)) {
322862306a36Sopenharmony_ci		de = list_first_entry(&list, struct io_defer_entry, list);
322962306a36Sopenharmony_ci		list_del_init(&de->list);
323062306a36Sopenharmony_ci		io_req_task_queue_fail(de->req, -ECANCELED);
323162306a36Sopenharmony_ci		kfree(de);
323262306a36Sopenharmony_ci	}
323362306a36Sopenharmony_ci	return true;
323462306a36Sopenharmony_ci}
323562306a36Sopenharmony_ci
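/*
 * Walk all tasks attached to this ring and ask their io-wq to cancel any
 * work targeting @ctx. Returns true if matching work was found.
 */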
323662306a36Sopenharmony_cistatic __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
323762306a36Sopenharmony_ci{
323862306a36Sopenharmony_ci	struct io_tctx_node *node;
323962306a36Sopenharmony_ci	enum io_wq_cancel cret;
324062306a36Sopenharmony_ci	bool ret = false;
324162306a36Sopenharmony_ci
324262306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
324362306a36Sopenharmony_ci	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
324462306a36Sopenharmony_ci		struct io_uring_task *tctx = node->task->io_uring;
324562306a36Sopenharmony_ci
324662306a36Sopenharmony_ci		/*
324762306a36Sopenharmony_ci		 * io_wq will stay alive while we hold uring_lock, because it's
324862306a36Sopenharmony_ci		 * killed after ctx nodes, which requires taking the lock.
324962306a36Sopenharmony_ci		 */
325062306a36Sopenharmony_ci		if (!tctx || !tctx->io_wq)
325162306a36Sopenharmony_ci			continue;
325262306a36Sopenharmony_ci		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
325362306a36Sopenharmony_ci		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
325462306a36Sopenharmony_ci	}
325562306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
325662306a36Sopenharmony_ci
325762306a36Sopenharmony_ci	return ret;
325862306a36Sopenharmony_ci}
325962306a36Sopenharmony_ci
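/*
 * One cancellation pass over @ctx for @task (or all tasks if NULL): io-wq
 * work, deferred requests, poll requests, timeouts and local/task work.
 * Returns true if anything was cancelled or run, so callers can keep
 * looping until the ring is quiesced.
 */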
326062306a36Sopenharmony_cistatic __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
326162306a36Sopenharmony_ci						struct task_struct *task,
326262306a36Sopenharmony_ci						bool cancel_all)
326362306a36Sopenharmony_ci{
326462306a36Sopenharmony_ci	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
326562306a36Sopenharmony_ci	struct io_uring_task *tctx = task ? task->io_uring : NULL;
326662306a36Sopenharmony_ci	enum io_wq_cancel cret;
326762306a36Sopenharmony_ci	bool ret = false;
326862306a36Sopenharmony_ci
326962306a36Sopenharmony_ci	/* set it so io_req_local_work_add() will wake us up */
327062306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
327162306a36Sopenharmony_ci		atomic_set(&ctx->cq_wait_nr, 1);
327262306a36Sopenharmony_ci		smp_mb();
327362306a36Sopenharmony_ci	}
327462306a36Sopenharmony_ci
327562306a36Sopenharmony_ci	/* failed during ring init; it couldn't have issued any requests */
327662306a36Sopenharmony_ci	if (!ctx->rings)
327762306a36Sopenharmony_ci		return false;
327862306a36Sopenharmony_ci
327962306a36Sopenharmony_ci	if (!task) {
328062306a36Sopenharmony_ci		ret |= io_uring_try_cancel_iowq(ctx);
328162306a36Sopenharmony_ci	} else if (tctx && tctx->io_wq) {
328262306a36Sopenharmony_ci		/*
328362306a36Sopenharmony_ci		 * This cancels requests of all rings, not only @ctx, but
328462306a36Sopenharmony_ci		 * that's fine as the task is in exit/exec.
328562306a36Sopenharmony_ci		 */
328662306a36Sopenharmony_ci		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
328762306a36Sopenharmony_ci				       &cancel, true);
328862306a36Sopenharmony_ci		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
328962306a36Sopenharmony_ci	}
329062306a36Sopenharmony_ci
329162306a36Sopenharmony_ci	/* SQPOLL thread does its own polling */
329262306a36Sopenharmony_ci	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
329362306a36Sopenharmony_ci	    (ctx->sq_data && ctx->sq_data->thread == current)) {
329462306a36Sopenharmony_ci		while (!wq_list_empty(&ctx->iopoll_list)) {
329562306a36Sopenharmony_ci			io_iopoll_try_reap_events(ctx);
329662306a36Sopenharmony_ci			ret = true;
329762306a36Sopenharmony_ci			cond_resched();
329862306a36Sopenharmony_ci		}
329962306a36Sopenharmony_ci	}
330062306a36Sopenharmony_ci
330162306a36Sopenharmony_ci	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
330262306a36Sopenharmony_ci	    io_allowed_defer_tw_run(ctx))
330362306a36Sopenharmony_ci		ret |= io_run_local_work(ctx, INT_MAX) > 0;
330462306a36Sopenharmony_ci	ret |= io_cancel_defer_files(ctx, task, cancel_all);
330562306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
330662306a36Sopenharmony_ci	ret |= io_poll_remove_all(ctx, task, cancel_all);
330762306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
330862306a36Sopenharmony_ci	ret |= io_kill_timeouts(ctx, task, cancel_all);
330962306a36Sopenharmony_ci	if (task)
331062306a36Sopenharmony_ci		ret |= io_run_task_work() > 0;
331162306a36Sopenharmony_ci	return ret;
331262306a36Sopenharmony_ci}
331362306a36Sopenharmony_ci
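/* Number of requests the task has in flight, all of them or only the tracked subset */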
331462306a36Sopenharmony_cistatic s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
331562306a36Sopenharmony_ci{
331662306a36Sopenharmony_ci	if (tracked)
331762306a36Sopenharmony_ci		return atomic_read(&tctx->inflight_tracked);
331862306a36Sopenharmony_ci	return percpu_counter_sum(&tctx->inflight);
331962306a36Sopenharmony_ci}
332062306a36Sopenharmony_ci
332162306a36Sopenharmony_ci/*
332262306a36Sopenharmony_ci * Find any io_uring ctx that this task has registered or done IO on, and cancel
332362306a36Sopenharmony_ci * requests. @sqd must be non-NULL if and only if this is an SQPOLL thread cancellation.
332462306a36Sopenharmony_ci */
332562306a36Sopenharmony_ci__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
332662306a36Sopenharmony_ci{
332762306a36Sopenharmony_ci	struct io_uring_task *tctx = current->io_uring;
332862306a36Sopenharmony_ci	struct io_ring_ctx *ctx;
332962306a36Sopenharmony_ci	struct io_tctx_node *node;
333062306a36Sopenharmony_ci	unsigned long index;
333162306a36Sopenharmony_ci	s64 inflight;
333262306a36Sopenharmony_ci	DEFINE_WAIT(wait);
333362306a36Sopenharmony_ci
333462306a36Sopenharmony_ci	WARN_ON_ONCE(sqd && sqd->thread != current);
333562306a36Sopenharmony_ci
333662306a36Sopenharmony_ci	if (!current->io_uring)
333762306a36Sopenharmony_ci		return;
333862306a36Sopenharmony_ci	if (tctx->io_wq)
333962306a36Sopenharmony_ci		io_wq_exit_start(tctx->io_wq);
334062306a36Sopenharmony_ci
334162306a36Sopenharmony_ci	atomic_inc(&tctx->in_cancel);
334262306a36Sopenharmony_ci	do {
334362306a36Sopenharmony_ci		bool loop = false;
334462306a36Sopenharmony_ci
334562306a36Sopenharmony_ci		io_uring_drop_tctx_refs(current);
334662306a36Sopenharmony_ci		/* read completions before cancellations */
334762306a36Sopenharmony_ci		inflight = tctx_inflight(tctx, !cancel_all);
334862306a36Sopenharmony_ci		if (!inflight)
334962306a36Sopenharmony_ci			break;
335062306a36Sopenharmony_ci
335162306a36Sopenharmony_ci		if (!sqd) {
335262306a36Sopenharmony_ci			xa_for_each(&tctx->xa, index, node) {
335362306a36Sopenharmony_ci				/* sqpoll task will cancel all its requests */
335462306a36Sopenharmony_ci				if (node->ctx->sq_data)
335562306a36Sopenharmony_ci					continue;
335662306a36Sopenharmony_ci				loop |= io_uring_try_cancel_requests(node->ctx,
335762306a36Sopenharmony_ci							current, cancel_all);
335862306a36Sopenharmony_ci			}
335962306a36Sopenharmony_ci		} else {
336062306a36Sopenharmony_ci			list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
336162306a36Sopenharmony_ci				loop |= io_uring_try_cancel_requests(ctx,
336262306a36Sopenharmony_ci								     current,
336362306a36Sopenharmony_ci								     cancel_all);
336462306a36Sopenharmony_ci		}
336562306a36Sopenharmony_ci
336662306a36Sopenharmony_ci		if (loop) {
336762306a36Sopenharmony_ci			cond_resched();
336862306a36Sopenharmony_ci			continue;
336962306a36Sopenharmony_ci		}
337062306a36Sopenharmony_ci
337162306a36Sopenharmony_ci		prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE);
337262306a36Sopenharmony_ci		io_run_task_work();
337362306a36Sopenharmony_ci		io_uring_drop_tctx_refs(current);
337462306a36Sopenharmony_ci		xa_for_each(&tctx->xa, index, node) {
337562306a36Sopenharmony_ci			if (!llist_empty(&node->ctx->work_llist)) {
337662306a36Sopenharmony_ci				WARN_ON_ONCE(node->ctx->submitter_task &&
337762306a36Sopenharmony_ci					     node->ctx->submitter_task != current);
337862306a36Sopenharmony_ci				goto end_wait;
337962306a36Sopenharmony_ci			}
338062306a36Sopenharmony_ci		}
338162306a36Sopenharmony_ci		/*
338262306a36Sopenharmony_ci		 * If we've seen completions, retry without waiting. This
338362306a36Sopenharmony_ci		 * avoids a race where a completion comes in before we did
338462306a36Sopenharmony_ci		 * prepare_to_wait().
338562306a36Sopenharmony_ci		 */
338662306a36Sopenharmony_ci		if (inflight == tctx_inflight(tctx, !cancel_all))
338762306a36Sopenharmony_ci			schedule();
338862306a36Sopenharmony_ciend_wait:
338962306a36Sopenharmony_ci		finish_wait(&tctx->wait, &wait);
339062306a36Sopenharmony_ci	} while (1);
339162306a36Sopenharmony_ci
339262306a36Sopenharmony_ci	io_uring_clean_tctx(tctx);
339362306a36Sopenharmony_ci	if (cancel_all) {
339462306a36Sopenharmony_ci		/*
339562306a36Sopenharmony_ci		 * We shouldn't run task_works after cancel, so just leave
339662306a36Sopenharmony_ci		 * ->in_cancel set for normal exit.
339762306a36Sopenharmony_ci		 */
339862306a36Sopenharmony_ci		atomic_dec(&tctx->in_cancel);
339962306a36Sopenharmony_ci		/* for exec all current's requests should be gone, kill tctx */
340062306a36Sopenharmony_ci		__io_uring_free(current);
340162306a36Sopenharmony_ci	}
340262306a36Sopenharmony_ci}
340362306a36Sopenharmony_ci
340462306a36Sopenharmony_civoid __io_uring_cancel(bool cancel_all)
340562306a36Sopenharmony_ci{
340662306a36Sopenharmony_ci	io_uring_cancel_generic(cancel_all, NULL);
340762306a36Sopenharmony_ci}
340862306a36Sopenharmony_ci
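/*
 * Translate an mmap offset into the kernel address backing that region
 * (SQ/CQ rings, SQE array, or a provided buffer ring) and validate that
 * the requested size fits the backing allocation.
 */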
340962306a36Sopenharmony_cistatic void *io_uring_validate_mmap_request(struct file *file,
341062306a36Sopenharmony_ci					    loff_t pgoff, size_t sz)
341162306a36Sopenharmony_ci{
341262306a36Sopenharmony_ci	struct io_ring_ctx *ctx = file->private_data;
341362306a36Sopenharmony_ci	loff_t offset = pgoff << PAGE_SHIFT;
341462306a36Sopenharmony_ci	struct page *page;
341562306a36Sopenharmony_ci	void *ptr;
341662306a36Sopenharmony_ci
341762306a36Sopenharmony_ci	switch (offset & IORING_OFF_MMAP_MASK) {
341862306a36Sopenharmony_ci	case IORING_OFF_SQ_RING:
341962306a36Sopenharmony_ci	case IORING_OFF_CQ_RING:
342062306a36Sopenharmony_ci		/* Don't allow mmap if the ring was set up without it */
342162306a36Sopenharmony_ci		if (ctx->flags & IORING_SETUP_NO_MMAP)
342262306a36Sopenharmony_ci			return ERR_PTR(-EINVAL);
342362306a36Sopenharmony_ci		ptr = ctx->rings;
342462306a36Sopenharmony_ci		break;
342562306a36Sopenharmony_ci	case IORING_OFF_SQES:
342662306a36Sopenharmony_ci		/* Don't allow mmap if the ring was set up without it */
342762306a36Sopenharmony_ci		if (ctx->flags & IORING_SETUP_NO_MMAP)
342862306a36Sopenharmony_ci			return ERR_PTR(-EINVAL);
342962306a36Sopenharmony_ci		ptr = ctx->sq_sqes;
343062306a36Sopenharmony_ci		break;
343162306a36Sopenharmony_ci	case IORING_OFF_PBUF_RING: {
343262306a36Sopenharmony_ci		unsigned int bgid;
343362306a36Sopenharmony_ci
343462306a36Sopenharmony_ci		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
343562306a36Sopenharmony_ci		rcu_read_lock();
343662306a36Sopenharmony_ci		ptr = io_pbuf_get_address(ctx, bgid);
343762306a36Sopenharmony_ci		rcu_read_unlock();
343862306a36Sopenharmony_ci		if (!ptr)
343962306a36Sopenharmony_ci			return ERR_PTR(-EINVAL);
344062306a36Sopenharmony_ci		break;
344162306a36Sopenharmony_ci		}
344262306a36Sopenharmony_ci	default:
344362306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
344462306a36Sopenharmony_ci	}
344562306a36Sopenharmony_ci
344662306a36Sopenharmony_ci	page = virt_to_head_page(ptr);
344762306a36Sopenharmony_ci	if (sz > page_size(page))
344862306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
344962306a36Sopenharmony_ci
345062306a36Sopenharmony_ci	return ptr;
345162306a36Sopenharmony_ci}
345262306a36Sopenharmony_ci
345362306a36Sopenharmony_ci#ifdef CONFIG_MMU
345462306a36Sopenharmony_ci
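/* Map the ring memory selected by the mmap offset into the application's address space */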
345562306a36Sopenharmony_cistatic __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
345662306a36Sopenharmony_ci{
345762306a36Sopenharmony_ci	size_t sz = vma->vm_end - vma->vm_start;
345862306a36Sopenharmony_ci	unsigned long pfn;
345962306a36Sopenharmony_ci	void *ptr;
346062306a36Sopenharmony_ci
346162306a36Sopenharmony_ci	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
346262306a36Sopenharmony_ci	if (IS_ERR(ptr))
346362306a36Sopenharmony_ci		return PTR_ERR(ptr);
346462306a36Sopenharmony_ci
346562306a36Sopenharmony_ci	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
346662306a36Sopenharmony_ci	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
346762306a36Sopenharmony_ci}
346862306a36Sopenharmony_ci
346962306a36Sopenharmony_cistatic unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
347062306a36Sopenharmony_ci			unsigned long addr, unsigned long len,
347162306a36Sopenharmony_ci			unsigned long pgoff, unsigned long flags)
347262306a36Sopenharmony_ci{
347362306a36Sopenharmony_ci	void *ptr;
347462306a36Sopenharmony_ci
347562306a36Sopenharmony_ci	/*
347662306a36Sopenharmony_ci	 * Do not allow mapping to a user-provided address, to avoid breaking the
347762306a36Sopenharmony_ci	 * aliasing rules. Userspace is not able to guess the offset address of
347862306a36Sopenharmony_ci	 * the kernel kmalloc()ed memory area.
347962306a36Sopenharmony_ci	 */
348062306a36Sopenharmony_ci	if (addr)
348162306a36Sopenharmony_ci		return -EINVAL;
348262306a36Sopenharmony_ci
348362306a36Sopenharmony_ci	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
348462306a36Sopenharmony_ci	if (IS_ERR(ptr))
348562306a36Sopenharmony_ci		return -ENOMEM;
348662306a36Sopenharmony_ci
348762306a36Sopenharmony_ci	/*
348862306a36Sopenharmony_ci	 * Some architectures have strong cache aliasing requirements.
348962306a36Sopenharmony_ci	 * For such architectures we need a coherent mapping which aliases
349062306a36Sopenharmony_ci	 * kernel memory *and* userspace memory. To achieve that:
349162306a36Sopenharmony_ci	 * - use a NULL file pointer to reference physical memory, and
349262306a36Sopenharmony_ci	 * - use the kernel virtual address of the shared io_uring context
349362306a36Sopenharmony_ci	 *   (instead of the userspace-provided address, which has to be 0UL
349462306a36Sopenharmony_ci	 *   anyway).
349562306a36Sopenharmony_ci	 * - use the same pgoff that get_unmapped_area() uses to
349662306a36Sopenharmony_ci	 *   calculate the page colouring.
349762306a36Sopenharmony_ci	 * For architectures without such aliasing requirements, the
349862306a36Sopenharmony_ci	 * architecture will return any suitable mapping because addr is 0.
349962306a36Sopenharmony_ci	 */
350062306a36Sopenharmony_ci	filp = NULL;
350162306a36Sopenharmony_ci	flags |= MAP_SHARED;
350262306a36Sopenharmony_ci	pgoff = 0;	/* has been translated to ptr above */
350362306a36Sopenharmony_ci#ifdef SHM_COLOUR
350462306a36Sopenharmony_ci	addr = (uintptr_t) ptr;
350562306a36Sopenharmony_ci	pgoff = addr >> PAGE_SHIFT;
350662306a36Sopenharmony_ci#else
350762306a36Sopenharmony_ci	addr = 0UL;
350862306a36Sopenharmony_ci#endif
350962306a36Sopenharmony_ci	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
351062306a36Sopenharmony_ci}
351162306a36Sopenharmony_ci
351262306a36Sopenharmony_ci#else /* !CONFIG_MMU */
351362306a36Sopenharmony_ci
351462306a36Sopenharmony_cistatic int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
351562306a36Sopenharmony_ci{
351662306a36Sopenharmony_ci	return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
351762306a36Sopenharmony_ci}
351862306a36Sopenharmony_ci
351962306a36Sopenharmony_cistatic unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
352062306a36Sopenharmony_ci{
352162306a36Sopenharmony_ci	return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE;
352262306a36Sopenharmony_ci}
352362306a36Sopenharmony_ci
352462306a36Sopenharmony_cistatic unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
352562306a36Sopenharmony_ci	unsigned long addr, unsigned long len,
352662306a36Sopenharmony_ci	unsigned long pgoff, unsigned long flags)
352762306a36Sopenharmony_ci{
352862306a36Sopenharmony_ci	void *ptr;
352962306a36Sopenharmony_ci
353062306a36Sopenharmony_ci	ptr = io_uring_validate_mmap_request(file, pgoff, len);
353162306a36Sopenharmony_ci	if (IS_ERR(ptr))
353262306a36Sopenharmony_ci		return PTR_ERR(ptr);
353362306a36Sopenharmony_ci
353462306a36Sopenharmony_ci	return (unsigned long) ptr;
353562306a36Sopenharmony_ci}
353662306a36Sopenharmony_ci
353762306a36Sopenharmony_ci#endif /* !CONFIG_MMU */
353862306a36Sopenharmony_ci
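/* If IORING_ENTER_EXT_ARG is set, check that a valid io_uring_getevents_arg was passed */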
353962306a36Sopenharmony_cistatic int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
354062306a36Sopenharmony_ci{
354162306a36Sopenharmony_ci	if (flags & IORING_ENTER_EXT_ARG) {
354262306a36Sopenharmony_ci		struct io_uring_getevents_arg arg;
354362306a36Sopenharmony_ci
354462306a36Sopenharmony_ci		if (argsz != sizeof(arg))
354562306a36Sopenharmony_ci			return -EINVAL;
354662306a36Sopenharmony_ci		if (copy_from_user(&arg, argp, sizeof(arg)))
354762306a36Sopenharmony_ci			return -EFAULT;
354862306a36Sopenharmony_ci	}
354962306a36Sopenharmony_ci	return 0;
355062306a36Sopenharmony_ci}
355162306a36Sopenharmony_ci
355262306a36Sopenharmony_cistatic int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
355362306a36Sopenharmony_ci			  struct __kernel_timespec __user **ts,
355462306a36Sopenharmony_ci			  const sigset_t __user **sig)
355562306a36Sopenharmony_ci{
355662306a36Sopenharmony_ci	struct io_uring_getevents_arg arg;
355762306a36Sopenharmony_ci
355862306a36Sopenharmony_ci	/*
355962306a36Sopenharmony_ci	 * If EXT_ARG isn't set, then we have no timespec and the argp pointer
356062306a36Sopenharmony_ci	 * is just a pointer to the sigset_t.
356162306a36Sopenharmony_ci	 */
356262306a36Sopenharmony_ci	if (!(flags & IORING_ENTER_EXT_ARG)) {
356362306a36Sopenharmony_ci		*sig = (const sigset_t __user *) argp;
356462306a36Sopenharmony_ci		*ts = NULL;
356562306a36Sopenharmony_ci		return 0;
356662306a36Sopenharmony_ci	}
356762306a36Sopenharmony_ci
356862306a36Sopenharmony_ci	/*
356962306a36Sopenharmony_ci	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
357062306a36Sopenharmony_ci	 * timespec and sigset_t pointers if good.
357162306a36Sopenharmony_ci	 */
357262306a36Sopenharmony_ci	if (*argsz != sizeof(arg))
357362306a36Sopenharmony_ci		return -EINVAL;
357462306a36Sopenharmony_ci	if (copy_from_user(&arg, argp, sizeof(arg)))
357562306a36Sopenharmony_ci		return -EFAULT;
357662306a36Sopenharmony_ci	if (arg.pad)
357762306a36Sopenharmony_ci		return -EINVAL;
357862306a36Sopenharmony_ci	*sig = u64_to_user_ptr(arg.sigmask);
357962306a36Sopenharmony_ci	*argsz = arg.sigmask_sz;
358062306a36Sopenharmony_ci	*ts = u64_to_user_ptr(arg.ts);
358162306a36Sopenharmony_ci	return 0;
358262306a36Sopenharmony_ci}
358362306a36Sopenharmony_ci
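/*
 * io_uring_enter() - submit SQEs and/or wait for completions on the ring
 * identified by @fd (a regular fd, or an index into the registered ring
 * array if IORING_ENTER_REGISTERED_RING is set).
 *
 * A minimal invocation from userspace (a sketch; real applications usually
 * go through liburing or syscall(2)) looks roughly like:
 *
 *	io_uring_enter(ring_fd, to_submit, min_complete,
 *		       IORING_ENTER_GETEVENTS, NULL, 0);
 *
 * which submits up to @to_submit SQEs and waits for @min_complete CQEs.
 */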
358462306a36Sopenharmony_ciSYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
358562306a36Sopenharmony_ci		u32, min_complete, u32, flags, const void __user *, argp,
358662306a36Sopenharmony_ci		size_t, argsz)
358762306a36Sopenharmony_ci{
358862306a36Sopenharmony_ci	struct io_ring_ctx *ctx;
358962306a36Sopenharmony_ci	struct file *file;
359062306a36Sopenharmony_ci	long ret;
359162306a36Sopenharmony_ci
359262306a36Sopenharmony_ci	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
359362306a36Sopenharmony_ci			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
359462306a36Sopenharmony_ci			       IORING_ENTER_REGISTERED_RING)))
359562306a36Sopenharmony_ci		return -EINVAL;
359662306a36Sopenharmony_ci
359762306a36Sopenharmony_ci	/*
359862306a36Sopenharmony_ci	 * The ring fd has been registered via IORING_REGISTER_RING_FDS, so we
359962306a36Sopenharmony_ci	 * need only dereference our task-private array to find it.
360062306a36Sopenharmony_ci	 */
360162306a36Sopenharmony_ci	if (flags & IORING_ENTER_REGISTERED_RING) {
360262306a36Sopenharmony_ci		struct io_uring_task *tctx = current->io_uring;
360362306a36Sopenharmony_ci
360462306a36Sopenharmony_ci		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
360562306a36Sopenharmony_ci			return -EINVAL;
360662306a36Sopenharmony_ci		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
360762306a36Sopenharmony_ci		file = tctx->registered_rings[fd];
360862306a36Sopenharmony_ci		if (unlikely(!file))
360962306a36Sopenharmony_ci			return -EBADF;
361062306a36Sopenharmony_ci	} else {
361162306a36Sopenharmony_ci		file = fget(fd);
361262306a36Sopenharmony_ci		if (unlikely(!file))
361362306a36Sopenharmony_ci			return -EBADF;
361462306a36Sopenharmony_ci		ret = -EOPNOTSUPP;
361562306a36Sopenharmony_ci		if (unlikely(!io_is_uring_fops(file)))
361662306a36Sopenharmony_ci			goto out;
361762306a36Sopenharmony_ci	}
361862306a36Sopenharmony_ci
361962306a36Sopenharmony_ci	ctx = file->private_data;
362062306a36Sopenharmony_ci	ret = -EBADFD;
362162306a36Sopenharmony_ci	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
362262306a36Sopenharmony_ci		goto out;
362362306a36Sopenharmony_ci
362462306a36Sopenharmony_ci	/*
362562306a36Sopenharmony_ci	 * For SQ polling, the thread will do all submissions and completions.
362662306a36Sopenharmony_ci	 * Just return the requested submit count, and wake the thread if
362762306a36Sopenharmony_ci	 * we were asked to.
362862306a36Sopenharmony_ci	 */
362962306a36Sopenharmony_ci	ret = 0;
363062306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_SQPOLL) {
363162306a36Sopenharmony_ci		io_cqring_overflow_flush(ctx);
363262306a36Sopenharmony_ci
363362306a36Sopenharmony_ci		if (unlikely(ctx->sq_data->thread == NULL)) {
363462306a36Sopenharmony_ci			ret = -EOWNERDEAD;
363562306a36Sopenharmony_ci			goto out;
363662306a36Sopenharmony_ci		}
363762306a36Sopenharmony_ci		if (flags & IORING_ENTER_SQ_WAKEUP)
363862306a36Sopenharmony_ci			wake_up(&ctx->sq_data->wait);
363962306a36Sopenharmony_ci		if (flags & IORING_ENTER_SQ_WAIT)
364062306a36Sopenharmony_ci			io_sqpoll_wait_sq(ctx);
364162306a36Sopenharmony_ci
364262306a36Sopenharmony_ci		ret = to_submit;
364362306a36Sopenharmony_ci	} else if (to_submit) {
364462306a36Sopenharmony_ci		ret = io_uring_add_tctx_node(ctx);
364562306a36Sopenharmony_ci		if (unlikely(ret))
364662306a36Sopenharmony_ci			goto out;
364762306a36Sopenharmony_ci
364862306a36Sopenharmony_ci		mutex_lock(&ctx->uring_lock);
364962306a36Sopenharmony_ci		ret = io_submit_sqes(ctx, to_submit);
365062306a36Sopenharmony_ci		if (ret != to_submit) {
365162306a36Sopenharmony_ci			mutex_unlock(&ctx->uring_lock);
365262306a36Sopenharmony_ci			goto out;
365362306a36Sopenharmony_ci		}
365462306a36Sopenharmony_ci		if (flags & IORING_ENTER_GETEVENTS) {
365562306a36Sopenharmony_ci			if (ctx->syscall_iopoll)
365662306a36Sopenharmony_ci				goto iopoll_locked;
365762306a36Sopenharmony_ci			/*
365862306a36Sopenharmony_ci			 * Ignore errors; we'll soon call io_cqring_wait() and
365962306a36Sopenharmony_ci			 * it should handle any ownership problems.
366062306a36Sopenharmony_ci			 */
366162306a36Sopenharmony_ci			if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
366262306a36Sopenharmony_ci				(void)io_run_local_work_locked(ctx, min_complete);
366362306a36Sopenharmony_ci		}
366462306a36Sopenharmony_ci		mutex_unlock(&ctx->uring_lock);
366562306a36Sopenharmony_ci	}
366662306a36Sopenharmony_ci
366762306a36Sopenharmony_ci	if (flags & IORING_ENTER_GETEVENTS) {
366862306a36Sopenharmony_ci		int ret2;
366962306a36Sopenharmony_ci
367062306a36Sopenharmony_ci		if (ctx->syscall_iopoll) {
367162306a36Sopenharmony_ci			/*
367262306a36Sopenharmony_ci			 * We disallow the app entering submit/complete with
367362306a36Sopenharmony_ci			 * polling, but we still need to lock the ring to
367462306a36Sopenharmony_ci			 * prevent racing with polled issue that got punted to
367562306a36Sopenharmony_ci			 * a workqueue.
367662306a36Sopenharmony_ci			 */
367762306a36Sopenharmony_ci			mutex_lock(&ctx->uring_lock);
367862306a36Sopenharmony_ciiopoll_locked:
367962306a36Sopenharmony_ci			ret2 = io_validate_ext_arg(flags, argp, argsz);
368062306a36Sopenharmony_ci			if (likely(!ret2)) {
368162306a36Sopenharmony_ci				min_complete = min(min_complete,
368262306a36Sopenharmony_ci						   ctx->cq_entries);
368362306a36Sopenharmony_ci				ret2 = io_iopoll_check(ctx, min_complete);
368462306a36Sopenharmony_ci			}
368562306a36Sopenharmony_ci			mutex_unlock(&ctx->uring_lock);
368662306a36Sopenharmony_ci		} else {
368762306a36Sopenharmony_ci			const sigset_t __user *sig;
368862306a36Sopenharmony_ci			struct __kernel_timespec __user *ts;
368962306a36Sopenharmony_ci
369062306a36Sopenharmony_ci			ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
369162306a36Sopenharmony_ci			if (likely(!ret2)) {
369262306a36Sopenharmony_ci				min_complete = min(min_complete,
369362306a36Sopenharmony_ci						   ctx->cq_entries);
369462306a36Sopenharmony_ci				ret2 = io_cqring_wait(ctx, min_complete, sig,
369562306a36Sopenharmony_ci						      argsz, ts);
369662306a36Sopenharmony_ci			}
369762306a36Sopenharmony_ci		}
369862306a36Sopenharmony_ci
369962306a36Sopenharmony_ci		if (!ret) {
370062306a36Sopenharmony_ci			ret = ret2;
370162306a36Sopenharmony_ci
370262306a36Sopenharmony_ci			/*
370362306a36Sopenharmony_ci			 * EBADR indicates that one or more CQE were dropped.
370462306a36Sopenharmony_ci			 * Once the user has been informed we can clear the bit
370562306a36Sopenharmony_ci			 * as they are obviously ok with those drops.
370662306a36Sopenharmony_ci			 */
370762306a36Sopenharmony_ci			if (unlikely(ret2 == -EBADR))
370862306a36Sopenharmony_ci				clear_bit(IO_CHECK_CQ_DROPPED_BIT,
370962306a36Sopenharmony_ci					  &ctx->check_cq);
371062306a36Sopenharmony_ci		}
371162306a36Sopenharmony_ci	}
371262306a36Sopenharmony_ciout:
371362306a36Sopenharmony_ci	if (!(flags & IORING_ENTER_REGISTERED_RING))
371462306a36Sopenharmony_ci		fput(file);
371562306a36Sopenharmony_ci	return ret;
371662306a36Sopenharmony_ci}
371762306a36Sopenharmony_ci
371862306a36Sopenharmony_cistatic const struct file_operations io_uring_fops = {
371962306a36Sopenharmony_ci	.release	= io_uring_release,
372062306a36Sopenharmony_ci	.mmap		= io_uring_mmap,
372162306a36Sopenharmony_ci#ifndef CONFIG_MMU
372262306a36Sopenharmony_ci	.get_unmapped_area = io_uring_nommu_get_unmapped_area,
372362306a36Sopenharmony_ci	.mmap_capabilities = io_uring_nommu_mmap_capabilities,
372462306a36Sopenharmony_ci#else
372562306a36Sopenharmony_ci	.get_unmapped_area = io_uring_mmu_get_unmapped_area,
372662306a36Sopenharmony_ci#endif
372762306a36Sopenharmony_ci	.poll		= io_uring_poll,
372862306a36Sopenharmony_ci#ifdef CONFIG_PROC_FS
372962306a36Sopenharmony_ci	.show_fdinfo	= io_uring_show_fdinfo,
373062306a36Sopenharmony_ci#endif
373162306a36Sopenharmony_ci};
373262306a36Sopenharmony_ci
373362306a36Sopenharmony_cibool io_is_uring_fops(struct file *file)
373462306a36Sopenharmony_ci{
373562306a36Sopenharmony_ci	return file->f_op == &io_uring_fops;
373662306a36Sopenharmony_ci}
373762306a36Sopenharmony_ci
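/*
 * Allocate the SQ/CQ rings and the SQE array, either from kernel memory or
 * by mapping user-provided memory (IORING_SETUP_NO_MMAP), and fill in the
 * ring masks and sizes.
 */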
373862306a36Sopenharmony_cistatic __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
373962306a36Sopenharmony_ci					 struct io_uring_params *p)
374062306a36Sopenharmony_ci{
374162306a36Sopenharmony_ci	struct io_rings *rings;
374262306a36Sopenharmony_ci	size_t size, sq_array_offset;
374362306a36Sopenharmony_ci	void *ptr;
374462306a36Sopenharmony_ci
374562306a36Sopenharmony_ci	/* make sure these are sane, as we already accounted them */
374662306a36Sopenharmony_ci	ctx->sq_entries = p->sq_entries;
374762306a36Sopenharmony_ci	ctx->cq_entries = p->cq_entries;
374862306a36Sopenharmony_ci
374962306a36Sopenharmony_ci	size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
375062306a36Sopenharmony_ci	if (size == SIZE_MAX)
375162306a36Sopenharmony_ci		return -EOVERFLOW;
375262306a36Sopenharmony_ci
375362306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
375462306a36Sopenharmony_ci		rings = io_mem_alloc(size);
375562306a36Sopenharmony_ci	else
375662306a36Sopenharmony_ci		rings = io_rings_map(ctx, p->cq_off.user_addr, size);
375762306a36Sopenharmony_ci
375862306a36Sopenharmony_ci	if (IS_ERR(rings))
375962306a36Sopenharmony_ci		return PTR_ERR(rings);
376062306a36Sopenharmony_ci
376162306a36Sopenharmony_ci	ctx->rings = rings;
376262306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
376362306a36Sopenharmony_ci		ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
376462306a36Sopenharmony_ci	rings->sq_ring_mask = p->sq_entries - 1;
376562306a36Sopenharmony_ci	rings->cq_ring_mask = p->cq_entries - 1;
376662306a36Sopenharmony_ci	rings->sq_ring_entries = p->sq_entries;
376762306a36Sopenharmony_ci	rings->cq_ring_entries = p->cq_entries;
376862306a36Sopenharmony_ci
376962306a36Sopenharmony_ci	if (p->flags & IORING_SETUP_SQE128)
377062306a36Sopenharmony_ci		size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
377162306a36Sopenharmony_ci	else
377262306a36Sopenharmony_ci		size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
377362306a36Sopenharmony_ci	if (size == SIZE_MAX) {
377462306a36Sopenharmony_ci		io_rings_free(ctx);
377562306a36Sopenharmony_ci		return -EOVERFLOW;
377662306a36Sopenharmony_ci	}
377762306a36Sopenharmony_ci
377862306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
377962306a36Sopenharmony_ci		ptr = io_mem_alloc(size);
378062306a36Sopenharmony_ci	else
378162306a36Sopenharmony_ci		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
378262306a36Sopenharmony_ci
378362306a36Sopenharmony_ci	if (IS_ERR(ptr)) {
378462306a36Sopenharmony_ci		io_rings_free(ctx);
378562306a36Sopenharmony_ci		return PTR_ERR(ptr);
378662306a36Sopenharmony_ci	}
378762306a36Sopenharmony_ci
378862306a36Sopenharmony_ci	ctx->sq_sqes = ptr;
378962306a36Sopenharmony_ci	return 0;
379062306a36Sopenharmony_ci}
379162306a36Sopenharmony_ci
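/* Reserve an unused fd (O_RDWR | O_CLOEXEC) and install the io_uring file into it */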
379262306a36Sopenharmony_cistatic int io_uring_install_fd(struct file *file)
379362306a36Sopenharmony_ci{
379462306a36Sopenharmony_ci	int fd;
379562306a36Sopenharmony_ci
379662306a36Sopenharmony_ci	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
379762306a36Sopenharmony_ci	if (fd < 0)
379862306a36Sopenharmony_ci		return fd;
379962306a36Sopenharmony_ci	fd_install(fd, file);
380062306a36Sopenharmony_ci	return fd;
380162306a36Sopenharmony_ci}
380262306a36Sopenharmony_ci
380362306a36Sopenharmony_ci/*
380462306a36Sopenharmony_ci * Allocate an anonymous fd; this is what constitutes the application-
380562306a36Sopenharmony_ci * visible backing of an io_uring instance. The application mmaps this
380662306a36Sopenharmony_ci * fd to gain access to the SQ/CQ ring details.
380762306a36Sopenharmony_ci */
380862306a36Sopenharmony_cistatic struct file *io_uring_get_file(struct io_ring_ctx *ctx)
380962306a36Sopenharmony_ci{
381062306a36Sopenharmony_ci	return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx,
381162306a36Sopenharmony_ci					 O_RDWR | O_CLOEXEC, NULL);
381262306a36Sopenharmony_ci}
381362306a36Sopenharmony_ci
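/*
 * Core setup path for io_uring_setup(2): validate and clamp the ring sizes,
 * allocate the ctx and rings, create SQPOLL/io-wq offload as requested, and
 * hand the application back an fd (or registered ring index) along with the
 * ring offsets in @params.
 */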
381462306a36Sopenharmony_cistatic __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
381562306a36Sopenharmony_ci				  struct io_uring_params __user *params)
381662306a36Sopenharmony_ci{
381762306a36Sopenharmony_ci	struct io_ring_ctx *ctx;
381862306a36Sopenharmony_ci	struct io_uring_task *tctx;
381962306a36Sopenharmony_ci	struct file *file;
382062306a36Sopenharmony_ci	int ret;
382162306a36Sopenharmony_ci
382262306a36Sopenharmony_ci	if (!entries)
382362306a36Sopenharmony_ci		return -EINVAL;
382462306a36Sopenharmony_ci	if (entries > IORING_MAX_ENTRIES) {
382562306a36Sopenharmony_ci		if (!(p->flags & IORING_SETUP_CLAMP))
382662306a36Sopenharmony_ci			return -EINVAL;
382762306a36Sopenharmony_ci		entries = IORING_MAX_ENTRIES;
382862306a36Sopenharmony_ci	}
382962306a36Sopenharmony_ci
383062306a36Sopenharmony_ci	if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
383162306a36Sopenharmony_ci	    && !(p->flags & IORING_SETUP_NO_MMAP))
383262306a36Sopenharmony_ci		return -EINVAL;
383362306a36Sopenharmony_ci
383462306a36Sopenharmony_ci	/*
383562306a36Sopenharmony_ci	 * Use twice as many entries for the CQ ring. It's possible for the
383662306a36Sopenharmony_ci	 * application to drive a higher depth than the size of the SQ ring,
383762306a36Sopenharmony_ci	 * since the sqes are only used at submission time. This allows for
383862306a36Sopenharmony_ci	 * some flexibility in overcommitting a bit. If the application has
383962306a36Sopenharmony_ci	 * set IORING_SETUP_CQSIZE, it will have passed in the desired number
384062306a36Sopenharmony_ci	 * of CQ ring entries manually.
384162306a36Sopenharmony_ci	 */
384262306a36Sopenharmony_ci	p->sq_entries = roundup_pow_of_two(entries);
384362306a36Sopenharmony_ci	if (p->flags & IORING_SETUP_CQSIZE) {
384462306a36Sopenharmony_ci		/*
384562306a36Sopenharmony_ci		 * If IORING_SETUP_CQSIZE is set, we do the same roundup
384662306a36Sopenharmony_ci		 * to a power-of-two, if it isn't already. We do NOT impose
384762306a36Sopenharmony_ci		 * any cq vs sq ring sizing.
384862306a36Sopenharmony_ci		 */
384962306a36Sopenharmony_ci		if (!p->cq_entries)
385062306a36Sopenharmony_ci			return -EINVAL;
385162306a36Sopenharmony_ci		if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
385262306a36Sopenharmony_ci			if (!(p->flags & IORING_SETUP_CLAMP))
385362306a36Sopenharmony_ci				return -EINVAL;
385462306a36Sopenharmony_ci			p->cq_entries = IORING_MAX_CQ_ENTRIES;
385562306a36Sopenharmony_ci		}
385662306a36Sopenharmony_ci		p->cq_entries = roundup_pow_of_two(p->cq_entries);
385762306a36Sopenharmony_ci		if (p->cq_entries < p->sq_entries)
385862306a36Sopenharmony_ci			return -EINVAL;
385962306a36Sopenharmony_ci	} else {
386062306a36Sopenharmony_ci		p->cq_entries = 2 * p->sq_entries;
386162306a36Sopenharmony_ci	}
386262306a36Sopenharmony_ci
386362306a36Sopenharmony_ci	ctx = io_ring_ctx_alloc(p);
386462306a36Sopenharmony_ci	if (!ctx)
386562306a36Sopenharmony_ci		return -ENOMEM;
386662306a36Sopenharmony_ci
386762306a36Sopenharmony_ci	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
386862306a36Sopenharmony_ci	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
386962306a36Sopenharmony_ci	    !(ctx->flags & IORING_SETUP_SQPOLL))
387062306a36Sopenharmony_ci		ctx->task_complete = true;
387162306a36Sopenharmony_ci
387262306a36Sopenharmony_ci	if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL))
387362306a36Sopenharmony_ci		ctx->lockless_cq = true;
387462306a36Sopenharmony_ci
387562306a36Sopenharmony_ci	/*
387662306a36Sopenharmony_ci	 * lazy poll_wq activation relies on ->task_complete for synchronisation
387762306a36Sopenharmony_ci	 * purposes, see io_activate_pollwq()
387862306a36Sopenharmony_ci	 */
387962306a36Sopenharmony_ci	if (!ctx->task_complete)
388062306a36Sopenharmony_ci		ctx->poll_activated = true;
388162306a36Sopenharmony_ci
388262306a36Sopenharmony_ci	/*
388362306a36Sopenharmony_ci	 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
388462306a36Sopenharmony_ci	 * space applications don't need to do io completion events
388562306a36Sopenharmony_ci	 * polling again, they can rely on io_sq_thread to do polling
388662306a36Sopenharmony_ci	 * work, which can reduce cpu usage and uring_lock contention.
388762306a36Sopenharmony_ci	 */
388862306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_IOPOLL &&
388962306a36Sopenharmony_ci	    !(ctx->flags & IORING_SETUP_SQPOLL))
389062306a36Sopenharmony_ci		ctx->syscall_iopoll = 1;
389162306a36Sopenharmony_ci
389262306a36Sopenharmony_ci	ctx->compat = in_compat_syscall();
389362306a36Sopenharmony_ci	if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK))
389462306a36Sopenharmony_ci		ctx->user = get_uid(current_user());
389562306a36Sopenharmony_ci
389662306a36Sopenharmony_ci	/*
389762306a36Sopenharmony_ci	 * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
389862306a36Sopenharmony_ci	 * COOP_TASKRUN is set, then IPIs are never needed by the app.
389962306a36Sopenharmony_ci	 */
390062306a36Sopenharmony_ci	ret = -EINVAL;
390162306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_SQPOLL) {
390262306a36Sopenharmony_ci		/* IPI related flags don't make sense with SQPOLL */
390362306a36Sopenharmony_ci		if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
390462306a36Sopenharmony_ci				  IORING_SETUP_TASKRUN_FLAG |
390562306a36Sopenharmony_ci				  IORING_SETUP_DEFER_TASKRUN))
390662306a36Sopenharmony_ci			goto err;
390762306a36Sopenharmony_ci		ctx->notify_method = TWA_SIGNAL_NO_IPI;
390862306a36Sopenharmony_ci	} else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
390962306a36Sopenharmony_ci		ctx->notify_method = TWA_SIGNAL_NO_IPI;
391062306a36Sopenharmony_ci	} else {
391162306a36Sopenharmony_ci		if (ctx->flags & IORING_SETUP_TASKRUN_FLAG &&
391262306a36Sopenharmony_ci		    !(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
391362306a36Sopenharmony_ci			goto err;
391462306a36Sopenharmony_ci		ctx->notify_method = TWA_SIGNAL;
391562306a36Sopenharmony_ci	}
391662306a36Sopenharmony_ci
391762306a36Sopenharmony_ci	/*
391862306a36Sopenharmony_ci	 * For DEFER_TASKRUN we require the completion task to be the same as the
391962306a36Sopenharmony_ci	 * submission task. This implies that there is only one submitter, so enforce
392062306a36Sopenharmony_ci	 * that.
392162306a36Sopenharmony_ci	 */
392262306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
392362306a36Sopenharmony_ci	    !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) {
392462306a36Sopenharmony_ci		goto err;
392562306a36Sopenharmony_ci	}
392662306a36Sopenharmony_ci
392762306a36Sopenharmony_ci	/*
392862306a36Sopenharmony_ci	 * This is just grabbed for accounting purposes. When a process exits,
392962306a36Sopenharmony_ci	 * the mm is exited and dropped before the files, hence we need to hang
393062306a36Sopenharmony_ci	 * on to this mm purely for the purposes of being able to unaccount
393162306a36Sopenharmony_ci	 * memory (locked/pinned vm). It's not used for anything else.
393262306a36Sopenharmony_ci	 */
393362306a36Sopenharmony_ci	mmgrab(current->mm);
393462306a36Sopenharmony_ci	ctx->mm_account = current->mm;
393562306a36Sopenharmony_ci
393662306a36Sopenharmony_ci	ret = io_allocate_scq_urings(ctx, p);
393762306a36Sopenharmony_ci	if (ret)
393862306a36Sopenharmony_ci		goto err;
393962306a36Sopenharmony_ci
394062306a36Sopenharmony_ci	ret = io_sq_offload_create(ctx, p);
394162306a36Sopenharmony_ci	if (ret)
394262306a36Sopenharmony_ci		goto err;
394362306a36Sopenharmony_ci
394462306a36Sopenharmony_ci	ret = io_rsrc_init(ctx);
394562306a36Sopenharmony_ci	if (ret)
394662306a36Sopenharmony_ci		goto err;
394762306a36Sopenharmony_ci
394862306a36Sopenharmony_ci	p->sq_off.head = offsetof(struct io_rings, sq.head);
394962306a36Sopenharmony_ci	p->sq_off.tail = offsetof(struct io_rings, sq.tail);
395062306a36Sopenharmony_ci	p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask);
395162306a36Sopenharmony_ci	p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries);
395262306a36Sopenharmony_ci	p->sq_off.flags = offsetof(struct io_rings, sq_flags);
395362306a36Sopenharmony_ci	p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
395462306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
395562306a36Sopenharmony_ci		p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
395662306a36Sopenharmony_ci	p->sq_off.resv1 = 0;
395762306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
395862306a36Sopenharmony_ci		p->sq_off.user_addr = 0;
395962306a36Sopenharmony_ci
396062306a36Sopenharmony_ci	p->cq_off.head = offsetof(struct io_rings, cq.head);
396162306a36Sopenharmony_ci	p->cq_off.tail = offsetof(struct io_rings, cq.tail);
396262306a36Sopenharmony_ci	p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask);
396362306a36Sopenharmony_ci	p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries);
396462306a36Sopenharmony_ci	p->cq_off.overflow = offsetof(struct io_rings, cq_overflow);
396562306a36Sopenharmony_ci	p->cq_off.cqes = offsetof(struct io_rings, cqes);
396662306a36Sopenharmony_ci	p->cq_off.flags = offsetof(struct io_rings, cq_flags);
396762306a36Sopenharmony_ci	p->cq_off.resv1 = 0;
396862306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
396962306a36Sopenharmony_ci		p->cq_off.user_addr = 0;
397062306a36Sopenharmony_ci
397162306a36Sopenharmony_ci	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
397262306a36Sopenharmony_ci			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
397362306a36Sopenharmony_ci			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
397462306a36Sopenharmony_ci			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
397562306a36Sopenharmony_ci			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
397662306a36Sopenharmony_ci			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
397762306a36Sopenharmony_ci			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
397862306a36Sopenharmony_ci
397962306a36Sopenharmony_ci	if (copy_to_user(params, p, sizeof(*p))) {
398062306a36Sopenharmony_ci		ret = -EFAULT;
398162306a36Sopenharmony_ci		goto err;
398262306a36Sopenharmony_ci	}
398362306a36Sopenharmony_ci
398462306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
398562306a36Sopenharmony_ci	    && !(ctx->flags & IORING_SETUP_R_DISABLED))
398662306a36Sopenharmony_ci		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
398762306a36Sopenharmony_ci
398862306a36Sopenharmony_ci	file = io_uring_get_file(ctx);
398962306a36Sopenharmony_ci	if (IS_ERR(file)) {
399062306a36Sopenharmony_ci		ret = PTR_ERR(file);
399162306a36Sopenharmony_ci		goto err;
399262306a36Sopenharmony_ci	}
399362306a36Sopenharmony_ci
399462306a36Sopenharmony_ci	ret = __io_uring_add_tctx_node(ctx);
399562306a36Sopenharmony_ci	if (ret)
399662306a36Sopenharmony_ci		goto err_fput;
399762306a36Sopenharmony_ci	tctx = current->io_uring;
399862306a36Sopenharmony_ci
399962306a36Sopenharmony_ci	/*
400062306a36Sopenharmony_ci	 * Install ring fd as the very last thing, so we don't risk someone
400162306a36Sopenharmony_ci	 * having closed it before we finish setup
400262306a36Sopenharmony_ci	 */
400362306a36Sopenharmony_ci	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
400462306a36Sopenharmony_ci		ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
400562306a36Sopenharmony_ci	else
400662306a36Sopenharmony_ci		ret = io_uring_install_fd(file);
400762306a36Sopenharmony_ci	if (ret < 0)
400862306a36Sopenharmony_ci		goto err_fput;
400962306a36Sopenharmony_ci
401062306a36Sopenharmony_ci	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
401162306a36Sopenharmony_ci	return ret;
401262306a36Sopenharmony_cierr:
401362306a36Sopenharmony_ci	io_ring_ctx_wait_and_kill(ctx);
401462306a36Sopenharmony_ci	return ret;
401562306a36Sopenharmony_cierr_fput:
401662306a36Sopenharmony_ci	fput(file);
401762306a36Sopenharmony_ci	return ret;
401862306a36Sopenharmony_ci}
401962306a36Sopenharmony_ci
402062306a36Sopenharmony_ci/*
402162306a36Sopenharmony_ci * Sets up an io_uring context and returns the fd. Applications ask for a
402262306a36Sopenharmony_ci * ring size; we return the actual sq/cq ring sizes (among other things) in the
402362306a36Sopenharmony_ci * params structure passed in.
402462306a36Sopenharmony_ci */
402562306a36Sopenharmony_cistatic long io_uring_setup(u32 entries, struct io_uring_params __user *params)
402662306a36Sopenharmony_ci{
402762306a36Sopenharmony_ci	struct io_uring_params p;
402862306a36Sopenharmony_ci	int i;
402962306a36Sopenharmony_ci
403062306a36Sopenharmony_ci	if (copy_from_user(&p, params, sizeof(p)))
403162306a36Sopenharmony_ci		return -EFAULT;
403262306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(p.resv); i++) {
403362306a36Sopenharmony_ci		if (p.resv[i])
403462306a36Sopenharmony_ci			return -EINVAL;
403562306a36Sopenharmony_ci	}
403662306a36Sopenharmony_ci
403762306a36Sopenharmony_ci	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
403862306a36Sopenharmony_ci			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
403962306a36Sopenharmony_ci			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
404062306a36Sopenharmony_ci			IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
404162306a36Sopenharmony_ci			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
404262306a36Sopenharmony_ci			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
404362306a36Sopenharmony_ci			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
404462306a36Sopenharmony_ci			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
404562306a36Sopenharmony_ci			IORING_SETUP_NO_SQARRAY))
404662306a36Sopenharmony_ci		return -EINVAL;
404762306a36Sopenharmony_ci
404862306a36Sopenharmony_ci	return io_uring_create(entries, &p, params);
404962306a36Sopenharmony_ci}
405062306a36Sopenharmony_ci
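/*
 * Apply the io_uring_disabled sysctl: 2 disables io_uring entirely, 0 allows
 * everyone, and 1 restricts it to CAP_SYS_ADMIN or members of the group set
 * in sysctl_io_uring_group.
 */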
405162306a36Sopenharmony_cistatic inline bool io_uring_allowed(void)
405262306a36Sopenharmony_ci{
405362306a36Sopenharmony_ci	int disabled = READ_ONCE(sysctl_io_uring_disabled);
405462306a36Sopenharmony_ci	kgid_t io_uring_group;
405562306a36Sopenharmony_ci
405662306a36Sopenharmony_ci	if (disabled == 2)
405762306a36Sopenharmony_ci		return false;
405862306a36Sopenharmony_ci
405962306a36Sopenharmony_ci	if (disabled == 0 || capable(CAP_SYS_ADMIN))
406062306a36Sopenharmony_ci		return true;
406162306a36Sopenharmony_ci
406262306a36Sopenharmony_ci	io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
406362306a36Sopenharmony_ci	if (!gid_valid(io_uring_group))
406462306a36Sopenharmony_ci		return false;
406562306a36Sopenharmony_ci
406662306a36Sopenharmony_ci	return in_group_p(io_uring_group);
406762306a36Sopenharmony_ci}
406862306a36Sopenharmony_ci
406962306a36Sopenharmony_ciSYSCALL_DEFINE2(io_uring_setup, u32, entries,
407062306a36Sopenharmony_ci		struct io_uring_params __user *, params)
407162306a36Sopenharmony_ci{
407262306a36Sopenharmony_ci	if (!io_uring_allowed())
407362306a36Sopenharmony_ci		return -EPERM;
407462306a36Sopenharmony_ci
407562306a36Sopenharmony_ci	return io_uring_setup(entries, params);
407662306a36Sopenharmony_ci}
407762306a36Sopenharmony_ci
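/* IORING_REGISTER_PROBE: report which opcodes this kernel supports */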
407862306a36Sopenharmony_cistatic __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
407962306a36Sopenharmony_ci			   unsigned nr_args)
408062306a36Sopenharmony_ci{
408162306a36Sopenharmony_ci	struct io_uring_probe *p;
408262306a36Sopenharmony_ci	size_t size;
408362306a36Sopenharmony_ci	int i, ret;
408462306a36Sopenharmony_ci
408562306a36Sopenharmony_ci	size = struct_size(p, ops, nr_args);
408662306a36Sopenharmony_ci	if (size == SIZE_MAX)
408762306a36Sopenharmony_ci		return -EOVERFLOW;
408862306a36Sopenharmony_ci	p = kzalloc(size, GFP_KERNEL);
408962306a36Sopenharmony_ci	if (!p)
409062306a36Sopenharmony_ci		return -ENOMEM;
409162306a36Sopenharmony_ci
409262306a36Sopenharmony_ci	ret = -EFAULT;
409362306a36Sopenharmony_ci	if (copy_from_user(p, arg, size))
409462306a36Sopenharmony_ci		goto out;
409562306a36Sopenharmony_ci	ret = -EINVAL;
409662306a36Sopenharmony_ci	if (memchr_inv(p, 0, size))
409762306a36Sopenharmony_ci		goto out;
409862306a36Sopenharmony_ci
409962306a36Sopenharmony_ci	p->last_op = IORING_OP_LAST - 1;
410062306a36Sopenharmony_ci	if (nr_args > IORING_OP_LAST)
410162306a36Sopenharmony_ci		nr_args = IORING_OP_LAST;
410262306a36Sopenharmony_ci
410362306a36Sopenharmony_ci	for (i = 0; i < nr_args; i++) {
410462306a36Sopenharmony_ci		p->ops[i].op = i;
410562306a36Sopenharmony_ci		if (!io_issue_defs[i].not_supported)
410662306a36Sopenharmony_ci			p->ops[i].flags = IO_URING_OP_SUPPORTED;
410762306a36Sopenharmony_ci	}
410862306a36Sopenharmony_ci	p->ops_len = i;
410962306a36Sopenharmony_ci
411062306a36Sopenharmony_ci	ret = 0;
411162306a36Sopenharmony_ci	if (copy_to_user(arg, p, size))
411262306a36Sopenharmony_ci		ret = -EFAULT;
411362306a36Sopenharmony_ciout:
411462306a36Sopenharmony_ci	kfree(p);
411562306a36Sopenharmony_ci	return ret;
411662306a36Sopenharmony_ci}
411762306a36Sopenharmony_ci
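/* Register the current credentials under an id that SQEs can later reference */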
411862306a36Sopenharmony_cistatic int io_register_personality(struct io_ring_ctx *ctx)
411962306a36Sopenharmony_ci{
412062306a36Sopenharmony_ci	const struct cred *creds;
412162306a36Sopenharmony_ci	u32 id;
412262306a36Sopenharmony_ci	int ret;
412362306a36Sopenharmony_ci
412462306a36Sopenharmony_ci	creds = get_current_cred();
412562306a36Sopenharmony_ci
412662306a36Sopenharmony_ci	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
412762306a36Sopenharmony_ci			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
412862306a36Sopenharmony_ci	if (ret < 0) {
412962306a36Sopenharmony_ci		put_cred(creds);
413062306a36Sopenharmony_ci		return ret;
413162306a36Sopenharmony_ci	}
413262306a36Sopenharmony_ci	return id;
413362306a36Sopenharmony_ci}
413462306a36Sopenharmony_ci
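/*
 * IORING_REGISTER_RESTRICTIONS: record which register opcodes, SQE opcodes
 * and SQE flags remain allowed once a ring created with
 * IORING_SETUP_R_DISABLED is enabled. Only a single registration is allowed.
 */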
413562306a36Sopenharmony_cistatic __cold int io_register_restrictions(struct io_ring_ctx *ctx,
413662306a36Sopenharmony_ci					   void __user *arg, unsigned int nr_args)
413762306a36Sopenharmony_ci{
413862306a36Sopenharmony_ci	struct io_uring_restriction *res;
413962306a36Sopenharmony_ci	size_t size;
414062306a36Sopenharmony_ci	int i, ret;
414162306a36Sopenharmony_ci
414262306a36Sopenharmony_ci	/* Restrictions allowed only if rings started disabled */
414362306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
414462306a36Sopenharmony_ci		return -EBADFD;
414562306a36Sopenharmony_ci
414662306a36Sopenharmony_ci	/* We allow only a single restrictions registration */
414762306a36Sopenharmony_ci	if (ctx->restrictions.registered)
414862306a36Sopenharmony_ci		return -EBUSY;
414962306a36Sopenharmony_ci
415062306a36Sopenharmony_ci	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
415162306a36Sopenharmony_ci		return -EINVAL;
415262306a36Sopenharmony_ci
415362306a36Sopenharmony_ci	size = array_size(nr_args, sizeof(*res));
415462306a36Sopenharmony_ci	if (size == SIZE_MAX)
415562306a36Sopenharmony_ci		return -EOVERFLOW;
415662306a36Sopenharmony_ci
415762306a36Sopenharmony_ci	res = memdup_user(arg, size);
415862306a36Sopenharmony_ci	if (IS_ERR(res))
415962306a36Sopenharmony_ci		return PTR_ERR(res);
416062306a36Sopenharmony_ci
416162306a36Sopenharmony_ci	ret = 0;
416262306a36Sopenharmony_ci
416362306a36Sopenharmony_ci	for (i = 0; i < nr_args; i++) {
416462306a36Sopenharmony_ci		switch (res[i].opcode) {
416562306a36Sopenharmony_ci		case IORING_RESTRICTION_REGISTER_OP:
416662306a36Sopenharmony_ci			if (res[i].register_op >= IORING_REGISTER_LAST) {
416762306a36Sopenharmony_ci				ret = -EINVAL;
416862306a36Sopenharmony_ci				goto out;
416962306a36Sopenharmony_ci			}
417062306a36Sopenharmony_ci
417162306a36Sopenharmony_ci			__set_bit(res[i].register_op,
417262306a36Sopenharmony_ci				  ctx->restrictions.register_op);
417362306a36Sopenharmony_ci			break;
417462306a36Sopenharmony_ci		case IORING_RESTRICTION_SQE_OP:
417562306a36Sopenharmony_ci			if (res[i].sqe_op >= IORING_OP_LAST) {
417662306a36Sopenharmony_ci				ret = -EINVAL;
417762306a36Sopenharmony_ci				goto out;
417862306a36Sopenharmony_ci			}
417962306a36Sopenharmony_ci
418062306a36Sopenharmony_ci			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
418162306a36Sopenharmony_ci			break;
418262306a36Sopenharmony_ci		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
418362306a36Sopenharmony_ci			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
418462306a36Sopenharmony_ci			break;
418562306a36Sopenharmony_ci		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
418662306a36Sopenharmony_ci			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
418762306a36Sopenharmony_ci			break;
418862306a36Sopenharmony_ci		default:
418962306a36Sopenharmony_ci			ret = -EINVAL;
419062306a36Sopenharmony_ci			goto out;
419162306a36Sopenharmony_ci		}
419262306a36Sopenharmony_ci	}
419362306a36Sopenharmony_ci
419462306a36Sopenharmony_ciout:
419562306a36Sopenharmony_ci	/* Reset all restrictions if an error happened */
419662306a36Sopenharmony_ci	if (ret != 0)
419762306a36Sopenharmony_ci		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
419862306a36Sopenharmony_ci	else
419962306a36Sopenharmony_ci		ctx->restrictions.registered = true;
420062306a36Sopenharmony_ci
420162306a36Sopenharmony_ci	kfree(res);
420262306a36Sopenharmony_ci	return ret;
420362306a36Sopenharmony_ci}
420462306a36Sopenharmony_ci
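/*
 * IORING_REGISTER_ENABLE_RINGS: clear IORING_SETUP_R_DISABLED, latch the
 * submitter task for SINGLE_ISSUER rings, activate any registered
 * restrictions and wake a waiting SQPOLL thread.
 */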
420562306a36Sopenharmony_cistatic int io_register_enable_rings(struct io_ring_ctx *ctx)
420662306a36Sopenharmony_ci{
420762306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
420862306a36Sopenharmony_ci		return -EBADFD;
420962306a36Sopenharmony_ci
421062306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
421162306a36Sopenharmony_ci		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
421262306a36Sopenharmony_ci		/*
421362306a36Sopenharmony_ci		 * Lazy activation attempts would fail if the ring was polled
421462306a36Sopenharmony_ci		 * before submitter_task is set.
421562306a36Sopenharmony_ci		 */
421662306a36Sopenharmony_ci		if (wq_has_sleeper(&ctx->poll_wq))
421762306a36Sopenharmony_ci			io_activate_pollwq(ctx);
421862306a36Sopenharmony_ci	}
421962306a36Sopenharmony_ci
422062306a36Sopenharmony_ci	if (ctx->restrictions.registered)
422162306a36Sopenharmony_ci		ctx->restricted = 1;
422262306a36Sopenharmony_ci
422362306a36Sopenharmony_ci	ctx->flags &= ~IORING_SETUP_R_DISABLED;
422462306a36Sopenharmony_ci	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
422562306a36Sopenharmony_ci		wake_up(&ctx->sq_data->wait);
422662306a36Sopenharmony_ci	return 0;
422762306a36Sopenharmony_ci}
422862306a36Sopenharmony_ci
422962306a36Sopenharmony_cistatic __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
423062306a36Sopenharmony_ci					 cpumask_var_t new_mask)
423162306a36Sopenharmony_ci{
423262306a36Sopenharmony_ci	int ret;
423362306a36Sopenharmony_ci
423462306a36Sopenharmony_ci	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
423562306a36Sopenharmony_ci		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
423662306a36Sopenharmony_ci	} else {
423762306a36Sopenharmony_ci		mutex_unlock(&ctx->uring_lock);
423862306a36Sopenharmony_ci		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
423962306a36Sopenharmony_ci		mutex_lock(&ctx->uring_lock);
424062306a36Sopenharmony_ci	}
424162306a36Sopenharmony_ci
424262306a36Sopenharmony_ci	return ret;
424362306a36Sopenharmony_ci}
424462306a36Sopenharmony_ci
424562306a36Sopenharmony_cistatic __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
424662306a36Sopenharmony_ci				       void __user *arg, unsigned len)
424762306a36Sopenharmony_ci{
424862306a36Sopenharmony_ci	cpumask_var_t new_mask;
424962306a36Sopenharmony_ci	int ret;
425062306a36Sopenharmony_ci
425162306a36Sopenharmony_ci	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
425262306a36Sopenharmony_ci		return -ENOMEM;
425362306a36Sopenharmony_ci
425462306a36Sopenharmony_ci	cpumask_clear(new_mask);
425562306a36Sopenharmony_ci	if (len > cpumask_size())
425662306a36Sopenharmony_ci		len = cpumask_size();
425762306a36Sopenharmony_ci
425862306a36Sopenharmony_ci	if (in_compat_syscall()) {
425962306a36Sopenharmony_ci		ret = compat_get_bitmap(cpumask_bits(new_mask),
426062306a36Sopenharmony_ci					(const compat_ulong_t __user *)arg,
426162306a36Sopenharmony_ci					len * 8 /* CHAR_BIT */);
426262306a36Sopenharmony_ci	} else {
426362306a36Sopenharmony_ci		ret = copy_from_user(new_mask, arg, len);
426462306a36Sopenharmony_ci	}
426562306a36Sopenharmony_ci
426662306a36Sopenharmony_ci	if (ret) {
426762306a36Sopenharmony_ci		free_cpumask_var(new_mask);
426862306a36Sopenharmony_ci		return -EFAULT;
426962306a36Sopenharmony_ci	}
427062306a36Sopenharmony_ci
427162306a36Sopenharmony_ci	ret = __io_register_iowq_aff(ctx, new_mask);
427262306a36Sopenharmony_ci	free_cpumask_var(new_mask);
427362306a36Sopenharmony_ci	return ret;
427462306a36Sopenharmony_ci}
427562306a36Sopenharmony_ci
427662306a36Sopenharmony_cistatic __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
427762306a36Sopenharmony_ci{
427862306a36Sopenharmony_ci	return __io_register_iowq_aff(ctx, NULL);
427962306a36Sopenharmony_ci}
428062306a36Sopenharmony_ci
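/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: update the bounded/unbounded io-wq
 * worker limits (a count of 0 leaves that limit untouched) and copy the
 * resulting values back to userspace.
 */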
428162306a36Sopenharmony_cistatic __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
428262306a36Sopenharmony_ci					       void __user *arg)
428362306a36Sopenharmony_ci	__must_hold(&ctx->uring_lock)
428462306a36Sopenharmony_ci{
428562306a36Sopenharmony_ci	struct io_tctx_node *node;
428662306a36Sopenharmony_ci	struct io_uring_task *tctx = NULL;
428762306a36Sopenharmony_ci	struct io_sq_data *sqd = NULL;
428862306a36Sopenharmony_ci	__u32 new_count[2];
428962306a36Sopenharmony_ci	int i, ret;
429062306a36Sopenharmony_ci
429162306a36Sopenharmony_ci	if (copy_from_user(new_count, arg, sizeof(new_count)))
429262306a36Sopenharmony_ci		return -EFAULT;
429362306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(new_count); i++)
429462306a36Sopenharmony_ci		if (new_count[i] > INT_MAX)
429562306a36Sopenharmony_ci			return -EINVAL;
429662306a36Sopenharmony_ci
429762306a36Sopenharmony_ci	if (ctx->flags & IORING_SETUP_SQPOLL) {
429862306a36Sopenharmony_ci		sqd = ctx->sq_data;
429962306a36Sopenharmony_ci		if (sqd) {
430062306a36Sopenharmony_ci			/*
430162306a36Sopenharmony_ci			 * Observe the correct sqd->lock -> ctx->uring_lock
430262306a36Sopenharmony_ci			 * ordering. It's fine to drop uring_lock here; we hold
430362306a36Sopenharmony_ci			 * a ref to the ctx.
430462306a36Sopenharmony_ci			 */
430562306a36Sopenharmony_ci			refcount_inc(&sqd->refs);
430662306a36Sopenharmony_ci			mutex_unlock(&ctx->uring_lock);
430762306a36Sopenharmony_ci			mutex_lock(&sqd->lock);
430862306a36Sopenharmony_ci			mutex_lock(&ctx->uring_lock);
430962306a36Sopenharmony_ci			if (sqd->thread)
431062306a36Sopenharmony_ci				tctx = sqd->thread->io_uring;
431162306a36Sopenharmony_ci		}
431262306a36Sopenharmony_ci	} else {
431362306a36Sopenharmony_ci		tctx = current->io_uring;
431462306a36Sopenharmony_ci	}
431562306a36Sopenharmony_ci
431662306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));
431762306a36Sopenharmony_ci
431862306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(new_count); i++)
431962306a36Sopenharmony_ci		if (new_count[i])
432062306a36Sopenharmony_ci			ctx->iowq_limits[i] = new_count[i];
432162306a36Sopenharmony_ci	ctx->iowq_limits_set = true;
432262306a36Sopenharmony_ci
432362306a36Sopenharmony_ci	if (tctx && tctx->io_wq) {
432462306a36Sopenharmony_ci		ret = io_wq_max_workers(tctx->io_wq, new_count);
432562306a36Sopenharmony_ci		if (ret)
432662306a36Sopenharmony_ci			goto err;
432762306a36Sopenharmony_ci	} else {
432862306a36Sopenharmony_ci		memset(new_count, 0, sizeof(new_count));
432962306a36Sopenharmony_ci	}
433062306a36Sopenharmony_ci
433162306a36Sopenharmony_ci	if (sqd) {
433262306a36Sopenharmony_ci		mutex_unlock(&sqd->lock);
433362306a36Sopenharmony_ci		io_put_sq_data(sqd);
433462306a36Sopenharmony_ci	}
433562306a36Sopenharmony_ci
433662306a36Sopenharmony_ci	if (copy_to_user(arg, new_count, sizeof(new_count)))
433762306a36Sopenharmony_ci		return -EFAULT;
433862306a36Sopenharmony_ci
433962306a36Sopenharmony_ci	/* that's it for SQPOLL; only the SQPOLL task creates requests */
434062306a36Sopenharmony_ci	if (sqd)
434162306a36Sopenharmony_ci		return 0;
434262306a36Sopenharmony_ci
434362306a36Sopenharmony_ci	/* now propagate the restriction to all registered users */
434462306a36Sopenharmony_ci	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
434562306a36Sopenharmony_ci		struct io_uring_task *tctx = node->task->io_uring;
434662306a36Sopenharmony_ci
434762306a36Sopenharmony_ci		if (WARN_ON_ONCE(!tctx->io_wq))
434862306a36Sopenharmony_ci			continue;
434962306a36Sopenharmony_ci
435062306a36Sopenharmony_ci		for (i = 0; i < ARRAY_SIZE(new_count); i++)
435162306a36Sopenharmony_ci			new_count[i] = ctx->iowq_limits[i];
435262306a36Sopenharmony_ci		/* ignore errors; it always returns zero anyway */
435362306a36Sopenharmony_ci		(void)io_wq_max_workers(tctx->io_wq, new_count);
435462306a36Sopenharmony_ci	}
435562306a36Sopenharmony_ci	return 0;
435662306a36Sopenharmony_cierr:
435762306a36Sopenharmony_ci	if (sqd) {
435862306a36Sopenharmony_ci		mutex_unlock(&sqd->lock);
435962306a36Sopenharmony_ci		io_put_sq_data(sqd);
436062306a36Sopenharmony_ci	}
436162306a36Sopenharmony_ci	return ret;
436262306a36Sopenharmony_ci}
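
/*
 * Userspace sketch (illustrative only, assuming liburing provides
 * io_uring_register_iowq_max_workers()): query the current limits by
 * passing zeroes, then cap the unbounded workers.
 *
 *	#include <liburing.h>
 *
 *	static int cap_unbounded_workers(struct io_uring *ring)
 *	{
 *		unsigned int vals[2] = { 0, 0 };
 *		int ret;
 *
 *		// zeroes only report the current limits back in vals[]
 *		ret = io_uring_register_iowq_max_workers(ring, vals);
 *		if (ret)
 *			return ret;
 *
 *		vals[0] = 0;	// leave bounded workers as-is
 *		vals[1] = 4;	// allow at most 4 unbounded workers
 *		return io_uring_register_iowq_max_workers(ring, vals);
 *	}
 */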
436362306a36Sopenharmony_ci
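/*
 * Central dispatcher for io_uring_register(2) opcodes. Called with
 * uring_lock held; individual handlers may drop and re-acquire it.
 */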
436462306a36Sopenharmony_cistatic int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
436562306a36Sopenharmony_ci			       void __user *arg, unsigned nr_args)
436662306a36Sopenharmony_ci	__releases(ctx->uring_lock)
436762306a36Sopenharmony_ci	__acquires(ctx->uring_lock)
436862306a36Sopenharmony_ci{
436962306a36Sopenharmony_ci	int ret;
437062306a36Sopenharmony_ci
437162306a36Sopenharmony_ci	/*
437262306a36Sopenharmony_ci	 * We no longer quiesce the refs for register, so the ring can't be
437362306a36Sopenharmony_ci	 * dying while we hold a file reference here.
437462306a36Sopenharmony_ci	 */
437562306a36Sopenharmony_ci	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
437662306a36Sopenharmony_ci		return -ENXIO;
437762306a36Sopenharmony_ci
437862306a36Sopenharmony_ci	if (ctx->submitter_task && ctx->submitter_task != current)
437962306a36Sopenharmony_ci		return -EEXIST;
438062306a36Sopenharmony_ci
438162306a36Sopenharmony_ci	if (ctx->restricted) {
438262306a36Sopenharmony_ci		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
438362306a36Sopenharmony_ci		if (!test_bit(opcode, ctx->restrictions.register_op))
438462306a36Sopenharmony_ci			return -EACCES;
438562306a36Sopenharmony_ci	}
438662306a36Sopenharmony_ci
438762306a36Sopenharmony_ci	switch (opcode) {
438862306a36Sopenharmony_ci	case IORING_REGISTER_BUFFERS:
438962306a36Sopenharmony_ci		ret = -EFAULT;
439062306a36Sopenharmony_ci		if (!arg)
439162306a36Sopenharmony_ci			break;
439262306a36Sopenharmony_ci		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
439362306a36Sopenharmony_ci		break;
439462306a36Sopenharmony_ci	case IORING_UNREGISTER_BUFFERS:
439562306a36Sopenharmony_ci		ret = -EINVAL;
439662306a36Sopenharmony_ci		if (arg || nr_args)
439762306a36Sopenharmony_ci			break;
439862306a36Sopenharmony_ci		ret = io_sqe_buffers_unregister(ctx);
439962306a36Sopenharmony_ci		break;
440062306a36Sopenharmony_ci	case IORING_REGISTER_FILES:
440162306a36Sopenharmony_ci		ret = -EFAULT;
440262306a36Sopenharmony_ci		if (!arg)
440362306a36Sopenharmony_ci			break;
440462306a36Sopenharmony_ci		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
440562306a36Sopenharmony_ci		break;
440662306a36Sopenharmony_ci	case IORING_UNREGISTER_FILES:
440762306a36Sopenharmony_ci		ret = -EINVAL;
440862306a36Sopenharmony_ci		if (arg || nr_args)
440962306a36Sopenharmony_ci			break;
441062306a36Sopenharmony_ci		ret = io_sqe_files_unregister(ctx);
441162306a36Sopenharmony_ci		break;
441262306a36Sopenharmony_ci	case IORING_REGISTER_FILES_UPDATE:
441362306a36Sopenharmony_ci		ret = io_register_files_update(ctx, arg, nr_args);
441462306a36Sopenharmony_ci		break;
441562306a36Sopenharmony_ci	case IORING_REGISTER_EVENTFD:
441662306a36Sopenharmony_ci		ret = -EINVAL;
441762306a36Sopenharmony_ci		if (nr_args != 1)
441862306a36Sopenharmony_ci			break;
441962306a36Sopenharmony_ci		ret = io_eventfd_register(ctx, arg, 0);
442062306a36Sopenharmony_ci		break;
442162306a36Sopenharmony_ci	case IORING_REGISTER_EVENTFD_ASYNC:
442262306a36Sopenharmony_ci		ret = -EINVAL;
442362306a36Sopenharmony_ci		if (nr_args != 1)
442462306a36Sopenharmony_ci			break;
442562306a36Sopenharmony_ci		ret = io_eventfd_register(ctx, arg, 1);
442662306a36Sopenharmony_ci		break;
442762306a36Sopenharmony_ci	case IORING_UNREGISTER_EVENTFD:
442862306a36Sopenharmony_ci		ret = -EINVAL;
442962306a36Sopenharmony_ci		if (arg || nr_args)
443062306a36Sopenharmony_ci			break;
443162306a36Sopenharmony_ci		ret = io_eventfd_unregister(ctx);
443262306a36Sopenharmony_ci		break;
443362306a36Sopenharmony_ci	case IORING_REGISTER_PROBE:
443462306a36Sopenharmony_ci		ret = -EINVAL;
443562306a36Sopenharmony_ci		if (!arg || nr_args > 256)
443662306a36Sopenharmony_ci			break;
443762306a36Sopenharmony_ci		ret = io_probe(ctx, arg, nr_args);
443862306a36Sopenharmony_ci		break;
443962306a36Sopenharmony_ci	case IORING_REGISTER_PERSONALITY:
444062306a36Sopenharmony_ci		ret = -EINVAL;
444162306a36Sopenharmony_ci		if (arg || nr_args)
444262306a36Sopenharmony_ci			break;
444362306a36Sopenharmony_ci		ret = io_register_personality(ctx);
444462306a36Sopenharmony_ci		break;
444562306a36Sopenharmony_ci	case IORING_UNREGISTER_PERSONALITY:
444662306a36Sopenharmony_ci		ret = -EINVAL;
444762306a36Sopenharmony_ci		if (arg)
444862306a36Sopenharmony_ci			break;
444962306a36Sopenharmony_ci		ret = io_unregister_personality(ctx, nr_args);
445062306a36Sopenharmony_ci		break;
445162306a36Sopenharmony_ci	case IORING_REGISTER_ENABLE_RINGS:
445262306a36Sopenharmony_ci		ret = -EINVAL;
445362306a36Sopenharmony_ci		if (arg || nr_args)
445462306a36Sopenharmony_ci			break;
445562306a36Sopenharmony_ci		ret = io_register_enable_rings(ctx);
445662306a36Sopenharmony_ci		break;
445762306a36Sopenharmony_ci	case IORING_REGISTER_RESTRICTIONS:
445862306a36Sopenharmony_ci		ret = io_register_restrictions(ctx, arg, nr_args);
445962306a36Sopenharmony_ci		break;
446062306a36Sopenharmony_ci	case IORING_REGISTER_FILES2:
446162306a36Sopenharmony_ci		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
446262306a36Sopenharmony_ci		break;
446362306a36Sopenharmony_ci	case IORING_REGISTER_FILES_UPDATE2:
446462306a36Sopenharmony_ci		ret = io_register_rsrc_update(ctx, arg, nr_args,
446562306a36Sopenharmony_ci					      IORING_RSRC_FILE);
446662306a36Sopenharmony_ci		break;
446762306a36Sopenharmony_ci	case IORING_REGISTER_BUFFERS2:
446862306a36Sopenharmony_ci		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
446962306a36Sopenharmony_ci		break;
447062306a36Sopenharmony_ci	case IORING_REGISTER_BUFFERS_UPDATE:
447162306a36Sopenharmony_ci		ret = io_register_rsrc_update(ctx, arg, nr_args,
447262306a36Sopenharmony_ci					      IORING_RSRC_BUFFER);
447362306a36Sopenharmony_ci		break;
447462306a36Sopenharmony_ci	case IORING_REGISTER_IOWQ_AFF:
447562306a36Sopenharmony_ci		ret = -EINVAL;
447662306a36Sopenharmony_ci		if (!arg || !nr_args)
447762306a36Sopenharmony_ci			break;
447862306a36Sopenharmony_ci		ret = io_register_iowq_aff(ctx, arg, nr_args);
447962306a36Sopenharmony_ci		break;
448062306a36Sopenharmony_ci	case IORING_UNREGISTER_IOWQ_AFF:
448162306a36Sopenharmony_ci		ret = -EINVAL;
448262306a36Sopenharmony_ci		if (arg || nr_args)
448362306a36Sopenharmony_ci			break;
448462306a36Sopenharmony_ci		ret = io_unregister_iowq_aff(ctx);
448562306a36Sopenharmony_ci		break;
448662306a36Sopenharmony_ci	case IORING_REGISTER_IOWQ_MAX_WORKERS:
448762306a36Sopenharmony_ci		ret = -EINVAL;
448862306a36Sopenharmony_ci		if (!arg || nr_args != 2)
448962306a36Sopenharmony_ci			break;
449062306a36Sopenharmony_ci		ret = io_register_iowq_max_workers(ctx, arg);
449162306a36Sopenharmony_ci		break;
449262306a36Sopenharmony_ci	case IORING_REGISTER_RING_FDS:
449362306a36Sopenharmony_ci		ret = io_ringfd_register(ctx, arg, nr_args);
449462306a36Sopenharmony_ci		break;
449562306a36Sopenharmony_ci	case IORING_UNREGISTER_RING_FDS:
449662306a36Sopenharmony_ci		ret = io_ringfd_unregister(ctx, arg, nr_args);
449762306a36Sopenharmony_ci		break;
449862306a36Sopenharmony_ci	case IORING_REGISTER_PBUF_RING:
449962306a36Sopenharmony_ci		ret = -EINVAL;
450062306a36Sopenharmony_ci		if (!arg || nr_args != 1)
450162306a36Sopenharmony_ci			break;
450262306a36Sopenharmony_ci		ret = io_register_pbuf_ring(ctx, arg);
450362306a36Sopenharmony_ci		break;
450462306a36Sopenharmony_ci	case IORING_UNREGISTER_PBUF_RING:
450562306a36Sopenharmony_ci		ret = -EINVAL;
450662306a36Sopenharmony_ci		if (!arg || nr_args != 1)
450762306a36Sopenharmony_ci			break;
450862306a36Sopenharmony_ci		ret = io_unregister_pbuf_ring(ctx, arg);
450962306a36Sopenharmony_ci		break;
451062306a36Sopenharmony_ci	case IORING_REGISTER_SYNC_CANCEL:
451162306a36Sopenharmony_ci		ret = -EINVAL;
451262306a36Sopenharmony_ci		if (!arg || nr_args != 1)
451362306a36Sopenharmony_ci			break;
451462306a36Sopenharmony_ci		ret = io_sync_cancel(ctx, arg);
451562306a36Sopenharmony_ci		break;
451662306a36Sopenharmony_ci	case IORING_REGISTER_FILE_ALLOC_RANGE:
451762306a36Sopenharmony_ci		ret = -EINVAL;
451862306a36Sopenharmony_ci		if (!arg || nr_args)
451962306a36Sopenharmony_ci			break;
452062306a36Sopenharmony_ci		ret = io_register_file_alloc_range(ctx, arg);
452162306a36Sopenharmony_ci		break;
452262306a36Sopenharmony_ci	default:
452362306a36Sopenharmony_ci		ret = -EINVAL;
452462306a36Sopenharmony_ci		break;
452562306a36Sopenharmony_ci	}
452662306a36Sopenharmony_ci
452762306a36Sopenharmony_ci	return ret;
452862306a36Sopenharmony_ci}
452962306a36Sopenharmony_ci
453062306a36Sopenharmony_ciSYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
453162306a36Sopenharmony_ci		void __user *, arg, unsigned int, nr_args)
453262306a36Sopenharmony_ci{
453362306a36Sopenharmony_ci	struct io_ring_ctx *ctx;
453462306a36Sopenharmony_ci	long ret = -EBADF;
453562306a36Sopenharmony_ci	struct file *file;
453662306a36Sopenharmony_ci	bool use_registered_ring;
453762306a36Sopenharmony_ci
453862306a36Sopenharmony_ci	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
453962306a36Sopenharmony_ci	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
454062306a36Sopenharmony_ci
454162306a36Sopenharmony_ci	if (opcode >= IORING_REGISTER_LAST)
454262306a36Sopenharmony_ci		return -EINVAL;
454362306a36Sopenharmony_ci
454462306a36Sopenharmony_ci	if (use_registered_ring) {
454562306a36Sopenharmony_ci		/*
454662306a36Sopenharmony_ci		 * The ring fd has been registered via IORING_REGISTER_RING_FDS,
454762306a36Sopenharmony_ci		 * so we only need to dereference our task-private array to find it.
454862306a36Sopenharmony_ci		 */
454962306a36Sopenharmony_ci		struct io_uring_task *tctx = current->io_uring;
455062306a36Sopenharmony_ci
455162306a36Sopenharmony_ci		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
455262306a36Sopenharmony_ci			return -EINVAL;
455362306a36Sopenharmony_ci		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
455462306a36Sopenharmony_ci		file = tctx->registered_rings[fd];
455562306a36Sopenharmony_ci		if (unlikely(!file))
455662306a36Sopenharmony_ci			return -EBADF;
455762306a36Sopenharmony_ci	} else {
455862306a36Sopenharmony_ci		file = fget(fd);
455962306a36Sopenharmony_ci		if (unlikely(!file))
456062306a36Sopenharmony_ci			return -EBADF;
456162306a36Sopenharmony_ci		ret = -EOPNOTSUPP;
456262306a36Sopenharmony_ci		if (!io_is_uring_fops(file))
456362306a36Sopenharmony_ci			goto out_fput;
456462306a36Sopenharmony_ci	}
456562306a36Sopenharmony_ci
456662306a36Sopenharmony_ci	ctx = file->private_data;
456762306a36Sopenharmony_ci
456862306a36Sopenharmony_ci	mutex_lock(&ctx->uring_lock);
456962306a36Sopenharmony_ci	ret = __io_uring_register(ctx, opcode, arg, nr_args);
457062306a36Sopenharmony_ci	mutex_unlock(&ctx->uring_lock);
457162306a36Sopenharmony_ci	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
457262306a36Sopenharmony_ciout_fput:
457362306a36Sopenharmony_ci	if (!use_registered_ring)
457462306a36Sopenharmony_ci		fput(file);
457562306a36Sopenharmony_ci	return ret;
457662306a36Sopenharmony_ci}
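
/*
 * Userspace sketch (illustrative only): once a ring fd has been registered
 * with IORING_REGISTER_RING_FDS, later register calls can skip the fdget by
 * passing the registered index and OR-ing IORING_REGISTER_USE_REGISTERED_RING
 * into the opcode. Assuming a raw syscall wrapper named sys_io_uring_register():
 *
 *	unsigned int op = IORING_REGISTER_IOWQ_MAX_WORKERS |
 *			  IORING_REGISTER_USE_REGISTERED_RING;
 *	unsigned int vals[2] = { 8, 4 };
 *
 *	// 'reg_index' is the offset used with IORING_REGISTER_RING_FDS
 *	ret = sys_io_uring_register(reg_index, op, vals, 2);
 */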
457762306a36Sopenharmony_ci
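/*
 * One-time init: compile-time checks that the SQE layout and related UAPI
 * structures match the documented ABI, opcode table setup, and creation of
 * the io_kiocb slab with a hardened-usercopy window over the cmd area.
 */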
457862306a36Sopenharmony_cistatic int __init io_uring_init(void)
457962306a36Sopenharmony_ci{
458062306a36Sopenharmony_ci#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
458162306a36Sopenharmony_ci	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
458262306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
458362306a36Sopenharmony_ci} while (0)
458462306a36Sopenharmony_ci
458562306a36Sopenharmony_ci#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
458662306a36Sopenharmony_ci	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
458762306a36Sopenharmony_ci#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
458862306a36Sopenharmony_ci	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
458962306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
459062306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
459162306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
459262306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
459362306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
459462306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
459562306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
459662306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(8,  __u32,  cmd_op);
459762306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
459862306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
459962306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
460062306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(24, __u32,  len);
460162306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
460262306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
460362306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
460462306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
460562306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
460662306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
460762306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
460862306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
460962306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
461062306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
461162306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
461262306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
461362306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
461462306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
461562306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
461662306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  rename_flags);
461762306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  unlink_flags);
461862306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  hardlink_flags);
461962306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  xattr_flags);
462062306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(28, __u32,  msg_ring_flags);
462162306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
462262306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
462362306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
462462306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
462562306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
462662306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
462762306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
462862306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
462962306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
463062306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
463162306a36Sopenharmony_ci	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
463262306a36Sopenharmony_ci
463362306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
463462306a36Sopenharmony_ci		     sizeof(struct io_uring_rsrc_update));
463562306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
463662306a36Sopenharmony_ci		     sizeof(struct io_uring_rsrc_update2));
463762306a36Sopenharmony_ci
463862306a36Sopenharmony_ci	/* ->buf_index is u16 */
463962306a36Sopenharmony_ci	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
464062306a36Sopenharmony_ci	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
464162306a36Sopenharmony_ci		     offsetof(struct io_uring_buf_ring, tail));
464262306a36Sopenharmony_ci
464362306a36Sopenharmony_ci	/* should fit into one byte */
464462306a36Sopenharmony_ci	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
464562306a36Sopenharmony_ci	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
464662306a36Sopenharmony_ci	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);
464762306a36Sopenharmony_ci
464862306a36Sopenharmony_ci	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
464962306a36Sopenharmony_ci
465062306a36Sopenharmony_ci	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
465162306a36Sopenharmony_ci
465262306a36Sopenharmony_ci	io_uring_optable_init();
465362306a36Sopenharmony_ci
465462306a36Sopenharmony_ci	/*
465562306a36Sopenharmony_ci	 * Allow user copy in the per-command field, which starts after the
465662306a36Sopenharmony_ci	 * file member of io_kiocb and runs up to the opcode field. The openat2
465762306a36Sopenharmony_ci	 * handling requires copying user memory into the io_kiocb object in
465862306a36Sopenharmony_ci	 * that range, and HARDENED_USERCOPY will complain if we haven't
465962306a36Sopenharmony_ci	 * correctly annotated this range.
466062306a36Sopenharmony_ci	 */
466162306a36Sopenharmony_ci	req_cachep = kmem_cache_create_usercopy("io_kiocb",
466262306a36Sopenharmony_ci				sizeof(struct io_kiocb), 0,
466362306a36Sopenharmony_ci				SLAB_HWCACHE_ALIGN | SLAB_PANIC |
466462306a36Sopenharmony_ci				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
466562306a36Sopenharmony_ci				offsetof(struct io_kiocb, cmd.data),
466662306a36Sopenharmony_ci				sizeof_field(struct io_kiocb, cmd.data), NULL);
466762306a36Sopenharmony_ci
466862306a36Sopenharmony_ci#ifdef CONFIG_SYSCTL
466962306a36Sopenharmony_ci	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
467062306a36Sopenharmony_ci#endif
467162306a36Sopenharmony_ci
467262306a36Sopenharmony_ci	return 0;
467362306a36Sopenharmony_ci}
467462306a36Sopenharmony_ci__initcall(io_uring_init);