// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
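 *
 * As a purely illustrative sketch (not part of this file, using hypothetical
 * liburing-style names cq_head, cq_tail, cqes and cq_mask), a userspace
 * consumer honouring the rules above might reap completions like so:
 *
 *	unsigned head = *cq_head;			// only userspace writes head
 *	unsigned tail = smp_load_acquire(cq_tail);	// pairs with the kernel's tail store
 *	while (head != tail) {
 *		handle_cqe(&cqes[head & cq_mask]);	// entry loads ordered before the head store
 *		head++;
 *	}
 *	smp_store_release(cq_head, head);		// lets the kernel reuse those slots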
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/highmem.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"

#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"

#define IORING_MAX_ENTRIES	32768
#define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)

#define SQE_VALID_FLAGS	(SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \
			IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS)

#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \
				REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \
				REQ_F_ASYNC_DATA)

#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
				 IO_REQ_CLEAN_FLAGS)

#define IO_TCTX_REFS_CACHE_NR	(1U << 10)

#define IO_COMPL_BATCH			32
#define IO_REQ_ALLOC_BATCH		8

enum {
	IO_CHECK_CQ_OVERFLOW_BIT,
	IO_CHECK_CQ_DROPPED_BIT,
};

enum {
	IO_EVENTFD_OP_SIGNAL_BIT,
	IO_EVENTFD_OP_FREE_BIT,
};

struct io_defer_entry {
	struct list_head	list;
	struct io_kiocb		*req;
	u32			seq;
};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL)
#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK)

static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_queue_sqe(struct io_kiocb *req);

struct kmem_cache *req_cachep;

static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;

#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] = {
	{
		.procname	= "io_uring_disabled",
		.data		= &sysctl_io_uring_disabled,
		.maxlen		= sizeof(sysctl_io_uring_disabled),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "io_uring_group",
		.data		= &sysctl_io_uring_group,
		.maxlen		= sizeof(gid_t),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{},
};
#endif

static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
{
	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
	    ctx->submit_state.cqes_count)
		__io_submit_flush_completions(ctx);
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{
	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
}

static bool io_match_linked(struct io_kiocb *head)
{
	struct io_kiocb *req;

	io_for_each_link(req, head) {
		if (req->flags & REQ_F_INFLIGHT)
			return true;
	}
	return false;
}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all)
{
	bool matched;

	if (task && head->task != task)
		return false;
	if (cancel_all)
		return true;

	if (head->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = head->ctx;

		/* protect against races with linked timeouts */
		spin_lock_irq(&ctx->timeout_lock);
		matched = io_match_linked(head);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		matched = io_match_linked(head);
	}
	return matched;
}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{
	req_set_fail(req);
	io_req_set_res(req, res, 0);
}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list);
}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
	struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);

	complete(&ctx->ref_comp);
}

static __cold void io_fallback_req_func(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
						fallback_work.work);
	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
	struct io_kiocb *req, *tmp;
	struct io_tw_state ts = { .locked = true, };

	percpu_ref_get(&ctx->refs);
	mutex_lock(&ctx->uring_lock);
	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
		req->io_task_work.func(req, &ts);
	if (WARN_ON_ONCE(!ts.locked))
		return;
	io_submit_flush_completions(ctx);
	mutex_unlock(&ctx->uring_lock);
	percpu_ref_put(&ctx->refs);
}

static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{
	unsigned hash_buckets = 1U << bits;
	size_t hash_size = hash_buckets * sizeof(table->hbs[0]);

	table->hbs = kmalloc(hash_size, GFP_KERNEL);
	if (!table->hbs)
		return -ENOMEM;

	table->hash_bits = bits;
	init_hash_table(table, hash_buckets);
	return 0;
}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
	struct io_ring_ctx *ctx;
	int hash_bits;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return NULL;

	xa_init(&ctx->io_bl_xa);

	/*
	 * Use 5 bits less than the max cq entries, that should give us around
	 * 32 entries per hash list if totally full and uniformly spread, but
	 * cap the number of buckets so we don't over-consume memory.
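	 *
	 * As a worked example (illustrative numbers only): with
	 * p->cq_entries == 4096, ilog2() gives 12; subtracting 5 and
	 * clamping to [1, 8] leaves 7 bits, i.e. 128 buckets holding
	 * roughly 32 entries each when the table is completely full.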
	 */
	hash_bits = ilog2(p->cq_entries) - 5;
	hash_bits = clamp(hash_bits, 1, 8);
	if (io_alloc_hash_table(&ctx->cancel_table, hash_bits))
		goto err;
	if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits))
		goto err;
	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
			    0, GFP_KERNEL))
		goto err;

	ctx->flags = p->flags;
	init_waitqueue_head(&ctx->sqo_sq_wait);
	INIT_LIST_HEAD(&ctx->sqd_list);
	INIT_LIST_HEAD(&ctx->cq_overflow_list);
	INIT_LIST_HEAD(&ctx->io_buffers_cache);
	INIT_HLIST_HEAD(&ctx->io_buf_list);
	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
			    sizeof(struct io_rsrc_node));
	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct async_poll));
	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
			    sizeof(struct io_async_msghdr));
	init_completion(&ctx->ref_comp);
	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
	mutex_init(&ctx->uring_lock);
	init_waitqueue_head(&ctx->cq_wait);
	init_waitqueue_head(&ctx->poll_wq);
	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
	spin_lock_init(&ctx->completion_lock);
	spin_lock_init(&ctx->timeout_lock);
	INIT_WQ_LIST(&ctx->iopoll_list);
	INIT_LIST_HEAD(&ctx->io_buffers_pages);
	INIT_LIST_HEAD(&ctx->io_buffers_comp);
	INIT_LIST_HEAD(&ctx->defer_list);
	INIT_LIST_HEAD(&ctx->timeout_list);
	INIT_LIST_HEAD(&ctx->ltimeout_list);
	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
	init_llist_head(&ctx->work_llist);
	INIT_LIST_HEAD(&ctx->tctx_list);
	ctx->submit_state.free_list.next = NULL;
	INIT_WQ_LIST(&ctx->locked_free_list);
	INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
	return ctx;
err:
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
	return NULL;
}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
	ctx->cq_extra--;
}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{
	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
		struct io_ring_ctx *ctx = req->ctx;

		return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
	}

	return false;
}

static void io_clean_op(struct io_kiocb *req)
{
	if (req->flags & REQ_F_BUFFER_SELECTED) {
		spin_lock(&req->ctx->completion_lock);
		io_put_kbuf_comp(req);
		spin_unlock(&req->ctx->completion_lock);
	}

	if (req->flags & REQ_F_NEED_CLEANUP) {
		const struct io_cold_def *def = &io_cold_defs[req->opcode];

		if (def->cleanup)
			def->cleanup(req);
	}
	if ((req->flags & REQ_F_POLLED) && req->apoll) {
		kfree(req->apoll->double_poll);
		kfree(req->apoll);
		req->apoll = NULL;
	}
	if (req->flags & REQ_F_INFLIGHT) {
		struct io_uring_task *tctx = req->task->io_uring;

		atomic_dec(&tctx->inflight_tracked);
	}
	if (req->flags & REQ_F_CREDS)
		put_cred(req->creds);
	if (req->flags & REQ_F_ASYNC_DATA) {
		kfree(req->async_data);
		req->async_data = NULL;
	}
	req->flags &= ~IO_REQ_CLEAN_FLAGS;
}

static inline void io_req_track_inflight(struct io_kiocb *req)
{
	if (!(req->flags & REQ_F_INFLIGHT)) {
		req->flags |= REQ_F_INFLIGHT;
		atomic_inc(&req->task->io_uring->inflight_tracked);
	}
}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{
	if (WARN_ON_ONCE(!req->link))
		return NULL;

	req->flags &= ~REQ_F_ARM_LTIMEOUT;
	req->flags |= REQ_F_LINK_TIMEOUT;

	/* linked timeouts should have two refs once prep'ed */
	io_req_set_refcount(req);
	__io_req_set_refcount(req->link, 2);
	return req->link;
}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{
	if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT)))
		return NULL;
	return __io_prep_linked_timeout(req);
}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{
	io_queue_linked_timeout(__io_prep_linked_timeout(req));
}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{
	if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT))
		__io_arm_ltimeout(req);
}

static void io_prep_async_work(struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];
	struct io_ring_ctx *ctx = req->ctx;

	if (!(req->flags & REQ_F_CREDS)) {
		req->flags |= REQ_F_CREDS;
		req->creds = get_current_cred();
	}

	req->work.list.next = NULL;
	req->work.flags = 0;
	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
	if (req->flags & REQ_F_FORCE_ASYNC)
		req->work.flags |= IO_WQ_WORK_CONCURRENT;

	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(req->file);

	if (req->file && (req->flags & REQ_F_ISREG)) {
		bool should_hash = def->hash_reg_file;

		/* don't serialize this request if the fs doesn't need it */
		if (should_hash && (req->file->f_flags & O_DIRECT) &&
		    (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
			should_hash = false;
		if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
			io_wq_hash_work(&req->work, file_inode(req->file));
	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
		if (def->unbound_nonreg_file)
			req->work.flags |= IO_WQ_WORK_UNBOUND;
	}
}

static void io_prep_async_link(struct io_kiocb *req)
{
	struct io_kiocb *cur;

	if (req->flags & REQ_F_LINK_TIMEOUT) {
		struct io_ring_ctx *ctx = req->ctx;

		spin_lock_irq(&ctx->timeout_lock);
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
		spin_unlock_irq(&ctx->timeout_lock);
	} else {
		io_for_each_link(cur, req)
			io_prep_async_work(cur);
	}
}

void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use)
{
	struct io_kiocb *link = io_prep_linked_timeout(req);
	struct io_uring_task *tctx = req->task->io_uring;

	BUG_ON(!tctx);
	BUG_ON(!tctx->io_wq);

	/* init ->work of the whole link before punting */
	io_prep_async_link(req);

	/*
	 * Not expected to happen, but if we do have a bug where this _can_
	 * happen, catch it here and ensure the request is marked as
	 * canceled. That will make io-wq go through the usual work cancel
	 * procedure rather than attempt to run this request (or create a new
	 * worker for it).
	 */
	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
		req->work.flags |= IO_WQ_WORK_CANCEL;

	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
	io_wq_enqueue(tctx->io_wq, &req->work);
	if (link)
		io_queue_linked_timeout(link);
}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
	while (!list_empty(&ctx->defer_list)) {
		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
						struct io_defer_entry, list);

		if (req_need_defer(de->req, de->seq))
			break;
		list_del_init(&de->list);
		io_req_task_queue(de->req);
		kfree(de);
	}
}

static void io_eventfd_ops(struct rcu_head *rcu)
{
	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
	int ops = atomic_xchg(&ev_fd->ops, 0);

	if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);

	/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
	 * ordering in a race, but if references are 0 we know we have to free
	 * it regardless.
	 */
	if (atomic_dec_and_test(&ev_fd->refs)) {
		eventfd_ctx_put(ev_fd->cq_ev_fd);
		kfree(ev_fd);
	}
}

static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd = NULL;

	rcu_read_lock();
	/*
	 * rcu_dereference ctx->io_ev_fd once and use it both for checking
	 * and for eventfd_signal
	 */
	ev_fd = rcu_dereference(ctx->io_ev_fd);

	/*
	 * Check again if ev_fd exists in case an io_eventfd_unregister call
	 * completed between the NULL check of ctx->io_ev_fd at the start of
	 * the function and rcu_read_lock.
	 */
	if (unlikely(!ev_fd))
		goto out;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		goto out;
	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
		goto out;

	if (likely(eventfd_signal_allowed())) {
		eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
	} else {
		atomic_inc(&ev_fd->refs);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
			call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
		else
			atomic_dec(&ev_fd->refs);
	}

out:
	rcu_read_unlock();
}

static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
	bool skip;

	spin_lock(&ctx->completion_lock);

	/*
	 * Eventfd should only get triggered when at least one event has been
	 * posted. Some applications rely on the eventfd notification count
	 * only changing IFF a new CQE has been added to the CQ ring. There's
	 * no dependency on a 1:1 relationship between how many times this
	 * function is called (and hence the eventfd count) and the number of
	 * CQEs posted to the CQ ring.
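	 *
	 * For instance, two CQEs posted back to back before this flush runs
	 * still produce a single eventfd signal: the application only learns
	 * that the tail moved since the last notification, not by how much.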
	 */
	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);
	if (skip)
		return;

	io_eventfd_signal(ctx);
}

void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->poll_activated)
		io_poll_wq_wake(ctx);
	if (ctx->off_timeout_used)
		io_flush_timeouts(ctx);
	if (ctx->drain_active) {
		spin_lock(&ctx->completion_lock);
		io_queue_deferred(ctx);
		spin_unlock(&ctx->completion_lock);
	}
	if (ctx->has_evfd)
		io_eventfd_flush_signal(ctx);
}

static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{
	if (!ctx->lockless_cq)
		spin_lock(&ctx->completion_lock);
}

static inline void io_cq_lock(struct io_ring_ctx *ctx)
	__acquires(ctx->completion_lock)
{
	spin_lock(&ctx->completion_lock);
}

static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{
	io_commit_cqring(ctx);
	if (!ctx->task_complete) {
		if (!ctx->lockless_cq)
			spin_unlock(&ctx->completion_lock);
		/* IOPOLL rings only need to wake up if it's also SQPOLL */
		if (!ctx->syscall_iopoll)
			io_cqring_wake(ctx);
	}
	io_commit_cqring_flush(ctx);
}

static void io_cq_unlock_post(struct io_ring_ctx *ctx)
	__releases(ctx->completion_lock)
{
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	io_cqring_wake(ctx);
	io_commit_cqring_flush(ctx);
}

/* Drop all backlogged overflow entries without posting them to the CQ ring */
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
	struct io_overflow_cqe *ocqe;
	LIST_HEAD(list);

	spin_lock(&ctx->completion_lock);
	list_splice_init(&ctx->cq_overflow_list, &list);
	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
	spin_unlock(&ctx->completion_lock);

	while (!list_empty(&list)) {
		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
		list_del(&ocqe->list);
		kfree(ocqe);
	}
}

static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	size_t cqe_size = sizeof(struct io_uring_cqe);

	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return;

	if (ctx->flags & IORING_SETUP_CQE32)
		cqe_size <<= 1;

	io_cq_lock(ctx);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe;
		struct io_overflow_cqe *ocqe;

		if (!io_get_cqe_overflow(ctx, &cqe, true))
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		memcpy(cqe, &ocqe->cqe, cqe_size);
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	if (list_empty(&ctx->cq_overflow_list)) {
		clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}
	io_cq_unlock_post(ctx);
}

static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
	/* iopoll syncs against uring_lock, not completion_lock */
	if (ctx->flags & IORING_SETUP_IOPOLL)
		mutex_lock(&ctx->uring_lock);
	__io_cqring_overflow_flush(ctx);
	if (ctx->flags & IORING_SETUP_IOPOLL)
		mutex_unlock(&ctx->uring_lock);
}

static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		io_cqring_do_overflow_flush(ctx);
}

/* can be called by any task */
static void io_put_task_remote(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;

	percpu_counter_sub(&tctx->inflight, 1);
	if (unlikely(atomic_read(&tctx->in_cancel)))
		wake_up(&tctx->wait);
	put_task_struct(task);
}

/* used by a task to put its own references */
static void io_put_task_local(struct task_struct *task)
{
	task->io_uring->cached_refs++;
}

/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task)
{
	if (likely(task == current))
		io_put_task_local(task);
	else
		io_put_task_remote(task);
}

void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{
	struct io_uring_task *tctx = task->io_uring;
	unsigned int refs = tctx->cached_refs;

	if (refs) {
		tctx->cached_refs = 0;
		percpu_counter_sub(&tctx->inflight, refs);
		put_task_struct_many(task, refs);
	}
}

static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags, u64 extra1, u64 extra2)
{
	struct io_overflow_cqe *ocqe;
	size_t ocq_size = sizeof(struct io_overflow_cqe);
	bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);

	lockdep_assert_held(&ctx->completion_lock);

	if (is_cqe32)
		ocq_size += sizeof(struct io_uring_cqe);

	ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
	trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
	if (!ocqe) {
		/*
		 * If we're in ring overflow flush mode, or in task cancel mode,
		 * or cannot allocate an overflow entry, then we need to drop it
		 * on the floor.
		 */
		io_account_cq_overflow(ctx);
		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
		return false;
	}
	if (list_empty(&ctx->cq_overflow_list)) {
		set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
		atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
	}
	ocqe->cqe.user_data = user_data;
	ocqe->cqe.res = res;
	ocqe->cqe.flags = cflags;
	if (is_cqe32) {
		ocqe->cqe.big_cqe[0] = extra1;
		ocqe->cqe.big_cqe[1] = extra2;
	}
	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
	return true;
}

void io_req_cqe_overflow(struct io_kiocb *req)
{
	io_cqring_event_overflow(req->ctx, req->cqe.user_data,
				 req->cqe.res, req->cqe.flags,
				 req->big_cqe.extra1, req->big_cqe.extra2);
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

/*
 * writes to the cq entry need to come after reading head; the
 * control dependency is enough as we're using WRITE_ONCE to
 * fill the cq entry
 */
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{
	struct io_rings *rings = ctx->rings;
	unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
	unsigned int free, queued, len;

	/*
	 * Posting into the CQ when there are pending overflowed CQEs may break
	 * ordering guarantees, which will affect links, F_MORE users and more.
	 * Force overflow the completion.
	 */
	if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
		return false;

	/* userspace may cheat modifying the tail, be safe and do min */
	queued = min(__io_cqring_events(ctx), ctx->cq_entries);
	free = ctx->cq_entries - queued;
	/* we need a contiguous range, limit based on the current array offset */
	len = min(free, ctx->cq_entries - off);
	if (!len)
		return false;

	if (ctx->flags & IORING_SETUP_CQE32) {
		off <<= 1;
		len <<= 1;
	}

	ctx->cqe_cached = &rings->cqes[off];
	ctx->cqe_sentinel = ctx->cqe_cached + len;
	return true;
}

static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			    u32 cflags)
{
	struct io_uring_cqe *cqe;

	ctx->cq_extra++;

	/*
	 * If we can't get a cq entry, userspace overflowed the
	 * submission (by quite a lot). Increment the overflow count in
	 * the ring.
	 */
	if (likely(io_get_cqe(ctx, &cqe))) {
		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);

		WRITE_ONCE(cqe->user_data, user_data);
		WRITE_ONCE(cqe->res, res);
		WRITE_ONCE(cqe->flags, cflags);

		if (ctx->flags & IORING_SETUP_CQE32) {
			WRITE_ONCE(cqe->big_cqe[0], 0);
			WRITE_ONCE(cqe->big_cqe[1], 0);
		}
		return true;
	}
	return false;
}

static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_state *state = &ctx->submit_state;
	unsigned int i;

	lockdep_assert_held(&ctx->uring_lock);
	for (i = 0; i < state->cqes_count; i++) {
		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];

		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
			if (ctx->lockless_cq) {
				spin_lock(&ctx->completion_lock);
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
				spin_unlock(&ctx->completion_lock);
			} else {
				io_cqring_event_overflow(ctx, cqe->user_data,
							cqe->res, cqe->flags, 0, 0);
			}
		}
	}
	state->cqes_count = 0;
}

static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
			      bool allow_overflow)
{
	bool filled;

	io_cq_lock(ctx);
	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
	if (!filled && allow_overflow)
		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);

	io_cq_unlock_post(ctx);
	return filled;
}

bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
	return __io_post_aux_cqe(ctx, user_data, res, cflags, true);
}

/*
 * A helper for multishot requests posting additional CQEs.
 * Should only be used from a task_work including IO_URING_F_MULTISHOT.
 */
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
{
	struct io_ring_ctx *ctx = req->ctx;
	u64 user_data = req->cqe.user_data;
	struct io_uring_cqe *cqe;

	if (!defer)
		return __io_post_aux_cqe(ctx, user_data, res, cflags, false);

	lockdep_assert_held(&ctx->uring_lock);

	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
		__io_cq_lock(ctx);
		__io_flush_post_cqes(ctx);
		/* no need to flush - flush is deferred */
		__io_cq_unlock_post(ctx);
	}

	/* For deferred completions this is not as strict as it is otherwise,
	 * however its main job is to prevent unbounded posted completions,
	 * and in that it works just as well.
	 */
	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
		return false;

	cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
	cqe->user_data = user_data;
	cqe->res = res;
	cqe->flags = cflags;
	return true;
}

static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_rsrc_node *rsrc_node = NULL;

	io_cq_lock(ctx);
	if (!(req->flags & REQ_F_CQE_SKIP)) {
		if (!io_fill_cqe_req(ctx, req))
			io_req_cqe_overflow(req);
	}

	/*
	 * If we're the last reference to this request, add to our locked
	 * free_list cache.
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_put_kbuf_comp(req);
		if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
			io_clean_op(req);
		io_put_file(req);

		rsrc_node = req->rsrc_node;
		/*
		 * Selected buffer deallocation in io_clean_op() assumes that
		 * we don't hold ->completion_lock. Clean them here to avoid
		 * deadlocks.
		 */
		io_put_task_remote(req->task);
		wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	}
	io_cq_unlock_post(ctx);

	if (rsrc_node) {
		io_ring_submit_lock(ctx, issue_flags);
		io_put_rsrc_node(ctx, rsrc_node);
		io_ring_submit_unlock(ctx, issue_flags);
	}
}

void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
	if (req->ctx->task_complete && req->ctx->submitter_task != current) {
		req->io_task_work.func = io_req_task_complete;
		io_req_task_work_add(req);
	} else if (!(issue_flags & IO_URING_F_UNLOCKED) ||
		   !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
		__io_req_complete_post(req, issue_flags);
	} else {
		struct io_ring_ctx *ctx = req->ctx;

		mutex_lock(&ctx->uring_lock);
		__io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED);
		mutex_unlock(&ctx->uring_lock);
	}
}

void io_req_defer_failed(struct io_kiocb *req, s32 res)
	__must_hold(&ctx->uring_lock)
{
	const struct io_cold_def *def = &io_cold_defs[req->opcode];

	lockdep_assert_held(&req->ctx->uring_lock);

	req_set_fail(req);
	io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED));
	if (def->fail)
		def->fail(req);
	io_req_complete_defer(req);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	memset(&req->cqe, 0, sizeof(req->cqe));
	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	wq_list_splice(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	void *reqs[IO_REQ_ALLOC_BATCH];
	int ret, i;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
	 */
	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
		if (!io_req_cache_empty(ctx))
			return true;
	}

	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);

	/*
	 * Bulk alloc is all-or-nothing. If we fail to get a batch,
	 * retry single alloc to be on the safe side.
110062306a36Sopenharmony_ci */ 110162306a36Sopenharmony_ci if (unlikely(ret <= 0)) { 110262306a36Sopenharmony_ci reqs[0] = kmem_cache_alloc(req_cachep, gfp); 110362306a36Sopenharmony_ci if (!reqs[0]) 110462306a36Sopenharmony_ci return false; 110562306a36Sopenharmony_ci ret = 1; 110662306a36Sopenharmony_ci } 110762306a36Sopenharmony_ci 110862306a36Sopenharmony_ci percpu_ref_get_many(&ctx->refs, ret); 110962306a36Sopenharmony_ci for (i = 0; i < ret; i++) { 111062306a36Sopenharmony_ci struct io_kiocb *req = reqs[i]; 111162306a36Sopenharmony_ci 111262306a36Sopenharmony_ci io_preinit_req(req, ctx); 111362306a36Sopenharmony_ci io_req_add_to_cache(req, ctx); 111462306a36Sopenharmony_ci } 111562306a36Sopenharmony_ci return true; 111662306a36Sopenharmony_ci} 111762306a36Sopenharmony_ci 111862306a36Sopenharmony_ci__cold void io_free_req(struct io_kiocb *req) 111962306a36Sopenharmony_ci{ 112062306a36Sopenharmony_ci /* refs were already put, restore them for io_req_task_complete() */ 112162306a36Sopenharmony_ci req->flags &= ~REQ_F_REFCOUNT; 112262306a36Sopenharmony_ci /* we only want to free it, don't post CQEs */ 112362306a36Sopenharmony_ci req->flags |= REQ_F_CQE_SKIP; 112462306a36Sopenharmony_ci req->io_task_work.func = io_req_task_complete; 112562306a36Sopenharmony_ci io_req_task_work_add(req); 112662306a36Sopenharmony_ci} 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_cistatic void __io_req_find_next_prep(struct io_kiocb *req) 112962306a36Sopenharmony_ci{ 113062306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 113162306a36Sopenharmony_ci 113262306a36Sopenharmony_ci spin_lock(&ctx->completion_lock); 113362306a36Sopenharmony_ci io_disarm_next(req); 113462306a36Sopenharmony_ci spin_unlock(&ctx->completion_lock); 113562306a36Sopenharmony_ci} 113662306a36Sopenharmony_ci 113762306a36Sopenharmony_cistatic inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 113862306a36Sopenharmony_ci{ 113962306a36Sopenharmony_ci struct io_kiocb *nxt; 114062306a36Sopenharmony_ci 114162306a36Sopenharmony_ci /* 114262306a36Sopenharmony_ci * If LINK is set, we have dependent requests in this chain. If we 114362306a36Sopenharmony_ci * didn't fail this request, queue the first one up, moving any other 114462306a36Sopenharmony_ci * dependencies to the next request. In case of failure, fail the rest 114562306a36Sopenharmony_ci * of the chain. 
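 *
 * As an illustration, for a healthy chain A -> B -> C this returns B (with
 * C still linked behind it) so the caller can queue it; if A failed or
 * otherwise needs disarming, io_disarm_next() cancels the remainder first.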
114662306a36Sopenharmony_ci */ 114762306a36Sopenharmony_ci if (unlikely(req->flags & IO_DISARM_MASK)) 114862306a36Sopenharmony_ci __io_req_find_next_prep(req); 114962306a36Sopenharmony_ci nxt = req->link; 115062306a36Sopenharmony_ci req->link = NULL; 115162306a36Sopenharmony_ci return nxt; 115262306a36Sopenharmony_ci} 115362306a36Sopenharmony_ci 115462306a36Sopenharmony_cistatic void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) 115562306a36Sopenharmony_ci{ 115662306a36Sopenharmony_ci if (!ctx) 115762306a36Sopenharmony_ci return; 115862306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 115962306a36Sopenharmony_ci atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 116062306a36Sopenharmony_ci if (ts->locked) { 116162306a36Sopenharmony_ci io_submit_flush_completions(ctx); 116262306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 116362306a36Sopenharmony_ci ts->locked = false; 116462306a36Sopenharmony_ci } 116562306a36Sopenharmony_ci percpu_ref_put(&ctx->refs); 116662306a36Sopenharmony_ci} 116762306a36Sopenharmony_ci 116862306a36Sopenharmony_cistatic unsigned int handle_tw_list(struct llist_node *node, 116962306a36Sopenharmony_ci struct io_ring_ctx **ctx, 117062306a36Sopenharmony_ci struct io_tw_state *ts) 117162306a36Sopenharmony_ci{ 117262306a36Sopenharmony_ci unsigned int count = 0; 117362306a36Sopenharmony_ci 117462306a36Sopenharmony_ci do { 117562306a36Sopenharmony_ci struct llist_node *next = node->next; 117662306a36Sopenharmony_ci struct io_kiocb *req = container_of(node, struct io_kiocb, 117762306a36Sopenharmony_ci io_task_work.node); 117862306a36Sopenharmony_ci 117962306a36Sopenharmony_ci prefetch(container_of(next, struct io_kiocb, io_task_work.node)); 118062306a36Sopenharmony_ci 118162306a36Sopenharmony_ci if (req->ctx != *ctx) { 118262306a36Sopenharmony_ci ctx_flush_and_put(*ctx, ts); 118362306a36Sopenharmony_ci *ctx = req->ctx; 118462306a36Sopenharmony_ci /* if not contended, grab and improve batching */ 118562306a36Sopenharmony_ci ts->locked = mutex_trylock(&(*ctx)->uring_lock); 118662306a36Sopenharmony_ci percpu_ref_get(&(*ctx)->refs); 118762306a36Sopenharmony_ci } 118862306a36Sopenharmony_ci INDIRECT_CALL_2(req->io_task_work.func, 118962306a36Sopenharmony_ci io_poll_task_func, io_req_rw_complete, 119062306a36Sopenharmony_ci req, ts); 119162306a36Sopenharmony_ci node = next; 119262306a36Sopenharmony_ci count++; 119362306a36Sopenharmony_ci if (unlikely(need_resched())) { 119462306a36Sopenharmony_ci ctx_flush_and_put(*ctx, ts); 119562306a36Sopenharmony_ci *ctx = NULL; 119662306a36Sopenharmony_ci cond_resched(); 119762306a36Sopenharmony_ci } 119862306a36Sopenharmony_ci } while (node); 119962306a36Sopenharmony_ci 120062306a36Sopenharmony_ci return count; 120162306a36Sopenharmony_ci} 120262306a36Sopenharmony_ci 120362306a36Sopenharmony_ci/** 120462306a36Sopenharmony_ci * io_llist_xchg - swap all entries in a lock-less list 120562306a36Sopenharmony_ci * @head: the head of lock-less list to delete all entries 120662306a36Sopenharmony_ci * @new: new entry as the head of the list 120762306a36Sopenharmony_ci * 120862306a36Sopenharmony_ci * If list is empty, return NULL, otherwise, return the pointer to the first entry. 120962306a36Sopenharmony_ci * The order of entries returned is from the newest to the oldest added one. 
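 *
 * For example, after
 *
 *	llist_add(&a->io_task_work.node, &head);
 *	llist_add(&b->io_task_work.node, &head);
 *
 * io_llist_xchg(&head, NULL) empties the list and returns
 * &b->io_task_work.node, whose ->next points at a's node, i.e. the newest
 * entry comes back first.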
121062306a36Sopenharmony_ci */ 121162306a36Sopenharmony_cistatic inline struct llist_node *io_llist_xchg(struct llist_head *head, 121262306a36Sopenharmony_ci struct llist_node *new) 121362306a36Sopenharmony_ci{ 121462306a36Sopenharmony_ci return xchg(&head->first, new); 121562306a36Sopenharmony_ci} 121662306a36Sopenharmony_ci 121762306a36Sopenharmony_cistatic __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) 121862306a36Sopenharmony_ci{ 121962306a36Sopenharmony_ci struct llist_node *node = llist_del_all(&tctx->task_list); 122062306a36Sopenharmony_ci struct io_ring_ctx *last_ctx = NULL; 122162306a36Sopenharmony_ci struct io_kiocb *req; 122262306a36Sopenharmony_ci 122362306a36Sopenharmony_ci while (node) { 122462306a36Sopenharmony_ci req = container_of(node, struct io_kiocb, io_task_work.node); 122562306a36Sopenharmony_ci node = node->next; 122662306a36Sopenharmony_ci if (sync && last_ctx != req->ctx) { 122762306a36Sopenharmony_ci if (last_ctx) { 122862306a36Sopenharmony_ci flush_delayed_work(&last_ctx->fallback_work); 122962306a36Sopenharmony_ci percpu_ref_put(&last_ctx->refs); 123062306a36Sopenharmony_ci } 123162306a36Sopenharmony_ci last_ctx = req->ctx; 123262306a36Sopenharmony_ci percpu_ref_get(&last_ctx->refs); 123362306a36Sopenharmony_ci } 123462306a36Sopenharmony_ci if (llist_add(&req->io_task_work.node, 123562306a36Sopenharmony_ci &req->ctx->fallback_llist)) 123662306a36Sopenharmony_ci schedule_delayed_work(&req->ctx->fallback_work, 1); 123762306a36Sopenharmony_ci } 123862306a36Sopenharmony_ci 123962306a36Sopenharmony_ci if (last_ctx) { 124062306a36Sopenharmony_ci flush_delayed_work(&last_ctx->fallback_work); 124162306a36Sopenharmony_ci percpu_ref_put(&last_ctx->refs); 124262306a36Sopenharmony_ci } 124362306a36Sopenharmony_ci} 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_civoid tctx_task_work(struct callback_head *cb) 124662306a36Sopenharmony_ci{ 124762306a36Sopenharmony_ci struct io_tw_state ts = {}; 124862306a36Sopenharmony_ci struct io_ring_ctx *ctx = NULL; 124962306a36Sopenharmony_ci struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 125062306a36Sopenharmony_ci task_work); 125162306a36Sopenharmony_ci struct llist_node *node; 125262306a36Sopenharmony_ci unsigned int count = 0; 125362306a36Sopenharmony_ci 125462306a36Sopenharmony_ci if (unlikely(current->flags & PF_EXITING)) { 125562306a36Sopenharmony_ci io_fallback_tw(tctx, true); 125662306a36Sopenharmony_ci return; 125762306a36Sopenharmony_ci } 125862306a36Sopenharmony_ci 125962306a36Sopenharmony_ci node = llist_del_all(&tctx->task_list); 126062306a36Sopenharmony_ci if (node) 126162306a36Sopenharmony_ci count = handle_tw_list(node, &ctx, &ts); 126262306a36Sopenharmony_ci 126362306a36Sopenharmony_ci ctx_flush_and_put(ctx, &ts); 126462306a36Sopenharmony_ci 126562306a36Sopenharmony_ci /* relaxed read is enough as only the task itself sets ->in_cancel */ 126662306a36Sopenharmony_ci if (unlikely(atomic_read(&tctx->in_cancel))) 126762306a36Sopenharmony_ci io_uring_drop_tctx_refs(current); 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_ci trace_io_uring_task_work_run(tctx, count, 1); 127062306a36Sopenharmony_ci} 127162306a36Sopenharmony_ci 127262306a36Sopenharmony_cistatic inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 127362306a36Sopenharmony_ci{ 127462306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 127562306a36Sopenharmony_ci unsigned nr_wait, nr_tw, nr_tw_prev; 127662306a36Sopenharmony_ci struct llist_node *first; 127762306a36Sopenharmony_ci 
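	/*
	 * IOU_F_TWQ_LAZY_WAKE batches wakeups: entries are counted via
	 * req->nr_tw and the submitter is only woken once at least
	 * ->cq_wait_nr completions' worth of work has been queued. Linked
	 * requests are excluded below, as running their task_work may be
	 * needed to generate further completions.
	 */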
127862306a36Sopenharmony_ci if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) 127962306a36Sopenharmony_ci flags &= ~IOU_F_TWQ_LAZY_WAKE; 128062306a36Sopenharmony_ci 128162306a36Sopenharmony_ci first = READ_ONCE(ctx->work_llist.first); 128262306a36Sopenharmony_ci do { 128362306a36Sopenharmony_ci nr_tw_prev = 0; 128462306a36Sopenharmony_ci if (first) { 128562306a36Sopenharmony_ci struct io_kiocb *first_req = container_of(first, 128662306a36Sopenharmony_ci struct io_kiocb, 128762306a36Sopenharmony_ci io_task_work.node); 128862306a36Sopenharmony_ci /* 128962306a36Sopenharmony_ci * Might be executed at any moment, rely on 129062306a36Sopenharmony_ci * SLAB_TYPESAFE_BY_RCU to keep it alive. 129162306a36Sopenharmony_ci */ 129262306a36Sopenharmony_ci nr_tw_prev = READ_ONCE(first_req->nr_tw); 129362306a36Sopenharmony_ci } 129462306a36Sopenharmony_ci nr_tw = nr_tw_prev + 1; 129562306a36Sopenharmony_ci /* Large enough to fail the nr_wait comparison below */ 129662306a36Sopenharmony_ci if (!(flags & IOU_F_TWQ_LAZY_WAKE)) 129762306a36Sopenharmony_ci nr_tw = INT_MAX; 129862306a36Sopenharmony_ci 129962306a36Sopenharmony_ci req->nr_tw = nr_tw; 130062306a36Sopenharmony_ci req->io_task_work.node.next = first; 130162306a36Sopenharmony_ci } while (!try_cmpxchg(&ctx->work_llist.first, &first, 130262306a36Sopenharmony_ci &req->io_task_work.node)); 130362306a36Sopenharmony_ci 130462306a36Sopenharmony_ci if (!first) { 130562306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 130662306a36Sopenharmony_ci atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 130762306a36Sopenharmony_ci if (ctx->has_evfd) 130862306a36Sopenharmony_ci io_eventfd_signal(ctx); 130962306a36Sopenharmony_ci } 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_ci nr_wait = atomic_read(&ctx->cq_wait_nr); 131262306a36Sopenharmony_ci /* no one is waiting */ 131362306a36Sopenharmony_ci if (!nr_wait) 131462306a36Sopenharmony_ci return; 131562306a36Sopenharmony_ci /* either not enough or the previous add has already woken it up */ 131662306a36Sopenharmony_ci if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) 131762306a36Sopenharmony_ci return; 131862306a36Sopenharmony_ci /* pairs with set_current_state() in io_cqring_wait() */ 131962306a36Sopenharmony_ci smp_mb__after_atomic(); 132062306a36Sopenharmony_ci wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 132162306a36Sopenharmony_ci} 132262306a36Sopenharmony_ci 132362306a36Sopenharmony_cistatic void io_req_normal_work_add(struct io_kiocb *req) 132462306a36Sopenharmony_ci{ 132562306a36Sopenharmony_ci struct io_uring_task *tctx = req->task->io_uring; 132662306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 132762306a36Sopenharmony_ci 132862306a36Sopenharmony_ci /* task_work already pending, we're done */ 132962306a36Sopenharmony_ci if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 133062306a36Sopenharmony_ci return; 133162306a36Sopenharmony_ci 133262306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 133362306a36Sopenharmony_ci atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 133462306a36Sopenharmony_ci 133562306a36Sopenharmony_ci if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) 133662306a36Sopenharmony_ci return; 133762306a36Sopenharmony_ci 133862306a36Sopenharmony_ci io_fallback_tw(tctx, false); 133962306a36Sopenharmony_ci} 134062306a36Sopenharmony_ci 134162306a36Sopenharmony_civoid __io_req_task_work_add(struct io_kiocb *req, unsigned flags) 134262306a36Sopenharmony_ci{ 134362306a36Sopenharmony_ci if (req->ctx->flags & 
IORING_SETUP_DEFER_TASKRUN) { 134462306a36Sopenharmony_ci rcu_read_lock(); 134562306a36Sopenharmony_ci io_req_local_work_add(req, flags); 134662306a36Sopenharmony_ci rcu_read_unlock(); 134762306a36Sopenharmony_ci } else { 134862306a36Sopenharmony_ci io_req_normal_work_add(req); 134962306a36Sopenharmony_ci } 135062306a36Sopenharmony_ci} 135162306a36Sopenharmony_ci 135262306a36Sopenharmony_cistatic void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) 135362306a36Sopenharmony_ci{ 135462306a36Sopenharmony_ci struct llist_node *node; 135562306a36Sopenharmony_ci 135662306a36Sopenharmony_ci node = llist_del_all(&ctx->work_llist); 135762306a36Sopenharmony_ci while (node) { 135862306a36Sopenharmony_ci struct io_kiocb *req = container_of(node, struct io_kiocb, 135962306a36Sopenharmony_ci io_task_work.node); 136062306a36Sopenharmony_ci 136162306a36Sopenharmony_ci node = node->next; 136262306a36Sopenharmony_ci io_req_normal_work_add(req); 136362306a36Sopenharmony_ci } 136462306a36Sopenharmony_ci} 136562306a36Sopenharmony_ci 136662306a36Sopenharmony_cistatic bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 136762306a36Sopenharmony_ci int min_events) 136862306a36Sopenharmony_ci{ 136962306a36Sopenharmony_ci if (llist_empty(&ctx->work_llist)) 137062306a36Sopenharmony_ci return false; 137162306a36Sopenharmony_ci if (events < min_events) 137262306a36Sopenharmony_ci return true; 137362306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 137462306a36Sopenharmony_ci atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 137562306a36Sopenharmony_ci return false; 137662306a36Sopenharmony_ci} 137762306a36Sopenharmony_ci 137862306a36Sopenharmony_cistatic int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, 137962306a36Sopenharmony_ci int min_events) 138062306a36Sopenharmony_ci{ 138162306a36Sopenharmony_ci struct llist_node *node; 138262306a36Sopenharmony_ci unsigned int loops = 0; 138362306a36Sopenharmony_ci int ret = 0; 138462306a36Sopenharmony_ci 138562306a36Sopenharmony_ci if (WARN_ON_ONCE(ctx->submitter_task != current)) 138662306a36Sopenharmony_ci return -EEXIST; 138762306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 138862306a36Sopenharmony_ci atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 138962306a36Sopenharmony_ciagain: 139062306a36Sopenharmony_ci /* 139162306a36Sopenharmony_ci * llists are in reverse order, flip it back the right way before 139262306a36Sopenharmony_ci * running the pending items. 
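 *
 * For example, items queued in the order A, B, C sit on the llist as
 * C -> B -> A; llist_reverse_order() turns that back into A -> B -> C so
 * the callbacks run in queueing order.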
139362306a36Sopenharmony_ci */ 139462306a36Sopenharmony_ci node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); 139562306a36Sopenharmony_ci while (node) { 139662306a36Sopenharmony_ci struct llist_node *next = node->next; 139762306a36Sopenharmony_ci struct io_kiocb *req = container_of(node, struct io_kiocb, 139862306a36Sopenharmony_ci io_task_work.node); 139962306a36Sopenharmony_ci prefetch(container_of(next, struct io_kiocb, io_task_work.node)); 140062306a36Sopenharmony_ci INDIRECT_CALL_2(req->io_task_work.func, 140162306a36Sopenharmony_ci io_poll_task_func, io_req_rw_complete, 140262306a36Sopenharmony_ci req, ts); 140362306a36Sopenharmony_ci ret++; 140462306a36Sopenharmony_ci node = next; 140562306a36Sopenharmony_ci } 140662306a36Sopenharmony_ci loops++; 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci if (io_run_local_work_continue(ctx, ret, min_events)) 140962306a36Sopenharmony_ci goto again; 141062306a36Sopenharmony_ci if (ts->locked) { 141162306a36Sopenharmony_ci io_submit_flush_completions(ctx); 141262306a36Sopenharmony_ci if (io_run_local_work_continue(ctx, ret, min_events)) 141362306a36Sopenharmony_ci goto again; 141462306a36Sopenharmony_ci } 141562306a36Sopenharmony_ci 141662306a36Sopenharmony_ci trace_io_uring_local_work_run(ctx, ret, loops); 141762306a36Sopenharmony_ci return ret; 141862306a36Sopenharmony_ci} 141962306a36Sopenharmony_ci 142062306a36Sopenharmony_cistatic inline int io_run_local_work_locked(struct io_ring_ctx *ctx, 142162306a36Sopenharmony_ci int min_events) 142262306a36Sopenharmony_ci{ 142362306a36Sopenharmony_ci struct io_tw_state ts = { .locked = true, }; 142462306a36Sopenharmony_ci int ret; 142562306a36Sopenharmony_ci 142662306a36Sopenharmony_ci if (llist_empty(&ctx->work_llist)) 142762306a36Sopenharmony_ci return 0; 142862306a36Sopenharmony_ci 142962306a36Sopenharmony_ci ret = __io_run_local_work(ctx, &ts, min_events); 143062306a36Sopenharmony_ci /* shouldn't happen! 
*/ 143162306a36Sopenharmony_ci if (WARN_ON_ONCE(!ts.locked)) 143262306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 143362306a36Sopenharmony_ci return ret; 143462306a36Sopenharmony_ci} 143562306a36Sopenharmony_ci 143662306a36Sopenharmony_cistatic int io_run_local_work(struct io_ring_ctx *ctx, int min_events) 143762306a36Sopenharmony_ci{ 143862306a36Sopenharmony_ci struct io_tw_state ts = {}; 143962306a36Sopenharmony_ci int ret; 144062306a36Sopenharmony_ci 144162306a36Sopenharmony_ci ts.locked = mutex_trylock(&ctx->uring_lock); 144262306a36Sopenharmony_ci ret = __io_run_local_work(ctx, &ts, min_events); 144362306a36Sopenharmony_ci if (ts.locked) 144462306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 144562306a36Sopenharmony_ci 144662306a36Sopenharmony_ci return ret; 144762306a36Sopenharmony_ci} 144862306a36Sopenharmony_ci 144962306a36Sopenharmony_cistatic void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) 145062306a36Sopenharmony_ci{ 145162306a36Sopenharmony_ci io_tw_lock(req->ctx, ts); 145262306a36Sopenharmony_ci io_req_defer_failed(req, req->cqe.res); 145362306a36Sopenharmony_ci} 145462306a36Sopenharmony_ci 145562306a36Sopenharmony_civoid io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) 145662306a36Sopenharmony_ci{ 145762306a36Sopenharmony_ci io_tw_lock(req->ctx, ts); 145862306a36Sopenharmony_ci /* req->task == current here, checking PF_EXITING is safe */ 145962306a36Sopenharmony_ci if (unlikely(req->task->flags & PF_EXITING)) 146062306a36Sopenharmony_ci io_req_defer_failed(req, -EFAULT); 146162306a36Sopenharmony_ci else if (req->flags & REQ_F_FORCE_ASYNC) 146262306a36Sopenharmony_ci io_queue_iowq(req, ts); 146362306a36Sopenharmony_ci else 146462306a36Sopenharmony_ci io_queue_sqe(req); 146562306a36Sopenharmony_ci} 146662306a36Sopenharmony_ci 146762306a36Sopenharmony_civoid io_req_task_queue_fail(struct io_kiocb *req, int ret) 146862306a36Sopenharmony_ci{ 146962306a36Sopenharmony_ci io_req_set_res(req, ret, 0); 147062306a36Sopenharmony_ci req->io_task_work.func = io_req_task_cancel; 147162306a36Sopenharmony_ci io_req_task_work_add(req); 147262306a36Sopenharmony_ci} 147362306a36Sopenharmony_ci 147462306a36Sopenharmony_civoid io_req_task_queue(struct io_kiocb *req) 147562306a36Sopenharmony_ci{ 147662306a36Sopenharmony_ci req->io_task_work.func = io_req_task_submit; 147762306a36Sopenharmony_ci io_req_task_work_add(req); 147862306a36Sopenharmony_ci} 147962306a36Sopenharmony_ci 148062306a36Sopenharmony_civoid io_queue_next(struct io_kiocb *req) 148162306a36Sopenharmony_ci{ 148262306a36Sopenharmony_ci struct io_kiocb *nxt = io_req_find_next(req); 148362306a36Sopenharmony_ci 148462306a36Sopenharmony_ci if (nxt) 148562306a36Sopenharmony_ci io_req_task_queue(nxt); 148662306a36Sopenharmony_ci} 148762306a36Sopenharmony_ci 148862306a36Sopenharmony_cistatic void io_free_batch_list(struct io_ring_ctx *ctx, 148962306a36Sopenharmony_ci struct io_wq_work_node *node) 149062306a36Sopenharmony_ci __must_hold(&ctx->uring_lock) 149162306a36Sopenharmony_ci{ 149262306a36Sopenharmony_ci do { 149362306a36Sopenharmony_ci struct io_kiocb *req = container_of(node, struct io_kiocb, 149462306a36Sopenharmony_ci comp_list); 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_ci if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { 149762306a36Sopenharmony_ci if (req->flags & REQ_F_REFCOUNT) { 149862306a36Sopenharmony_ci node = req->comp_list.next; 149962306a36Sopenharmony_ci if (!req_ref_put_and_test(req)) 150062306a36Sopenharmony_ci continue; 150162306a36Sopenharmony_ci } 
150262306a36Sopenharmony_ci if ((req->flags & REQ_F_POLLED) && req->apoll) { 150362306a36Sopenharmony_ci struct async_poll *apoll = req->apoll; 150462306a36Sopenharmony_ci 150562306a36Sopenharmony_ci if (apoll->double_poll) 150662306a36Sopenharmony_ci kfree(apoll->double_poll); 150762306a36Sopenharmony_ci if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) 150862306a36Sopenharmony_ci kfree(apoll); 150962306a36Sopenharmony_ci req->flags &= ~REQ_F_POLLED; 151062306a36Sopenharmony_ci } 151162306a36Sopenharmony_ci if (req->flags & IO_REQ_LINK_FLAGS) 151262306a36Sopenharmony_ci io_queue_next(req); 151362306a36Sopenharmony_ci if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) 151462306a36Sopenharmony_ci io_clean_op(req); 151562306a36Sopenharmony_ci } 151662306a36Sopenharmony_ci io_put_file(req); 151762306a36Sopenharmony_ci 151862306a36Sopenharmony_ci io_req_put_rsrc_locked(req, ctx); 151962306a36Sopenharmony_ci 152062306a36Sopenharmony_ci io_put_task(req->task); 152162306a36Sopenharmony_ci node = req->comp_list.next; 152262306a36Sopenharmony_ci io_req_add_to_cache(req, ctx); 152362306a36Sopenharmony_ci } while (node); 152462306a36Sopenharmony_ci} 152562306a36Sopenharmony_ci 152662306a36Sopenharmony_civoid __io_submit_flush_completions(struct io_ring_ctx *ctx) 152762306a36Sopenharmony_ci __must_hold(&ctx->uring_lock) 152862306a36Sopenharmony_ci{ 152962306a36Sopenharmony_ci struct io_submit_state *state = &ctx->submit_state; 153062306a36Sopenharmony_ci struct io_wq_work_node *node; 153162306a36Sopenharmony_ci 153262306a36Sopenharmony_ci __io_cq_lock(ctx); 153362306a36Sopenharmony_ci /* must come first to preserve CQE ordering in failure cases */ 153462306a36Sopenharmony_ci if (state->cqes_count) 153562306a36Sopenharmony_ci __io_flush_post_cqes(ctx); 153662306a36Sopenharmony_ci __wq_list_for_each(node, &state->compl_reqs) { 153762306a36Sopenharmony_ci struct io_kiocb *req = container_of(node, struct io_kiocb, 153862306a36Sopenharmony_ci comp_list); 153962306a36Sopenharmony_ci 154062306a36Sopenharmony_ci if (!(req->flags & REQ_F_CQE_SKIP) && 154162306a36Sopenharmony_ci unlikely(!io_fill_cqe_req(ctx, req))) { 154262306a36Sopenharmony_ci if (ctx->lockless_cq) { 154362306a36Sopenharmony_ci spin_lock(&ctx->completion_lock); 154462306a36Sopenharmony_ci io_req_cqe_overflow(req); 154562306a36Sopenharmony_ci spin_unlock(&ctx->completion_lock); 154662306a36Sopenharmony_ci } else { 154762306a36Sopenharmony_ci io_req_cqe_overflow(req); 154862306a36Sopenharmony_ci } 154962306a36Sopenharmony_ci } 155062306a36Sopenharmony_ci } 155162306a36Sopenharmony_ci __io_cq_unlock_post(ctx); 155262306a36Sopenharmony_ci 155362306a36Sopenharmony_ci if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { 155462306a36Sopenharmony_ci io_free_batch_list(ctx, state->compl_reqs.first); 155562306a36Sopenharmony_ci INIT_WQ_LIST(&state->compl_reqs); 155662306a36Sopenharmony_ci } 155762306a36Sopenharmony_ci} 155862306a36Sopenharmony_ci 155962306a36Sopenharmony_cistatic unsigned io_cqring_events(struct io_ring_ctx *ctx) 156062306a36Sopenharmony_ci{ 156162306a36Sopenharmony_ci /* See comment at the top of this file */ 156262306a36Sopenharmony_ci smp_rmb(); 156362306a36Sopenharmony_ci return __io_cqring_events(ctx); 156462306a36Sopenharmony_ci} 156562306a36Sopenharmony_ci 156662306a36Sopenharmony_ci/* 156762306a36Sopenharmony_ci * We can't just wait for polled events to come to us, we have to actively 156862306a36Sopenharmony_ci * find and complete them. 
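 *
 * With IORING_SETUP_IOPOLL there are no completion interrupts: CQEs only
 * appear when io_do_iopoll() is called to poll the underlying queues,
 * which is what the reaping loop below does.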
156962306a36Sopenharmony_ci */ 157062306a36Sopenharmony_cistatic __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 157162306a36Sopenharmony_ci{ 157262306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_IOPOLL)) 157362306a36Sopenharmony_ci return; 157462306a36Sopenharmony_ci 157562306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 157662306a36Sopenharmony_ci while (!wq_list_empty(&ctx->iopoll_list)) { 157762306a36Sopenharmony_ci /* let it sleep and repeat later if can't complete a request */ 157862306a36Sopenharmony_ci if (io_do_iopoll(ctx, true) == 0) 157962306a36Sopenharmony_ci break; 158062306a36Sopenharmony_ci /* 158162306a36Sopenharmony_ci * Ensure we allow local-to-the-cpu processing to take place, 158262306a36Sopenharmony_ci * in this case we need to ensure that we reap all events. 158362306a36Sopenharmony_ci * Also let task_work, etc. to progress by releasing the mutex 158462306a36Sopenharmony_ci */ 158562306a36Sopenharmony_ci if (need_resched()) { 158662306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 158762306a36Sopenharmony_ci cond_resched(); 158862306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 158962306a36Sopenharmony_ci } 159062306a36Sopenharmony_ci } 159162306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 159262306a36Sopenharmony_ci} 159362306a36Sopenharmony_ci 159462306a36Sopenharmony_cistatic int io_iopoll_check(struct io_ring_ctx *ctx, long min) 159562306a36Sopenharmony_ci{ 159662306a36Sopenharmony_ci unsigned int nr_events = 0; 159762306a36Sopenharmony_ci unsigned long check_cq; 159862306a36Sopenharmony_ci 159962306a36Sopenharmony_ci if (!io_allowed_run_tw(ctx)) 160062306a36Sopenharmony_ci return -EEXIST; 160162306a36Sopenharmony_ci 160262306a36Sopenharmony_ci check_cq = READ_ONCE(ctx->check_cq); 160362306a36Sopenharmony_ci if (unlikely(check_cq)) { 160462306a36Sopenharmony_ci if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 160562306a36Sopenharmony_ci __io_cqring_overflow_flush(ctx); 160662306a36Sopenharmony_ci /* 160762306a36Sopenharmony_ci * Similarly do not spin if we have not informed the user of any 160862306a36Sopenharmony_ci * dropped CQE. 160962306a36Sopenharmony_ci */ 161062306a36Sopenharmony_ci if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) 161162306a36Sopenharmony_ci return -EBADR; 161262306a36Sopenharmony_ci } 161362306a36Sopenharmony_ci /* 161462306a36Sopenharmony_ci * Don't enter poll loop if we already have events pending. 161562306a36Sopenharmony_ci * If we do, we can potentially be spinning for commands that 161662306a36Sopenharmony_ci * already triggered a CQE (eg in error). 161762306a36Sopenharmony_ci */ 161862306a36Sopenharmony_ci if (io_cqring_events(ctx)) 161962306a36Sopenharmony_ci return 0; 162062306a36Sopenharmony_ci 162162306a36Sopenharmony_ci do { 162262306a36Sopenharmony_ci int ret = 0; 162362306a36Sopenharmony_ci 162462306a36Sopenharmony_ci /* 162562306a36Sopenharmony_ci * If a submit got punted to a workqueue, we can have the 162662306a36Sopenharmony_ci * application entering polling for a command before it gets 162762306a36Sopenharmony_ci * issued. That app will hold the uring_lock for the duration 162862306a36Sopenharmony_ci * of the poll right here, so we need to take a breather every 162962306a36Sopenharmony_ci * now and then to ensure that the issue has a chance to add 163062306a36Sopenharmony_ci * the poll to the issued list. Otherwise we can spin here 163162306a36Sopenharmony_ci * forever, while the workqueue is stuck trying to acquire the 163262306a36Sopenharmony_ci * very same mutex. 
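 *
 * (A punted request runs into exactly that in io_iopoll_req_issued()
 * below, which takes uring_lock from io-wq context in order to add the
 * request to ->iopoll_list.)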
163362306a36Sopenharmony_ci */ 163462306a36Sopenharmony_ci if (wq_list_empty(&ctx->iopoll_list) || 163562306a36Sopenharmony_ci io_task_work_pending(ctx)) { 163662306a36Sopenharmony_ci u32 tail = ctx->cached_cq_tail; 163762306a36Sopenharmony_ci 163862306a36Sopenharmony_ci (void) io_run_local_work_locked(ctx, min); 163962306a36Sopenharmony_ci 164062306a36Sopenharmony_ci if (task_work_pending(current) || 164162306a36Sopenharmony_ci wq_list_empty(&ctx->iopoll_list)) { 164262306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 164362306a36Sopenharmony_ci io_run_task_work(); 164462306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 164562306a36Sopenharmony_ci } 164662306a36Sopenharmony_ci /* some requests don't go through iopoll_list */ 164762306a36Sopenharmony_ci if (tail != ctx->cached_cq_tail || 164862306a36Sopenharmony_ci wq_list_empty(&ctx->iopoll_list)) 164962306a36Sopenharmony_ci break; 165062306a36Sopenharmony_ci } 165162306a36Sopenharmony_ci ret = io_do_iopoll(ctx, !min); 165262306a36Sopenharmony_ci if (unlikely(ret < 0)) 165362306a36Sopenharmony_ci return ret; 165462306a36Sopenharmony_ci 165562306a36Sopenharmony_ci if (task_sigpending(current)) 165662306a36Sopenharmony_ci return -EINTR; 165762306a36Sopenharmony_ci if (need_resched()) 165862306a36Sopenharmony_ci break; 165962306a36Sopenharmony_ci 166062306a36Sopenharmony_ci nr_events += ret; 166162306a36Sopenharmony_ci } while (nr_events < min); 166262306a36Sopenharmony_ci 166362306a36Sopenharmony_ci return 0; 166462306a36Sopenharmony_ci} 166562306a36Sopenharmony_ci 166662306a36Sopenharmony_civoid io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) 166762306a36Sopenharmony_ci{ 166862306a36Sopenharmony_ci if (ts->locked) 166962306a36Sopenharmony_ci io_req_complete_defer(req); 167062306a36Sopenharmony_ci else 167162306a36Sopenharmony_ci io_req_complete_post(req, IO_URING_F_UNLOCKED); 167262306a36Sopenharmony_ci} 167362306a36Sopenharmony_ci 167462306a36Sopenharmony_ci/* 167562306a36Sopenharmony_ci * After the iocb has been issued, it's safe to be found on the poll list. 167662306a36Sopenharmony_ci * Adding the kiocb to the list AFTER submission ensures that we don't 167762306a36Sopenharmony_ci * find it from a io_do_iopoll() thread before the issuer is done 167862306a36Sopenharmony_ci * accessing the kiocb cookie. 167962306a36Sopenharmony_ci */ 168062306a36Sopenharmony_cistatic void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) 168162306a36Sopenharmony_ci{ 168262306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 168362306a36Sopenharmony_ci const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; 168462306a36Sopenharmony_ci 168562306a36Sopenharmony_ci /* workqueue context doesn't hold uring_lock, grab it now */ 168662306a36Sopenharmony_ci if (unlikely(needs_lock)) 168762306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 168862306a36Sopenharmony_ci 168962306a36Sopenharmony_ci /* 169062306a36Sopenharmony_ci * Track whether we have multiple files in our lists. This will impact 169162306a36Sopenharmony_ci * how we do polling eventually, not spinning if we're on potentially 169262306a36Sopenharmony_ci * different devices. 
169362306a36Sopenharmony_ci */ 169462306a36Sopenharmony_ci if (wq_list_empty(&ctx->iopoll_list)) { 169562306a36Sopenharmony_ci ctx->poll_multi_queue = false; 169662306a36Sopenharmony_ci } else if (!ctx->poll_multi_queue) { 169762306a36Sopenharmony_ci struct io_kiocb *list_req; 169862306a36Sopenharmony_ci 169962306a36Sopenharmony_ci list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, 170062306a36Sopenharmony_ci comp_list); 170162306a36Sopenharmony_ci if (list_req->file != req->file) 170262306a36Sopenharmony_ci ctx->poll_multi_queue = true; 170362306a36Sopenharmony_ci } 170462306a36Sopenharmony_ci 170562306a36Sopenharmony_ci /* 170662306a36Sopenharmony_ci * For fast devices, IO may have already completed. If it has, add 170762306a36Sopenharmony_ci * it to the front so we find it first. 170862306a36Sopenharmony_ci */ 170962306a36Sopenharmony_ci if (READ_ONCE(req->iopoll_completed)) 171062306a36Sopenharmony_ci wq_list_add_head(&req->comp_list, &ctx->iopoll_list); 171162306a36Sopenharmony_ci else 171262306a36Sopenharmony_ci wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci if (unlikely(needs_lock)) { 171562306a36Sopenharmony_ci /* 171662306a36Sopenharmony_ci * If IORING_SETUP_SQPOLL is enabled, sqes are either handle 171762306a36Sopenharmony_ci * in sq thread task context or in io worker task context. If 171862306a36Sopenharmony_ci * current task context is sq thread, we don't need to check 171962306a36Sopenharmony_ci * whether should wake up sq thread. 172062306a36Sopenharmony_ci */ 172162306a36Sopenharmony_ci if ((ctx->flags & IORING_SETUP_SQPOLL) && 172262306a36Sopenharmony_ci wq_has_sleeper(&ctx->sq_data->wait)) 172362306a36Sopenharmony_ci wake_up(&ctx->sq_data->wait); 172462306a36Sopenharmony_ci 172562306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 172662306a36Sopenharmony_ci } 172762306a36Sopenharmony_ci} 172862306a36Sopenharmony_ci 172962306a36Sopenharmony_ciunsigned int io_file_get_flags(struct file *file) 173062306a36Sopenharmony_ci{ 173162306a36Sopenharmony_ci unsigned int res = 0; 173262306a36Sopenharmony_ci 173362306a36Sopenharmony_ci if (S_ISREG(file_inode(file)->i_mode)) 173462306a36Sopenharmony_ci res |= REQ_F_ISREG; 173562306a36Sopenharmony_ci if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) 173662306a36Sopenharmony_ci res |= REQ_F_SUPPORT_NOWAIT; 173762306a36Sopenharmony_ci return res; 173862306a36Sopenharmony_ci} 173962306a36Sopenharmony_ci 174062306a36Sopenharmony_cibool io_alloc_async_data(struct io_kiocb *req) 174162306a36Sopenharmony_ci{ 174262306a36Sopenharmony_ci WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); 174362306a36Sopenharmony_ci req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); 174462306a36Sopenharmony_ci if (req->async_data) { 174562306a36Sopenharmony_ci req->flags |= REQ_F_ASYNC_DATA; 174662306a36Sopenharmony_ci return false; 174762306a36Sopenharmony_ci } 174862306a36Sopenharmony_ci return true; 174962306a36Sopenharmony_ci} 175062306a36Sopenharmony_ci 175162306a36Sopenharmony_ciint io_req_prep_async(struct io_kiocb *req) 175262306a36Sopenharmony_ci{ 175362306a36Sopenharmony_ci const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; 175462306a36Sopenharmony_ci const struct io_issue_def *def = &io_issue_defs[req->opcode]; 175562306a36Sopenharmony_ci 175662306a36Sopenharmony_ci /* assign early for deferred execution for non-fixed file */ 175762306a36Sopenharmony_ci if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && 
!req->file) 175862306a36Sopenharmony_ci req->file = io_file_get_normal(req, req->cqe.fd); 175962306a36Sopenharmony_ci if (!cdef->prep_async) 176062306a36Sopenharmony_ci return 0; 176162306a36Sopenharmony_ci if (WARN_ON_ONCE(req_has_async_data(req))) 176262306a36Sopenharmony_ci return -EFAULT; 176362306a36Sopenharmony_ci if (!def->manual_alloc) { 176462306a36Sopenharmony_ci if (io_alloc_async_data(req)) 176562306a36Sopenharmony_ci return -EAGAIN; 176662306a36Sopenharmony_ci } 176762306a36Sopenharmony_ci return cdef->prep_async(req); 176862306a36Sopenharmony_ci} 176962306a36Sopenharmony_ci 177062306a36Sopenharmony_cistatic u32 io_get_sequence(struct io_kiocb *req) 177162306a36Sopenharmony_ci{ 177262306a36Sopenharmony_ci u32 seq = req->ctx->cached_sq_head; 177362306a36Sopenharmony_ci struct io_kiocb *cur; 177462306a36Sopenharmony_ci 177562306a36Sopenharmony_ci /* need original cached_sq_head, but it was increased for each req */ 177662306a36Sopenharmony_ci io_for_each_link(cur, req) 177762306a36Sopenharmony_ci seq--; 177862306a36Sopenharmony_ci return seq; 177962306a36Sopenharmony_ci} 178062306a36Sopenharmony_ci 178162306a36Sopenharmony_cistatic __cold void io_drain_req(struct io_kiocb *req) 178262306a36Sopenharmony_ci __must_hold(&ctx->uring_lock) 178362306a36Sopenharmony_ci{ 178462306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 178562306a36Sopenharmony_ci struct io_defer_entry *de; 178662306a36Sopenharmony_ci int ret; 178762306a36Sopenharmony_ci u32 seq = io_get_sequence(req); 178862306a36Sopenharmony_ci 178962306a36Sopenharmony_ci /* Still need defer if there is pending req in defer list. */ 179062306a36Sopenharmony_ci spin_lock(&ctx->completion_lock); 179162306a36Sopenharmony_ci if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { 179262306a36Sopenharmony_ci spin_unlock(&ctx->completion_lock); 179362306a36Sopenharmony_ciqueue: 179462306a36Sopenharmony_ci ctx->drain_active = false; 179562306a36Sopenharmony_ci io_req_task_queue(req); 179662306a36Sopenharmony_ci return; 179762306a36Sopenharmony_ci } 179862306a36Sopenharmony_ci spin_unlock(&ctx->completion_lock); 179962306a36Sopenharmony_ci 180062306a36Sopenharmony_ci io_prep_async_link(req); 180162306a36Sopenharmony_ci de = kmalloc(sizeof(*de), GFP_KERNEL); 180262306a36Sopenharmony_ci if (!de) { 180362306a36Sopenharmony_ci ret = -ENOMEM; 180462306a36Sopenharmony_ci io_req_defer_failed(req, ret); 180562306a36Sopenharmony_ci return; 180662306a36Sopenharmony_ci } 180762306a36Sopenharmony_ci 180862306a36Sopenharmony_ci spin_lock(&ctx->completion_lock); 180962306a36Sopenharmony_ci if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 181062306a36Sopenharmony_ci spin_unlock(&ctx->completion_lock); 181162306a36Sopenharmony_ci kfree(de); 181262306a36Sopenharmony_ci goto queue; 181362306a36Sopenharmony_ci } 181462306a36Sopenharmony_ci 181562306a36Sopenharmony_ci trace_io_uring_defer(req); 181662306a36Sopenharmony_ci de->req = req; 181762306a36Sopenharmony_ci de->seq = seq; 181862306a36Sopenharmony_ci list_add_tail(&de->list, &ctx->defer_list); 181962306a36Sopenharmony_ci spin_unlock(&ctx->completion_lock); 182062306a36Sopenharmony_ci} 182162306a36Sopenharmony_ci 182262306a36Sopenharmony_cistatic bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, 182362306a36Sopenharmony_ci unsigned int issue_flags) 182462306a36Sopenharmony_ci{ 182562306a36Sopenharmony_ci if (req->file || !def->needs_file) 182662306a36Sopenharmony_ci return true; 182762306a36Sopenharmony_ci 182862306a36Sopenharmony_ci if 
(req->flags & REQ_F_FIXED_FILE) 182962306a36Sopenharmony_ci req->file = io_file_get_fixed(req, req->cqe.fd, issue_flags); 183062306a36Sopenharmony_ci else 183162306a36Sopenharmony_ci req->file = io_file_get_normal(req, req->cqe.fd); 183262306a36Sopenharmony_ci 183362306a36Sopenharmony_ci return !!req->file; 183462306a36Sopenharmony_ci} 183562306a36Sopenharmony_ci 183662306a36Sopenharmony_cistatic int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 183762306a36Sopenharmony_ci{ 183862306a36Sopenharmony_ci const struct io_issue_def *def = &io_issue_defs[req->opcode]; 183962306a36Sopenharmony_ci const struct cred *creds = NULL; 184062306a36Sopenharmony_ci int ret; 184162306a36Sopenharmony_ci 184262306a36Sopenharmony_ci if (unlikely(!io_assign_file(req, def, issue_flags))) 184362306a36Sopenharmony_ci return -EBADF; 184462306a36Sopenharmony_ci 184562306a36Sopenharmony_ci if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) 184662306a36Sopenharmony_ci creds = override_creds(req->creds); 184762306a36Sopenharmony_ci 184862306a36Sopenharmony_ci if (!def->audit_skip) 184962306a36Sopenharmony_ci audit_uring_entry(req->opcode); 185062306a36Sopenharmony_ci 185162306a36Sopenharmony_ci ret = def->issue(req, issue_flags); 185262306a36Sopenharmony_ci 185362306a36Sopenharmony_ci if (!def->audit_skip) 185462306a36Sopenharmony_ci audit_uring_exit(!ret, ret); 185562306a36Sopenharmony_ci 185662306a36Sopenharmony_ci if (creds) 185762306a36Sopenharmony_ci revert_creds(creds); 185862306a36Sopenharmony_ci 185962306a36Sopenharmony_ci if (ret == IOU_OK) { 186062306a36Sopenharmony_ci if (issue_flags & IO_URING_F_COMPLETE_DEFER) 186162306a36Sopenharmony_ci io_req_complete_defer(req); 186262306a36Sopenharmony_ci else 186362306a36Sopenharmony_ci io_req_complete_post(req, issue_flags); 186462306a36Sopenharmony_ci 186562306a36Sopenharmony_ci return 0; 186662306a36Sopenharmony_ci } 186762306a36Sopenharmony_ci 186862306a36Sopenharmony_ci if (ret != IOU_ISSUE_SKIP_COMPLETE) 186962306a36Sopenharmony_ci return ret; 187062306a36Sopenharmony_ci 187162306a36Sopenharmony_ci /* If the op doesn't have a file, we're not polling for it */ 187262306a36Sopenharmony_ci if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) 187362306a36Sopenharmony_ci io_iopoll_req_issued(req, issue_flags); 187462306a36Sopenharmony_ci 187562306a36Sopenharmony_ci return 0; 187662306a36Sopenharmony_ci} 187762306a36Sopenharmony_ci 187862306a36Sopenharmony_ciint io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) 187962306a36Sopenharmony_ci{ 188062306a36Sopenharmony_ci io_tw_lock(req->ctx, ts); 188162306a36Sopenharmony_ci return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| 188262306a36Sopenharmony_ci IO_URING_F_COMPLETE_DEFER); 188362306a36Sopenharmony_ci} 188462306a36Sopenharmony_ci 188562306a36Sopenharmony_cistruct io_wq_work *io_wq_free_work(struct io_wq_work *work) 188662306a36Sopenharmony_ci{ 188762306a36Sopenharmony_ci struct io_kiocb *req = container_of(work, struct io_kiocb, work); 188862306a36Sopenharmony_ci struct io_kiocb *nxt = NULL; 188962306a36Sopenharmony_ci 189062306a36Sopenharmony_ci if (req_ref_put_and_test(req)) { 189162306a36Sopenharmony_ci if (req->flags & IO_REQ_LINK_FLAGS) 189262306a36Sopenharmony_ci nxt = io_req_find_next(req); 189362306a36Sopenharmony_ci io_free_req(req); 189462306a36Sopenharmony_ci } 189562306a36Sopenharmony_ci return nxt ? 
&nxt->work : NULL; 189662306a36Sopenharmony_ci} 189762306a36Sopenharmony_ci 189862306a36Sopenharmony_civoid io_wq_submit_work(struct io_wq_work *work) 189962306a36Sopenharmony_ci{ 190062306a36Sopenharmony_ci struct io_kiocb *req = container_of(work, struct io_kiocb, work); 190162306a36Sopenharmony_ci const struct io_issue_def *def = &io_issue_defs[req->opcode]; 190262306a36Sopenharmony_ci unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; 190362306a36Sopenharmony_ci bool needs_poll = false; 190462306a36Sopenharmony_ci int ret = 0, err = -ECANCELED; 190562306a36Sopenharmony_ci 190662306a36Sopenharmony_ci /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ 190762306a36Sopenharmony_ci if (!(req->flags & REQ_F_REFCOUNT)) 190862306a36Sopenharmony_ci __io_req_set_refcount(req, 2); 190962306a36Sopenharmony_ci else 191062306a36Sopenharmony_ci req_ref_get(req); 191162306a36Sopenharmony_ci 191262306a36Sopenharmony_ci io_arm_ltimeout(req); 191362306a36Sopenharmony_ci 191462306a36Sopenharmony_ci /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 191562306a36Sopenharmony_ci if (work->flags & IO_WQ_WORK_CANCEL) { 191662306a36Sopenharmony_cifail: 191762306a36Sopenharmony_ci io_req_task_queue_fail(req, err); 191862306a36Sopenharmony_ci return; 191962306a36Sopenharmony_ci } 192062306a36Sopenharmony_ci if (!io_assign_file(req, def, issue_flags)) { 192162306a36Sopenharmony_ci err = -EBADF; 192262306a36Sopenharmony_ci work->flags |= IO_WQ_WORK_CANCEL; 192362306a36Sopenharmony_ci goto fail; 192462306a36Sopenharmony_ci } 192562306a36Sopenharmony_ci 192662306a36Sopenharmony_ci if (req->flags & REQ_F_FORCE_ASYNC) { 192762306a36Sopenharmony_ci bool opcode_poll = def->pollin || def->pollout; 192862306a36Sopenharmony_ci 192962306a36Sopenharmony_ci if (opcode_poll && file_can_poll(req->file)) { 193062306a36Sopenharmony_ci needs_poll = true; 193162306a36Sopenharmony_ci issue_flags |= IO_URING_F_NONBLOCK; 193262306a36Sopenharmony_ci } 193362306a36Sopenharmony_ci } 193462306a36Sopenharmony_ci 193562306a36Sopenharmony_ci do { 193662306a36Sopenharmony_ci ret = io_issue_sqe(req, issue_flags); 193762306a36Sopenharmony_ci if (ret != -EAGAIN) 193862306a36Sopenharmony_ci break; 193962306a36Sopenharmony_ci 194062306a36Sopenharmony_ci /* 194162306a36Sopenharmony_ci * If REQ_F_NOWAIT is set, then don't wait or retry with 194262306a36Sopenharmony_ci * poll. -EAGAIN is final for that case. 194362306a36Sopenharmony_ci */ 194462306a36Sopenharmony_ci if (req->flags & REQ_F_NOWAIT) 194562306a36Sopenharmony_ci break; 194662306a36Sopenharmony_ci 194762306a36Sopenharmony_ci /* 194862306a36Sopenharmony_ci * We can get EAGAIN for iopolled IO even though we're 194962306a36Sopenharmony_ci * forcing a sync submission from here, since we can't 195062306a36Sopenharmony_ci * wait for request slots on the block side. 
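 *
 * (e.g. the device's submission queue is full; with IOPOLL a slot only
 * frees up once completions are reaped by polling, so the loop just
 * retries with a cond_resched() in between.)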
195162306a36Sopenharmony_ci */ 195262306a36Sopenharmony_ci if (!needs_poll) { 195362306a36Sopenharmony_ci if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) 195462306a36Sopenharmony_ci break; 195562306a36Sopenharmony_ci if (io_wq_worker_stopped()) 195662306a36Sopenharmony_ci break; 195762306a36Sopenharmony_ci cond_resched(); 195862306a36Sopenharmony_ci continue; 195962306a36Sopenharmony_ci } 196062306a36Sopenharmony_ci 196162306a36Sopenharmony_ci if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK) 196262306a36Sopenharmony_ci return; 196362306a36Sopenharmony_ci /* aborted or ready, in either case retry blocking */ 196462306a36Sopenharmony_ci needs_poll = false; 196562306a36Sopenharmony_ci issue_flags &= ~IO_URING_F_NONBLOCK; 196662306a36Sopenharmony_ci } while (1); 196762306a36Sopenharmony_ci 196862306a36Sopenharmony_ci /* avoid locking problems by failing it from a clean context */ 196962306a36Sopenharmony_ci if (ret < 0) 197062306a36Sopenharmony_ci io_req_task_queue_fail(req, ret); 197162306a36Sopenharmony_ci} 197262306a36Sopenharmony_ci 197362306a36Sopenharmony_ciinline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, 197462306a36Sopenharmony_ci unsigned int issue_flags) 197562306a36Sopenharmony_ci{ 197662306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 197762306a36Sopenharmony_ci struct io_fixed_file *slot; 197862306a36Sopenharmony_ci struct file *file = NULL; 197962306a36Sopenharmony_ci 198062306a36Sopenharmony_ci io_ring_submit_lock(ctx, issue_flags); 198162306a36Sopenharmony_ci 198262306a36Sopenharmony_ci if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 198362306a36Sopenharmony_ci goto out; 198462306a36Sopenharmony_ci fd = array_index_nospec(fd, ctx->nr_user_files); 198562306a36Sopenharmony_ci slot = io_fixed_file_slot(&ctx->file_table, fd); 198662306a36Sopenharmony_ci file = io_slot_file(slot); 198762306a36Sopenharmony_ci req->flags |= io_slot_flags(slot); 198862306a36Sopenharmony_ci io_req_set_rsrc_node(req, ctx, 0); 198962306a36Sopenharmony_ciout: 199062306a36Sopenharmony_ci io_ring_submit_unlock(ctx, issue_flags); 199162306a36Sopenharmony_ci return file; 199262306a36Sopenharmony_ci} 199362306a36Sopenharmony_ci 199462306a36Sopenharmony_cistruct file *io_file_get_normal(struct io_kiocb *req, int fd) 199562306a36Sopenharmony_ci{ 199662306a36Sopenharmony_ci struct file *file = fget(fd); 199762306a36Sopenharmony_ci 199862306a36Sopenharmony_ci trace_io_uring_file_get(req, fd); 199962306a36Sopenharmony_ci 200062306a36Sopenharmony_ci /* we don't allow fixed io_uring files */ 200162306a36Sopenharmony_ci if (file && io_is_uring_fops(file)) 200262306a36Sopenharmony_ci io_req_track_inflight(req); 200362306a36Sopenharmony_ci return file; 200462306a36Sopenharmony_ci} 200562306a36Sopenharmony_ci 200662306a36Sopenharmony_cistatic void io_queue_async(struct io_kiocb *req, int ret) 200762306a36Sopenharmony_ci __must_hold(&req->ctx->uring_lock) 200862306a36Sopenharmony_ci{ 200962306a36Sopenharmony_ci struct io_kiocb *linked_timeout; 201062306a36Sopenharmony_ci 201162306a36Sopenharmony_ci if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { 201262306a36Sopenharmony_ci io_req_defer_failed(req, ret); 201362306a36Sopenharmony_ci return; 201462306a36Sopenharmony_ci } 201562306a36Sopenharmony_ci 201662306a36Sopenharmony_ci linked_timeout = io_prep_linked_timeout(req); 201762306a36Sopenharmony_ci 201862306a36Sopenharmony_ci switch (io_arm_poll_handler(req, 0)) { 201962306a36Sopenharmony_ci case IO_APOLL_READY: 202062306a36Sopenharmony_ci io_kbuf_recycle(req, 0); 
202162306a36Sopenharmony_ci io_req_task_queue(req); 202262306a36Sopenharmony_ci break; 202362306a36Sopenharmony_ci case IO_APOLL_ABORTED: 202462306a36Sopenharmony_ci io_kbuf_recycle(req, 0); 202562306a36Sopenharmony_ci io_queue_iowq(req, NULL); 202662306a36Sopenharmony_ci break; 202762306a36Sopenharmony_ci case IO_APOLL_OK: 202862306a36Sopenharmony_ci break; 202962306a36Sopenharmony_ci } 203062306a36Sopenharmony_ci 203162306a36Sopenharmony_ci if (linked_timeout) 203262306a36Sopenharmony_ci io_queue_linked_timeout(linked_timeout); 203362306a36Sopenharmony_ci} 203462306a36Sopenharmony_ci 203562306a36Sopenharmony_cistatic inline void io_queue_sqe(struct io_kiocb *req) 203662306a36Sopenharmony_ci __must_hold(&req->ctx->uring_lock) 203762306a36Sopenharmony_ci{ 203862306a36Sopenharmony_ci int ret; 203962306a36Sopenharmony_ci 204062306a36Sopenharmony_ci ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 204162306a36Sopenharmony_ci 204262306a36Sopenharmony_ci /* 204362306a36Sopenharmony_ci * We async punt it if the file wasn't marked NOWAIT, or if the file 204462306a36Sopenharmony_ci * doesn't support non-blocking read/write attempts 204562306a36Sopenharmony_ci */ 204662306a36Sopenharmony_ci if (likely(!ret)) 204762306a36Sopenharmony_ci io_arm_ltimeout(req); 204862306a36Sopenharmony_ci else 204962306a36Sopenharmony_ci io_queue_async(req, ret); 205062306a36Sopenharmony_ci} 205162306a36Sopenharmony_ci 205262306a36Sopenharmony_cistatic void io_queue_sqe_fallback(struct io_kiocb *req) 205362306a36Sopenharmony_ci __must_hold(&req->ctx->uring_lock) 205462306a36Sopenharmony_ci{ 205562306a36Sopenharmony_ci if (unlikely(req->flags & REQ_F_FAIL)) { 205662306a36Sopenharmony_ci /* 205762306a36Sopenharmony_ci * We don't submit, fail them all, for that replace hardlinks 205862306a36Sopenharmony_ci * with normal links. Extra REQ_F_LINK is tolerated. 205962306a36Sopenharmony_ci */ 206062306a36Sopenharmony_ci req->flags &= ~REQ_F_HARDLINK; 206162306a36Sopenharmony_ci req->flags |= REQ_F_LINK; 206262306a36Sopenharmony_ci io_req_defer_failed(req, req->cqe.res); 206362306a36Sopenharmony_ci } else { 206462306a36Sopenharmony_ci int ret = io_req_prep_async(req); 206562306a36Sopenharmony_ci 206662306a36Sopenharmony_ci if (unlikely(ret)) { 206762306a36Sopenharmony_ci io_req_defer_failed(req, ret); 206862306a36Sopenharmony_ci return; 206962306a36Sopenharmony_ci } 207062306a36Sopenharmony_ci 207162306a36Sopenharmony_ci if (unlikely(req->ctx->drain_active)) 207262306a36Sopenharmony_ci io_drain_req(req); 207362306a36Sopenharmony_ci else 207462306a36Sopenharmony_ci io_queue_iowq(req, NULL); 207562306a36Sopenharmony_ci } 207662306a36Sopenharmony_ci} 207762306a36Sopenharmony_ci 207862306a36Sopenharmony_ci/* 207962306a36Sopenharmony_ci * Check SQE restrictions (opcode and flags). 208062306a36Sopenharmony_ci * 208162306a36Sopenharmony_ci * Returns 'true' if SQE is allowed, 'false' otherwise. 
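 *
 * The restriction set is registered up front via
 * io_uring_register(IORING_REGISTER_RESTRICTIONS): IORING_RESTRICTION_SQE_OP
 * entries populate the opcode bitmap checked first, while
 * IORING_RESTRICTION_SQE_FLAGS_ALLOWED/_REQUIRED fill the two flag masks
 * used below.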
208262306a36Sopenharmony_ci */ 208362306a36Sopenharmony_cistatic inline bool io_check_restriction(struct io_ring_ctx *ctx, 208462306a36Sopenharmony_ci struct io_kiocb *req, 208562306a36Sopenharmony_ci unsigned int sqe_flags) 208662306a36Sopenharmony_ci{ 208762306a36Sopenharmony_ci if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 208862306a36Sopenharmony_ci return false; 208962306a36Sopenharmony_ci 209062306a36Sopenharmony_ci if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 209162306a36Sopenharmony_ci ctx->restrictions.sqe_flags_required) 209262306a36Sopenharmony_ci return false; 209362306a36Sopenharmony_ci 209462306a36Sopenharmony_ci if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 209562306a36Sopenharmony_ci ctx->restrictions.sqe_flags_required)) 209662306a36Sopenharmony_ci return false; 209762306a36Sopenharmony_ci 209862306a36Sopenharmony_ci return true; 209962306a36Sopenharmony_ci} 210062306a36Sopenharmony_ci 210162306a36Sopenharmony_cistatic void io_init_req_drain(struct io_kiocb *req) 210262306a36Sopenharmony_ci{ 210362306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 210462306a36Sopenharmony_ci struct io_kiocb *head = ctx->submit_state.link.head; 210562306a36Sopenharmony_ci 210662306a36Sopenharmony_ci ctx->drain_active = true; 210762306a36Sopenharmony_ci if (head) { 210862306a36Sopenharmony_ci /* 210962306a36Sopenharmony_ci * If we need to drain a request in the middle of a link, drain 211062306a36Sopenharmony_ci * the head request and the next request/link after the current 211162306a36Sopenharmony_ci * link. Considering sequential execution of links, 211262306a36Sopenharmony_ci * REQ_F_IO_DRAIN will be maintained for every request of our 211362306a36Sopenharmony_ci * link. 211462306a36Sopenharmony_ci */ 211562306a36Sopenharmony_ci head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; 211662306a36Sopenharmony_ci ctx->drain_next = true; 211762306a36Sopenharmony_ci } 211862306a36Sopenharmony_ci} 211962306a36Sopenharmony_ci 212062306a36Sopenharmony_cistatic int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 212162306a36Sopenharmony_ci const struct io_uring_sqe *sqe) 212262306a36Sopenharmony_ci __must_hold(&ctx->uring_lock) 212362306a36Sopenharmony_ci{ 212462306a36Sopenharmony_ci const struct io_issue_def *def; 212562306a36Sopenharmony_ci unsigned int sqe_flags; 212662306a36Sopenharmony_ci int personality; 212762306a36Sopenharmony_ci u8 opcode; 212862306a36Sopenharmony_ci 212962306a36Sopenharmony_ci /* req is partially pre-initialised, see io_preinit_req() */ 213062306a36Sopenharmony_ci req->opcode = opcode = READ_ONCE(sqe->opcode); 213162306a36Sopenharmony_ci /* same numerical values with corresponding REQ_F_*, safe to copy */ 213262306a36Sopenharmony_ci req->flags = sqe_flags = READ_ONCE(sqe->flags); 213362306a36Sopenharmony_ci req->cqe.user_data = READ_ONCE(sqe->user_data); 213462306a36Sopenharmony_ci req->file = NULL; 213562306a36Sopenharmony_ci req->rsrc_node = NULL; 213662306a36Sopenharmony_ci req->task = current; 213762306a36Sopenharmony_ci 213862306a36Sopenharmony_ci if (unlikely(opcode >= IORING_OP_LAST)) { 213962306a36Sopenharmony_ci req->opcode = 0; 214062306a36Sopenharmony_ci return -EINVAL; 214162306a36Sopenharmony_ci } 214262306a36Sopenharmony_ci def = &io_issue_defs[opcode]; 214362306a36Sopenharmony_ci if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 214462306a36Sopenharmony_ci /* enforce forwards compatibility on users */ 214562306a36Sopenharmony_ci if (sqe_flags & ~SQE_VALID_FLAGS) 214662306a36Sopenharmony_ci return -EINVAL; 
214762306a36Sopenharmony_ci if (sqe_flags & IOSQE_BUFFER_SELECT) { 214862306a36Sopenharmony_ci if (!def->buffer_select) 214962306a36Sopenharmony_ci return -EOPNOTSUPP; 215062306a36Sopenharmony_ci req->buf_index = READ_ONCE(sqe->buf_group); 215162306a36Sopenharmony_ci } 215262306a36Sopenharmony_ci if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) 215362306a36Sopenharmony_ci ctx->drain_disabled = true; 215462306a36Sopenharmony_ci if (sqe_flags & IOSQE_IO_DRAIN) { 215562306a36Sopenharmony_ci if (ctx->drain_disabled) 215662306a36Sopenharmony_ci return -EOPNOTSUPP; 215762306a36Sopenharmony_ci io_init_req_drain(req); 215862306a36Sopenharmony_ci } 215962306a36Sopenharmony_ci } 216062306a36Sopenharmony_ci if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { 216162306a36Sopenharmony_ci if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) 216262306a36Sopenharmony_ci return -EACCES; 216362306a36Sopenharmony_ci /* knock it to the slow queue path, will be drained there */ 216462306a36Sopenharmony_ci if (ctx->drain_active) 216562306a36Sopenharmony_ci req->flags |= REQ_F_FORCE_ASYNC; 216662306a36Sopenharmony_ci /* if there is no link, we're at "next" request and need to drain */ 216762306a36Sopenharmony_ci if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { 216862306a36Sopenharmony_ci ctx->drain_next = false; 216962306a36Sopenharmony_ci ctx->drain_active = true; 217062306a36Sopenharmony_ci req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; 217162306a36Sopenharmony_ci } 217262306a36Sopenharmony_ci } 217362306a36Sopenharmony_ci 217462306a36Sopenharmony_ci if (!def->ioprio && sqe->ioprio) 217562306a36Sopenharmony_ci return -EINVAL; 217662306a36Sopenharmony_ci if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) 217762306a36Sopenharmony_ci return -EINVAL; 217862306a36Sopenharmony_ci 217962306a36Sopenharmony_ci if (def->needs_file) { 218062306a36Sopenharmony_ci struct io_submit_state *state = &ctx->submit_state; 218162306a36Sopenharmony_ci 218262306a36Sopenharmony_ci req->cqe.fd = READ_ONCE(sqe->fd); 218362306a36Sopenharmony_ci 218462306a36Sopenharmony_ci /* 218562306a36Sopenharmony_ci * Plug now if we have more than 2 IO left after this, and the 218662306a36Sopenharmony_ci * target is potentially a read/write to block based storage. 
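 *
 * (state->need_plug and state->submit_nr are primed in
 * io_submit_state_start() further down, so blk_start_plug_nr_ios() is told
 * how many submissions to expect at most.)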
218762306a36Sopenharmony_ci */ 218862306a36Sopenharmony_ci if (state->need_plug && def->plug) { 218962306a36Sopenharmony_ci state->plug_started = true; 219062306a36Sopenharmony_ci state->need_plug = false; 219162306a36Sopenharmony_ci blk_start_plug_nr_ios(&state->plug, state->submit_nr); 219262306a36Sopenharmony_ci } 219362306a36Sopenharmony_ci } 219462306a36Sopenharmony_ci 219562306a36Sopenharmony_ci personality = READ_ONCE(sqe->personality); 219662306a36Sopenharmony_ci if (personality) { 219762306a36Sopenharmony_ci int ret; 219862306a36Sopenharmony_ci 219962306a36Sopenharmony_ci req->creds = xa_load(&ctx->personalities, personality); 220062306a36Sopenharmony_ci if (!req->creds) 220162306a36Sopenharmony_ci return -EINVAL; 220262306a36Sopenharmony_ci get_cred(req->creds); 220362306a36Sopenharmony_ci ret = security_uring_override_creds(req->creds); 220462306a36Sopenharmony_ci if (ret) { 220562306a36Sopenharmony_ci put_cred(req->creds); 220662306a36Sopenharmony_ci return ret; 220762306a36Sopenharmony_ci } 220862306a36Sopenharmony_ci req->flags |= REQ_F_CREDS; 220962306a36Sopenharmony_ci } 221062306a36Sopenharmony_ci 221162306a36Sopenharmony_ci return def->prep(req, sqe); 221262306a36Sopenharmony_ci} 221362306a36Sopenharmony_ci 221462306a36Sopenharmony_cistatic __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, 221562306a36Sopenharmony_ci struct io_kiocb *req, int ret) 221662306a36Sopenharmony_ci{ 221762306a36Sopenharmony_ci struct io_ring_ctx *ctx = req->ctx; 221862306a36Sopenharmony_ci struct io_submit_link *link = &ctx->submit_state.link; 221962306a36Sopenharmony_ci struct io_kiocb *head = link->head; 222062306a36Sopenharmony_ci 222162306a36Sopenharmony_ci trace_io_uring_req_failed(sqe, req, ret); 222262306a36Sopenharmony_ci 222362306a36Sopenharmony_ci /* 222462306a36Sopenharmony_ci * Avoid breaking links in the middle as it renders links with SQPOLL 222562306a36Sopenharmony_ci * unusable. Instead of failing eagerly, continue assembling the link if 222662306a36Sopenharmony_ci * applicable and mark the head with REQ_F_FAIL. The link flushing code 222762306a36Sopenharmony_ci * should find the flag and handle the rest. 
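 *
 * Roughly: if the second request of a chain A -> B -> C fails init, A is
 * marked failed with -ECANCELED, B is appended with its own error, C keeps
 * being collected, and the whole chain is failed together once it is
 * flushed.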
	 */
	req_fail_link_node(req, ret);
	if (head && !(head->flags & REQ_F_FAIL))
		req_fail_link_node(head, -ECANCELED);

	if (!(req->flags & IO_REQ_LINK_FLAGS)) {
		if (head) {
			link->last->link = req;
			link->head = NULL;
			req = head;
		}
		io_queue_sqe_fallback(req);
		return ret;
	}

	if (head)
		link->last->link = req;
	else
		link->head = req;
	link->last = req;
	return 0;
}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
				const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{
	struct io_submit_link *link = &ctx->submit_state.link;
	int ret;

	ret = io_init_req(ctx, req, sqe);
	if (unlikely(ret))
		return io_submit_fail_init(sqe, req, ret);

	trace_io_uring_submit_req(req);

	/*
	 * If we already have a head request, queue this one for async
	 * submittal once the head completes. If we don't have a head but
	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
	 * submitted sync once the chain is complete. If none of those
	 * conditions are true (normal request), then just queue it.
	 */
	if (unlikely(link->head)) {
		ret = io_req_prep_async(req);
		if (unlikely(ret))
			return io_submit_fail_init(sqe, req, ret);

		trace_io_uring_link(req, link->head);
		link->last->link = req;
		link->last = req;

		if (req->flags & IO_REQ_LINK_FLAGS)
			return 0;
		/* last request of the link, flush it */
		req = link->head;
		link->head = NULL;
		if (req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL))
			goto fallback;

	} else if (unlikely(req->flags & (IO_REQ_LINK_FLAGS |
					  REQ_F_FORCE_ASYNC | REQ_F_FAIL))) {
		if (req->flags & IO_REQ_LINK_FLAGS) {
			link->head = req;
			link->last = req;
		} else {
fallback:
			io_queue_sqe_fallback(req);
		}
		return 0;
	}

	io_queue_sqe(req);
	return 0;
}

/*
 * Batched submission is done, ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;

	if (unlikely(state->link.head))
		io_queue_sqe_fallback(state->link.head);
	/* flush only after queuing links as they can generate completions */
	io_submit_flush_completions(ctx);
	if (state->plug_started)
		blk_finish_plug(&state->plug);
}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{
	state->plug_started = false;
	state->need_plug = max_ios > 2;
	state->submit_nr = max_ios;
	/* set only head, no need to init link_last in advance */
	state->link.head = NULL;
}

static void io_commit_sqring(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;

	/*
	 * Ensure any loads from the SQEs are done at this point,
	 * since once we write the new head, the application could
	 * write new data to them.
	 */
	smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}

/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{
	unsigned mask = ctx->sq_entries - 1;
	unsigned head = ctx->cached_sq_head++ & mask;

	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) {
		head = READ_ONCE(ctx->sq_array[head]);
		if (unlikely(head >= ctx->sq_entries)) {
			/* drop invalid entries */
			spin_lock(&ctx->completion_lock);
			ctx->cq_extra--;
			spin_unlock(&ctx->completion_lock);
			WRITE_ONCE(ctx->rings->sq_dropped,
				   READ_ONCE(ctx->rings->sq_dropped) + 1);
			return false;
		}
	}

	/*
	 * The cached sq head (or cq tail) serves two purposes:
	 *
	 * 1) allows us to batch the cost of updating the user visible
	 *    head updates.
	 * 2) allows the kernel side to track the head on its own, even
	 *    though the application is the one updating it.
	 */

	/* double index for 128-byte SQEs, twice as long */
	if (ctx->flags & IORING_SETUP_SQE128)
		head <<= 1;
	*sqe = &ctx->sq_sqes[head];
	return true;
}

int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{
	unsigned int entries = io_sqring_entries(ctx);
	unsigned int left;
	int ret;

	if (unlikely(!entries))
		return 0;
	/* make sure SQ entry isn't read before tail */
	ret = left = min(nr, entries);
	io_get_task_refs(left);
	io_submit_state_start(&ctx->submit_state, left);

	do {
		const struct io_uring_sqe *sqe;
		struct io_kiocb *req;

		if (unlikely(!io_alloc_req(ctx, &req)))
			break;
		if (unlikely(!io_get_sqe(ctx, &sqe))) {
			io_req_add_to_cache(req, ctx);
			break;
		}

		/*
		 * Continue submitting even for sqe failure if the
		 * ring was setup with IORING_SETUP_SUBMIT_ALL
		 */
		if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
		    !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
			left--;
			break;
		}
	} while (--left);

	if (unlikely(left)) {
		ret -= left;
		/* try again if it submitted nothing and can't allocate a req */
		if (!ret && io_req_cache_empty(ctx))
			ret = -EAGAIN;
		current->io_uring->cached_refs += left;
	}

	io_submit_state_end(ctx);
	/* Commit SQ ring head once we've consumed and submitted all SQEs */
	io_commit_sqring(ctx);
	return ret;
}

struct io_wait_queue {
	struct wait_queue_entry wq;
	struct io_ring_ctx *ctx;
	unsigned cq_tail;
	unsigned nr_timeouts;
	ktime_t timeout;
};

static inline bool io_has_work(struct io_ring_ctx *ctx)
{
	return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
	       !llist_empty(&ctx->work_llist);
}
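
/*
 * Descriptive note: the helpers below implement the CQ-side wait path used
 * by io_uring_enter(2) with IORING_ENTER_GETEVENTS. io_should_wake() checks
 * whether enough completions (or a timeout) have arrived, io_wake_function()
 * is the cq_wait wakeup callback that defers overflow flushing to the task,
 * and io_cqring_wait() ties it all together and sleeps until the target CQ
 * tail is reached.
 */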

static inline bool io_should_wake(struct io_wait_queue *iowq)
{
	struct io_ring_ctx *ctx = iowq->ctx;
	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;

	/*
	 * Wake up if we have enough events, or if a timeout occurred since we
	 * started waiting. For timeouts, we always want to return to userspace,
	 * regardless of event count.
	 */
	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}

static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{
	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq);

	/*
	 * Cannot safely flush overflowed CQEs from here, ensure we wake up
	 * the task, and the next invocation will do it.
	 */
	if (io_should_wake(iowq) || io_has_work(iowq->ctx))
		return autoremove_wake_function(curr, mode, wake_flags, key);
	return -1;
}

int io_run_task_work_sig(struct io_ring_ctx *ctx)
{
	if (!llist_empty(&ctx->work_llist)) {
		__set_current_state(TASK_RUNNING);
		if (io_run_local_work(ctx, INT_MAX) > 0)
			return 0;
	}
	if (io_run_task_work() > 0)
		return 0;
	if (task_sigpending(current))
		return -EINTR;
	return 0;
}

static bool current_pending_io(void)
{
	struct io_uring_task *tctx = current->io_uring;

	if (!tctx)
		return false;
	return percpu_counter_read_positive(&tctx->inflight);
}

/* if this returns > 0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq)
{
	int ret;

	if (unlikely(READ_ONCE(ctx->check_cq)))
		return 1;
	if (unlikely(!llist_empty(&ctx->work_llist)))
		return 1;
	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
		return 1;
	if (unlikely(task_sigpending(current)))
		return -EINTR;
	if (unlikely(io_should_wake(iowq)))
		return 0;

	/*
	 * Mark us as being in io_wait if we have pending requests, so cpufreq
	 * can take into account that the task is waiting for IO - turns out
	 * to be important for low QD IO.
	 */
	if (current_pending_io())
		current->in_iowait = 1;
	ret = 0;
	if (iowq->timeout == KTIME_MAX)
		schedule();
	else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS))
		ret = -ETIME;
	current->in_iowait = 0;
	return ret;
}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
{
	struct io_wait_queue iowq;
	struct io_rings *rings = ctx->rings;
	int ret;

	if (!io_allowed_run_tw(ctx))
		return -EEXIST;
	if (!llist_empty(&ctx->work_llist))
		io_run_local_work(ctx, min_events);
	io_run_task_work();
	io_cqring_overflow_flush(ctx);
	/* if user messes with these they will just get an early return */
	if (__io_cqring_events_user(ctx) >= min_events)
		return 0;

	if (sig) {
#ifdef CONFIG_COMPAT
		if (in_compat_syscall())
			ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
						      sigsz);
		else
#endif
			ret = set_user_sigmask(sig, sigsz);

		if (ret)
			return ret;
	}

	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
	iowq.wq.private = current;
	INIT_LIST_HEAD(&iowq.wq.entry);
	iowq.ctx = ctx;
	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
	iowq.timeout = KTIME_MAX;

	if (uts) {
		struct timespec64 ts;

		if (get_timespec64(&ts, uts))
			return -EFAULT;
		iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts),
					    ktime_get_ns());
	}

	trace_io_uring_cqring_wait(ctx, min_events);
	do {
		int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail);
		unsigned long check_cq;

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
			atomic_set(&ctx->cq_wait_nr, nr_wait);
			set_current_state(TASK_INTERRUPTIBLE);
		} else {
			prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
						  TASK_INTERRUPTIBLE);
		}

		ret = io_cqring_wait_schedule(ctx, &iowq);
		__set_current_state(TASK_RUNNING);
		atomic_set(&ctx->cq_wait_nr, 0);

		/*
		 * Run task_work after scheduling and before io_should_wake().
		 * If we got woken because of task_work being processed, run it
		 * now rather than let the caller do another wait loop.
		 */
		io_run_task_work();
		if (!llist_empty(&ctx->work_llist))
			io_run_local_work(ctx, nr_wait);

		/*
		 * Non-local task_work will be run on exit to userspace, but
		 * if we're using DEFER_TASKRUN, then we could have waited
		 * with a timeout for a number of requests. If the timeout
		 * hits, we could have some requests ready to process. Ensure
		 * this break is _after_ we have run task_work, to avoid
		 * deferring running potentially pending requests until the
		 * next time we wait for events.
		 */
		if (ret < 0)
			break;

		check_cq = READ_ONCE(ctx->check_cq);
		if (unlikely(check_cq)) {
			/* let the caller flush overflows, retry */
			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
				io_cqring_do_overflow_flush(ctx);
			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
				ret = -EBADR;
				break;
			}
		}

		if (io_should_wake(&iowq)) {
			ret = 0;
			break;
		}
		cond_resched();
	} while (1);

	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		finish_wait(&ctx->cq_wait, &iowq.wq);
	restore_saved_sigmask_unless(ret == -EINTR);

	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ?
			ret : 0;
}

void io_mem_free(void *ptr)
{
	if (!ptr)
		return;

	folio_put(virt_to_folio(ptr));
}

static void io_pages_free(struct page ***pages, int npages)
{
	struct page **page_array;
	int i;

	if (!pages)
		return;

	page_array = *pages;
	if (!page_array)
		return;

	for (i = 0; i < npages; i++)
		unpin_user_page(page_array[i]);
	kvfree(page_array);
	*pages = NULL;
}

static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
			    unsigned long uaddr, size_t size)
{
	struct page **page_array;
	unsigned int nr_pages;
	void *page_addr;
	int ret, i, pinned;

	*npages = 0;

	if (uaddr & (PAGE_SIZE - 1) || !size)
		return ERR_PTR(-EINVAL);

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > USHRT_MAX)
		return ERR_PTR(-EINVAL);
	page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!page_array)
		return ERR_PTR(-ENOMEM);

	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
				     page_array);
	if (pinned != nr_pages) {
		ret = (pinned < 0) ? pinned : -EFAULT;
		goto free_pages;
	}

	page_addr = page_address(page_array[0]);
	for (i = 0; i < nr_pages; i++) {
		ret = -EINVAL;

		/*
		 * Can't support mapping user allocated ring memory on 32-bit
		 * archs where it could potentially reside in highmem. Just
		 * fail those with -EINVAL, just like we did on kernels that
		 * didn't support this feature.
		 */
		if (PageHighMem(page_array[i]))
			goto free_pages;

		/*
		 * No support for discontig pages for now, should either be a
		 * single normal page, or a huge page.
		 * Later on we can add support for remapping discontig pages,
		 * for now we will just fail them with EINVAL.
		 */
		if (page_address(page_array[i]) != page_addr)
			goto free_pages;
		page_addr += PAGE_SIZE;
	}

	*pages = page_array;
	*npages = nr_pages;
	return page_to_virt(page_array[0]);

free_pages:
	io_pages_free(&page_array, pinned > 0 ? pinned : 0);
	return ERR_PTR(ret);
}

static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
			  size_t size)
{
	return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
			      size);
}

static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
			 size_t size)
{
	return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
			      size);
}

static void io_rings_free(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
		io_mem_free(ctx->rings);
		io_mem_free(ctx->sq_sqes);
		ctx->rings = NULL;
		ctx->sq_sqes = NULL;
	} else {
		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
		ctx->n_ring_pages = 0;
		io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
		ctx->n_sqe_pages = 0;
	}
}

void *io_mem_alloc(size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
	void *ret;

	ret = (void *) __get_free_pages(gfp, get_order(size));
	if (ret)
		return ret;
	return ERR_PTR(-ENOMEM);
}

static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
				unsigned int cq_entries, size_t *sq_offset)
{
	struct io_rings *rings;
	size_t off, sq_array_size;

	off = struct_size(rings, cqes, cq_entries);
	if (off == SIZE_MAX)
		return SIZE_MAX;
	if (ctx->flags & IORING_SETUP_CQE32) {
		if (check_shl_overflow(off, 1, &off))
			return SIZE_MAX;
	}

#ifdef CONFIG_SMP
	off = ALIGN(off, SMP_CACHE_BYTES);
	if (off == 0)
		return SIZE_MAX;
#endif

	if (ctx->flags & IORING_SETUP_NO_SQARRAY) {
		if (sq_offset)
			*sq_offset = SIZE_MAX;
		return off;
	}

	if (sq_offset)
		*sq_offset = off;

	sq_array_size = array_size(sizeof(u32), sq_entries);
	if (sq_array_size == SIZE_MAX)
		return SIZE_MAX;

	if (check_add_overflow(off, sq_array_size, &off))
		return SIZE_MAX;

	return off;
}

static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					  lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);
		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}

static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					  lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
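		/*
		 * Descriptive note: hand off the final reference. If no
		 * eventfd signal op has already queued its RCU callback,
		 * queue one here to free the registration after a grace
		 * period; otherwise the pending callback observes the FREE
		 * bit and drops it.
		 */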
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}

static void io_req_caches_free(struct io_ring_ctx *ctx)
{
	struct io_kiocb *req;
	int nr = 0;

	mutex_lock(&ctx->uring_lock);
	io_flush_cached_locked_reqs(ctx, &ctx->submit_state);

	while (!io_req_cache_empty(ctx)) {
		req = io_extract_req(ctx);
		kmem_cache_free(req_cachep, req);
		nr++;
	}
	if (nr)
		percpu_ref_put_many(&ctx->refs, nr);
	mutex_unlock(&ctx->uring_lock);
}

static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
{
	kfree(container_of(entry, struct io_rsrc_node, cache));
}

static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
	io_sq_thread_finish(ctx);
	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
		return;

	mutex_lock(&ctx->uring_lock);
	if (ctx->buf_data)
		__io_sqe_buffers_unregister(ctx);
	if (ctx->file_data)
		__io_sqe_files_unregister(ctx);
	io_cqring_overflow_kill(ctx);
	io_eventfd_unregister(ctx);
	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
	io_destroy_buffers(ctx);
	mutex_unlock(&ctx->uring_lock);
	if (ctx->sq_creds)
		put_cred(ctx->sq_creds);
	if (ctx->submitter_task)
		put_task_struct(ctx->submitter_task);

	/* there are no registered resources left, nobody uses it */
	if (ctx->rsrc_node)
		io_rsrc_node_destroy(ctx, ctx->rsrc_node);

	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));

	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
	if (ctx->mm_account) {
		mmdrop(ctx->mm_account);
		ctx->mm_account = NULL;
	}

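	/*
	 * Descriptive note: ring and SQE memory may be kernel-allocated or
	 * user-provided (IORING_SETUP_NO_MMAP); io_rings_free() handles
	 * both cases.
	 */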
	io_rings_free(ctx);
	io_kbuf_mmap_list_free(ctx);

	percpu_ref_exit(&ctx->refs);
	free_uid(ctx->user);
	io_req_caches_free(ctx);
	if (ctx->hash_map)
		io_wq_put_hash(ctx->hash_map);
	kfree(ctx->cancel_table.hbs);
	kfree(ctx->cancel_table_locked.hbs);
	kfree(ctx->io_bl);
	xa_destroy(&ctx->io_bl_xa);
	kfree(ctx);
}

static __cold void io_activate_pollwq_cb(struct callback_head *cb)
{
	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
					       poll_wq_task_work);

	mutex_lock(&ctx->uring_lock);
	ctx->poll_activated = true;
	mutex_unlock(&ctx->uring_lock);

	/*
	 * Wake ups for some events between start of polling and activation
	 * might've been lost due to loose synchronisation.
	 */
	wake_up_all(&ctx->poll_wq);
	percpu_ref_put(&ctx->refs);
}

static __cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{
	spin_lock(&ctx->completion_lock);
	/* already activated or in progress */
	if (ctx->poll_activated || ctx->poll_wq_task_work.func)
		goto out;
	if (WARN_ON_ONCE(!ctx->task_complete))
		goto out;
	if (!ctx->submitter_task)
		goto out;
	/*
	 * with ->submitter_task only the submitter task completes requests, we
	 * only need to sync with it, which is done by injecting a tw
	 */
	init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb);
	percpu_ref_get(&ctx->refs);
	if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL))
		percpu_ref_put(&ctx->refs);
out:
	spin_unlock(&ctx->completion_lock);
}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{
	struct io_ring_ctx *ctx = file->private_data;
	__poll_t mask = 0;

	if (unlikely(!ctx->poll_activated))
		io_activate_pollwq(ctx);

	poll_wait(file, &ctx->poll_wq, wait);
	/*
	 * synchronizes with barrier from wq_has_sleeper call in
	 * io_commit_cqring
	 */
	smp_rmb();
	if (!io_sqring_full(ctx))
		mask |= EPOLLOUT | EPOLLWRNORM;

	/*
	 * Don't flush cqring overflow list here, just do a simple check.
	 * Otherwise there could possibly be an ABBA deadlock:
	 *      CPU0                    CPU1
	 *      ----                    ----
	 * lock(&ctx->uring_lock);
	 *                              lock(&ep->mtx);
	 *                              lock(&ctx->uring_lock);
	 * lock(&ep->mtx);
	 *
	 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this
	 * pushes them to do the flush.
	 */

	if (__io_cqring_events_user(ctx) || io_has_work(ctx))
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}

static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

struct io_tctx_exit {
	struct callback_head task_work;
	struct completion completion;
	struct io_ring_ctx *ctx;
};

static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work;

	work = container_of(cb, struct io_tctx_exit, task_work);
	/*
	 * When @in_cancel, we're in cancellation and it's racy to remove the
	 * node. It'll be removed by the end of cancellation, just ignore it.
	 * tctx can be NULL if the queueing of this task_work raced with
	 * work cancellation off the exec path.
	 */
	if (tctx && !atomic_read(&tctx->in_cancel))
		io_uring_del_tctx_node((unsigned long)work->ctx);
	complete(&work->completion);
}

static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);

	return req->ctx == data;
}

static __cold void io_ring_exit_work(struct work_struct *work)
{
	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work);
	unsigned long timeout = jiffies + HZ * 60 * 5;
	unsigned long interval = HZ / 20;
	struct io_tctx_exit exit;
	struct io_tctx_node *node;
	int ret;

	/*
	 * If we're doing polled IO and end up having requests being
	 * submitted async (out-of-line), then completions can come in while
	 * we're waiting for refs to drop. We need to reap these manually,
	 * as nobody else will be looking for them.
	 */
	do {
		if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) {
			mutex_lock(&ctx->uring_lock);
			io_cqring_overflow_kill(ctx);
			mutex_unlock(&ctx->uring_lock);
		}

		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
			io_move_task_work_from_local(ctx);

		while (io_uring_try_cancel_requests(ctx, NULL, true))
			cond_resched();

		if (ctx->sq_data) {
			struct io_sq_data *sqd = ctx->sq_data;
			struct task_struct *tsk;

			io_sq_thread_park(sqd);
			tsk = sqd->thread;
			if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
				io_wq_cancel_cb(tsk->io_uring->io_wq,
						io_cancel_ctx_cb, ctx, true);
			io_sq_thread_unpark(sqd);
		}

		io_req_caches_free(ctx);

		if (WARN_ON_ONCE(time_after(jiffies, timeout))) {
			/* there is little hope left, don't run it too often */
			interval = HZ * 60;
		}
		/*
		 * This is really an uninterruptible wait, as it has to be
		 * complete. But it's also run from a kworker, which doesn't
		 * take signals, so it's fine to make it interruptible.
		 * This avoids scenarios where we knowingly can wait much longer
		 * on completions, for example if someone does a SIGSTOP on
		 * a task that needs to finish task_work to make this loop
		 * complete. That's a synthetic situation that should not
		 * cause a stuck task backtrace, and hence a potential panic
		 * on stuck tasks if that is enabled.
		 */
	} while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval));

	init_completion(&exit.completion);
	init_task_work(&exit.task_work, io_tctx_exit_cb);
	exit.ctx = ctx;

	mutex_lock(&ctx->uring_lock);
	while (!list_empty(&ctx->tctx_list)) {
		WARN_ON_ONCE(time_after(jiffies, timeout));

		node = list_first_entry(&ctx->tctx_list, struct io_tctx_node,
					ctx_node);
		/* don't spin on a single task if cancellation failed */
		list_rotate_left(&ctx->tctx_list);
		ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL);
		if (WARN_ON_ONCE(ret))
			continue;

		mutex_unlock(&ctx->uring_lock);
		/*
		 * See comment above for
		 * wait_for_completion_interruptible_timeout() on why this
		 * wait is marked as interruptible.
		 */
		wait_for_completion_interruptible(&exit.completion);
		mutex_lock(&ctx->uring_lock);
	}
	mutex_unlock(&ctx->uring_lock);
	spin_lock(&ctx->completion_lock);
	spin_unlock(&ctx->completion_lock);

	/* pairs with RCU read section in io_req_local_work_add() */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
		synchronize_rcu();

	io_ring_ctx_free(ctx);
}

static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
	unsigned long index;
	struct creds *creds;

	mutex_lock(&ctx->uring_lock);
	percpu_ref_kill(&ctx->refs);
	xa_for_each(&ctx->personalities, index, creds)
		io_unregister_personality(ctx, index);
	if (ctx->rings)
		io_poll_remove_all(ctx, NULL, true);
	mutex_unlock(&ctx->uring_lock);

	/*
	 * If we failed setting up the ctx, we might not have any rings
	 * and therefore did not submit any requests
	 */
	if (ctx->rings)
		io_kill_timeouts(ctx, NULL, true);

	flush_delayed_work(&ctx->fallback_work);

	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
	/*
	 * Use system_unbound_wq to avoid spawning tons of event kworkers
	 * if we're exiting a ton of rings at the same time. It just adds
	 * noise and overhead, there's no discernible change in runtime
	 * over using system_wq.
	 */
	queue_work(system_unbound_wq, &ctx->exit_work);
}

static int io_uring_release(struct inode *inode, struct file *file)
{
	struct io_ring_ctx *ctx = file->private_data;

	file->private_data = NULL;
	io_ring_ctx_wait_and_kill(ctx);
	return 0;
}

struct io_task_cancel {
	struct task_struct *task;
	bool all;
};

static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{
	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
	struct io_task_cancel *cancel = data;

	return io_match_task_safe(req, cancel->task, cancel->all);
}

static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all)
{
	struct io_defer_entry *de;
	LIST_HEAD(list);

	spin_lock(&ctx->completion_lock);
	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
		if (io_match_task_safe(de->req, task, cancel_all)) {
			list_cut_position(&list, &ctx->defer_list, &de->list);
			break;
		}
	}
	spin_unlock(&ctx->completion_lock);
	if (list_empty(&list))
		return false;

	while (!list_empty(&list)) {
		de = list_first_entry(&list, struct io_defer_entry, list);
		list_del_init(&de->list);
		io_req_task_queue_fail(de->req, -ECANCELED);
		kfree(de);
	}
	return true;
}

static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{
	struct io_tctx_node *node;
	enum io_wq_cancel cret;
	bool ret = false;

	mutex_lock(&ctx->uring_lock);
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		/*
		 * io_wq will stay alive while we hold uring_lock, because it's
		 * killed after ctx nodes, which requires taking the lock.
		 */
		if (!tctx || !tctx->io_wq)
			continue;
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}
	mutex_unlock(&ctx->uring_lock);

	return ret;
}

static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
{
	struct io_task_cancel cancel = { .task = task, .all = cancel_all, };
	struct io_uring_task *tctx = task ? task->io_uring : NULL;
	enum io_wq_cancel cret;
	bool ret = false;

	/* set it so io_req_local_work_add() would wake us up */
	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
		atomic_set(&ctx->cq_wait_nr, 1);
		smp_mb();
	}

	/* failed during ring init, it couldn't have issued any requests */
	if (!ctx->rings)
		return false;

	if (!task) {
		ret |= io_uring_try_cancel_iowq(ctx);
	} else if (tctx && tctx->io_wq) {
		/*
		 * Cancels requests of all rings, not only @ctx, but
		 * it's fine as the task is in exit/exec.
		 */
		cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb,
				       &cancel, true);
		ret |= (cret != IO_WQ_CANCEL_NOTFOUND);
	}

	/* SQPOLL thread does its own polling */
	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
	    (ctx->sq_data && ctx->sq_data->thread == current)) {
		while (!wq_list_empty(&ctx->iopoll_list)) {
			io_iopoll_try_reap_events(ctx);
			ret = true;
			cond_resched();
		}
	}

	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
	    io_allowed_defer_tw_run(ctx))
		ret |= io_run_local_work(ctx, INT_MAX) > 0;
	ret |= io_cancel_defer_files(ctx, task, cancel_all);
	mutex_lock(&ctx->uring_lock);
	ret |= io_poll_remove_all(ctx, task, cancel_all);
	mutex_unlock(&ctx->uring_lock);
	ret |= io_kill_timeouts(ctx, task, cancel_all);
	if (task)
		ret |= io_run_task_work() > 0;
	return ret;
}

static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{
	if (tracked)
		return atomic_read(&tctx->inflight_tracked);
	return percpu_counter_sum(&tctx->inflight);
}

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
332462306a36Sopenharmony_ci */ 332562306a36Sopenharmony_ci__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 332662306a36Sopenharmony_ci{ 332762306a36Sopenharmony_ci struct io_uring_task *tctx = current->io_uring; 332862306a36Sopenharmony_ci struct io_ring_ctx *ctx; 332962306a36Sopenharmony_ci struct io_tctx_node *node; 333062306a36Sopenharmony_ci unsigned long index; 333162306a36Sopenharmony_ci s64 inflight; 333262306a36Sopenharmony_ci DEFINE_WAIT(wait); 333362306a36Sopenharmony_ci 333462306a36Sopenharmony_ci WARN_ON_ONCE(sqd && sqd->thread != current); 333562306a36Sopenharmony_ci 333662306a36Sopenharmony_ci if (!current->io_uring) 333762306a36Sopenharmony_ci return; 333862306a36Sopenharmony_ci if (tctx->io_wq) 333962306a36Sopenharmony_ci io_wq_exit_start(tctx->io_wq); 334062306a36Sopenharmony_ci 334162306a36Sopenharmony_ci atomic_inc(&tctx->in_cancel); 334262306a36Sopenharmony_ci do { 334362306a36Sopenharmony_ci bool loop = false; 334462306a36Sopenharmony_ci 334562306a36Sopenharmony_ci io_uring_drop_tctx_refs(current); 334662306a36Sopenharmony_ci /* read completions before cancelations */ 334762306a36Sopenharmony_ci inflight = tctx_inflight(tctx, !cancel_all); 334862306a36Sopenharmony_ci if (!inflight) 334962306a36Sopenharmony_ci break; 335062306a36Sopenharmony_ci 335162306a36Sopenharmony_ci if (!sqd) { 335262306a36Sopenharmony_ci xa_for_each(&tctx->xa, index, node) { 335362306a36Sopenharmony_ci /* sqpoll task will cancel all its requests */ 335462306a36Sopenharmony_ci if (node->ctx->sq_data) 335562306a36Sopenharmony_ci continue; 335662306a36Sopenharmony_ci loop |= io_uring_try_cancel_requests(node->ctx, 335762306a36Sopenharmony_ci current, cancel_all); 335862306a36Sopenharmony_ci } 335962306a36Sopenharmony_ci } else { 336062306a36Sopenharmony_ci list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 336162306a36Sopenharmony_ci loop |= io_uring_try_cancel_requests(ctx, 336262306a36Sopenharmony_ci current, 336362306a36Sopenharmony_ci cancel_all); 336462306a36Sopenharmony_ci } 336562306a36Sopenharmony_ci 336662306a36Sopenharmony_ci if (loop) { 336762306a36Sopenharmony_ci cond_resched(); 336862306a36Sopenharmony_ci continue; 336962306a36Sopenharmony_ci } 337062306a36Sopenharmony_ci 337162306a36Sopenharmony_ci prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 337262306a36Sopenharmony_ci io_run_task_work(); 337362306a36Sopenharmony_ci io_uring_drop_tctx_refs(current); 337462306a36Sopenharmony_ci xa_for_each(&tctx->xa, index, node) { 337562306a36Sopenharmony_ci if (!llist_empty(&node->ctx->work_llist)) { 337662306a36Sopenharmony_ci WARN_ON_ONCE(node->ctx->submitter_task && 337762306a36Sopenharmony_ci node->ctx->submitter_task != current); 337862306a36Sopenharmony_ci goto end_wait; 337962306a36Sopenharmony_ci } 338062306a36Sopenharmony_ci } 338162306a36Sopenharmony_ci /* 338262306a36Sopenharmony_ci * If we've seen completions, retry without waiting. This 338362306a36Sopenharmony_ci * avoids a race where a completion comes in before we did 338462306a36Sopenharmony_ci * prepare_to_wait(). 
338562306a36Sopenharmony_ci */ 338662306a36Sopenharmony_ci if (inflight == tctx_inflight(tctx, !cancel_all)) 338762306a36Sopenharmony_ci schedule(); 338862306a36Sopenharmony_ciend_wait: 338962306a36Sopenharmony_ci finish_wait(&tctx->wait, &wait); 339062306a36Sopenharmony_ci } while (1); 339162306a36Sopenharmony_ci 339262306a36Sopenharmony_ci io_uring_clean_tctx(tctx); 339362306a36Sopenharmony_ci if (cancel_all) { 339462306a36Sopenharmony_ci /* 339562306a36Sopenharmony_ci * We shouldn't run task_works after cancel, so just leave 339662306a36Sopenharmony_ci * ->in_cancel set for normal exit. 339762306a36Sopenharmony_ci */ 339862306a36Sopenharmony_ci atomic_dec(&tctx->in_cancel); 339962306a36Sopenharmony_ci /* for exec all current's requests should be gone, kill tctx */ 340062306a36Sopenharmony_ci __io_uring_free(current); 340162306a36Sopenharmony_ci } 340262306a36Sopenharmony_ci} 340362306a36Sopenharmony_ci 340462306a36Sopenharmony_civoid __io_uring_cancel(bool cancel_all) 340562306a36Sopenharmony_ci{ 340662306a36Sopenharmony_ci io_uring_cancel_generic(cancel_all, NULL); 340762306a36Sopenharmony_ci} 340862306a36Sopenharmony_ci 340962306a36Sopenharmony_cistatic void *io_uring_validate_mmap_request(struct file *file, 341062306a36Sopenharmony_ci loff_t pgoff, size_t sz) 341162306a36Sopenharmony_ci{ 341262306a36Sopenharmony_ci struct io_ring_ctx *ctx = file->private_data; 341362306a36Sopenharmony_ci loff_t offset = pgoff << PAGE_SHIFT; 341462306a36Sopenharmony_ci struct page *page; 341562306a36Sopenharmony_ci void *ptr; 341662306a36Sopenharmony_ci 341762306a36Sopenharmony_ci switch (offset & IORING_OFF_MMAP_MASK) { 341862306a36Sopenharmony_ci case IORING_OFF_SQ_RING: 341962306a36Sopenharmony_ci case IORING_OFF_CQ_RING: 342062306a36Sopenharmony_ci /* Don't allow mmap if the ring was setup without it */ 342162306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_NO_MMAP) 342262306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 342362306a36Sopenharmony_ci ptr = ctx->rings; 342462306a36Sopenharmony_ci break; 342562306a36Sopenharmony_ci case IORING_OFF_SQES: 342662306a36Sopenharmony_ci /* Don't allow mmap if the ring was setup without it */ 342762306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_NO_MMAP) 342862306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 342962306a36Sopenharmony_ci ptr = ctx->sq_sqes; 343062306a36Sopenharmony_ci break; 343162306a36Sopenharmony_ci case IORING_OFF_PBUF_RING: { 343262306a36Sopenharmony_ci unsigned int bgid; 343362306a36Sopenharmony_ci 343462306a36Sopenharmony_ci bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 343562306a36Sopenharmony_ci rcu_read_lock(); 343662306a36Sopenharmony_ci ptr = io_pbuf_get_address(ctx, bgid); 343762306a36Sopenharmony_ci rcu_read_unlock(); 343862306a36Sopenharmony_ci if (!ptr) 343962306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 344062306a36Sopenharmony_ci break; 344162306a36Sopenharmony_ci } 344262306a36Sopenharmony_ci default: 344362306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 344462306a36Sopenharmony_ci } 344562306a36Sopenharmony_ci 344662306a36Sopenharmony_ci page = virt_to_head_page(ptr); 344762306a36Sopenharmony_ci if (sz > page_size(page)) 344862306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 344962306a36Sopenharmony_ci 345062306a36Sopenharmony_ci return ptr; 345162306a36Sopenharmony_ci} 345262306a36Sopenharmony_ci 345362306a36Sopenharmony_ci#ifdef CONFIG_MMU 345462306a36Sopenharmony_ci 345562306a36Sopenharmony_cistatic __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 345662306a36Sopenharmony_ci{ 
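	/*
	 * MMU mmap() path: resolve which ring region the file offset refers
	 * to, then remap the backing kernel memory into the caller's VMA.
	 */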
345762306a36Sopenharmony_ci size_t sz = vma->vm_end - vma->vm_start; 345862306a36Sopenharmony_ci unsigned long pfn; 345962306a36Sopenharmony_ci void *ptr; 346062306a36Sopenharmony_ci 346162306a36Sopenharmony_ci ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 346262306a36Sopenharmony_ci if (IS_ERR(ptr)) 346362306a36Sopenharmony_ci return PTR_ERR(ptr); 346462306a36Sopenharmony_ci 346562306a36Sopenharmony_ci pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 346662306a36Sopenharmony_ci return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 346762306a36Sopenharmony_ci} 346862306a36Sopenharmony_ci 346962306a36Sopenharmony_cistatic unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, 347062306a36Sopenharmony_ci unsigned long addr, unsigned long len, 347162306a36Sopenharmony_ci unsigned long pgoff, unsigned long flags) 347262306a36Sopenharmony_ci{ 347362306a36Sopenharmony_ci void *ptr; 347462306a36Sopenharmony_ci 347562306a36Sopenharmony_ci /* 347662306a36Sopenharmony_ci * Do not allow to map to user-provided address to avoid breaking the 347762306a36Sopenharmony_ci * aliasing rules. Userspace is not able to guess the offset address of 347862306a36Sopenharmony_ci * kernel kmalloc()ed memory area. 347962306a36Sopenharmony_ci */ 348062306a36Sopenharmony_ci if (addr) 348162306a36Sopenharmony_ci return -EINVAL; 348262306a36Sopenharmony_ci 348362306a36Sopenharmony_ci ptr = io_uring_validate_mmap_request(filp, pgoff, len); 348462306a36Sopenharmony_ci if (IS_ERR(ptr)) 348562306a36Sopenharmony_ci return -ENOMEM; 348662306a36Sopenharmony_ci 348762306a36Sopenharmony_ci /* 348862306a36Sopenharmony_ci * Some architectures have strong cache aliasing requirements. 348962306a36Sopenharmony_ci * For such architectures we need a coherent mapping which aliases 349062306a36Sopenharmony_ci * kernel memory *and* userspace memory. To achieve that: 349162306a36Sopenharmony_ci * - use a NULL file pointer to reference physical memory, and 349262306a36Sopenharmony_ci * - use the kernel virtual address of the shared io_uring context 349362306a36Sopenharmony_ci * (instead of the userspace-provided address, which has to be 0UL 349462306a36Sopenharmony_ci * anyway). 349562306a36Sopenharmony_ci * - use the same pgoff which the get_unmapped_area() uses to 349662306a36Sopenharmony_ci * calculate the page colouring. 349762306a36Sopenharmony_ci * For architectures without such aliasing requirements, the 349862306a36Sopenharmony_ci * architecture will return any suitable mapping because addr is 0. 349962306a36Sopenharmony_ci */ 350062306a36Sopenharmony_ci filp = NULL; 350162306a36Sopenharmony_ci flags |= MAP_SHARED; 350262306a36Sopenharmony_ci pgoff = 0; /* has been translated to ptr above */ 350362306a36Sopenharmony_ci#ifdef SHM_COLOUR 350462306a36Sopenharmony_ci addr = (uintptr_t) ptr; 350562306a36Sopenharmony_ci pgoff = addr >> PAGE_SHIFT; 350662306a36Sopenharmony_ci#else 350762306a36Sopenharmony_ci addr = 0UL; 350862306a36Sopenharmony_ci#endif 350962306a36Sopenharmony_ci return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); 351062306a36Sopenharmony_ci} 351162306a36Sopenharmony_ci 351262306a36Sopenharmony_ci#else /* !CONFIG_MMU */ 351362306a36Sopenharmony_ci 351462306a36Sopenharmony_cistatic int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 351562306a36Sopenharmony_ci{ 351662306a36Sopenharmony_ci return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; 351762306a36Sopenharmony_ci} 351862306a36Sopenharmony_ci 351962306a36Sopenharmony_cistatic unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 352062306a36Sopenharmony_ci{ 352162306a36Sopenharmony_ci return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 352262306a36Sopenharmony_ci} 352362306a36Sopenharmony_ci 352462306a36Sopenharmony_cistatic unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 352562306a36Sopenharmony_ci unsigned long addr, unsigned long len, 352662306a36Sopenharmony_ci unsigned long pgoff, unsigned long flags) 352762306a36Sopenharmony_ci{ 352862306a36Sopenharmony_ci void *ptr; 352962306a36Sopenharmony_ci 353062306a36Sopenharmony_ci ptr = io_uring_validate_mmap_request(file, pgoff, len); 353162306a36Sopenharmony_ci if (IS_ERR(ptr)) 353262306a36Sopenharmony_ci return PTR_ERR(ptr); 353362306a36Sopenharmony_ci 353462306a36Sopenharmony_ci return (unsigned long) ptr; 353562306a36Sopenharmony_ci} 353662306a36Sopenharmony_ci 353762306a36Sopenharmony_ci#endif /* !CONFIG_MMU */ 353862306a36Sopenharmony_ci 353962306a36Sopenharmony_cistatic int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) 354062306a36Sopenharmony_ci{ 354162306a36Sopenharmony_ci if (flags & IORING_ENTER_EXT_ARG) { 354262306a36Sopenharmony_ci struct io_uring_getevents_arg arg; 354362306a36Sopenharmony_ci 354462306a36Sopenharmony_ci if (argsz != sizeof(arg)) 354562306a36Sopenharmony_ci return -EINVAL; 354662306a36Sopenharmony_ci if (copy_from_user(&arg, argp, sizeof(arg))) 354762306a36Sopenharmony_ci return -EFAULT; 354862306a36Sopenharmony_ci } 354962306a36Sopenharmony_ci return 0; 355062306a36Sopenharmony_ci} 355162306a36Sopenharmony_ci 355262306a36Sopenharmony_cistatic int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 355362306a36Sopenharmony_ci struct __kernel_timespec __user **ts, 355462306a36Sopenharmony_ci const sigset_t __user **sig) 355562306a36Sopenharmony_ci{ 355662306a36Sopenharmony_ci struct io_uring_getevents_arg arg; 355762306a36Sopenharmony_ci 355862306a36Sopenharmony_ci /* 355962306a36Sopenharmony_ci * If EXT_ARG isn't set, then we have no timespec and the argp pointer 356062306a36Sopenharmony_ci * is just a pointer to the sigset_t. 356162306a36Sopenharmony_ci */ 356262306a36Sopenharmony_ci if (!(flags & IORING_ENTER_EXT_ARG)) { 356362306a36Sopenharmony_ci *sig = (const sigset_t __user *) argp; 356462306a36Sopenharmony_ci *ts = NULL; 356562306a36Sopenharmony_ci return 0; 356662306a36Sopenharmony_ci } 356762306a36Sopenharmony_ci 356862306a36Sopenharmony_ci /* 356962306a36Sopenharmony_ci * EXT_ARG is set - ensure we agree on the size of it and copy in our 357062306a36Sopenharmony_ci * timespec and sigset_t pointers if good. 
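	 *
	 * For reference, a caller passing a timeout would typically fill the
	 * argument along these lines (illustrative sketch only, raw syscall
	 * interface, locals such as ring_fd, mask and ts assumed to exist,
	 * error handling omitted):
	 *
	 *	struct io_uring_getevents_arg arg = {
	 *		.sigmask	= (__u64)(uintptr_t)&mask,
	 *		.sigmask_sz	= _NSIG / 8,
	 *		.ts		= (__u64)(uintptr_t)&ts,
	 *	};
	 *
	 *	syscall(__NR_io_uring_enter, ring_fd, 0, 1,
	 *		IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
	 *		&arg, sizeof(arg));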
357162306a36Sopenharmony_ci */ 357262306a36Sopenharmony_ci if (*argsz != sizeof(arg)) 357362306a36Sopenharmony_ci return -EINVAL; 357462306a36Sopenharmony_ci if (copy_from_user(&arg, argp, sizeof(arg))) 357562306a36Sopenharmony_ci return -EFAULT; 357662306a36Sopenharmony_ci if (arg.pad) 357762306a36Sopenharmony_ci return -EINVAL; 357862306a36Sopenharmony_ci *sig = u64_to_user_ptr(arg.sigmask); 357962306a36Sopenharmony_ci *argsz = arg.sigmask_sz; 358062306a36Sopenharmony_ci *ts = u64_to_user_ptr(arg.ts); 358162306a36Sopenharmony_ci return 0; 358262306a36Sopenharmony_ci} 358362306a36Sopenharmony_ci 358462306a36Sopenharmony_ciSYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 358562306a36Sopenharmony_ci u32, min_complete, u32, flags, const void __user *, argp, 358662306a36Sopenharmony_ci size_t, argsz) 358762306a36Sopenharmony_ci{ 358862306a36Sopenharmony_ci struct io_ring_ctx *ctx; 358962306a36Sopenharmony_ci struct file *file; 359062306a36Sopenharmony_ci long ret; 359162306a36Sopenharmony_ci 359262306a36Sopenharmony_ci if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 359362306a36Sopenharmony_ci IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 359462306a36Sopenharmony_ci IORING_ENTER_REGISTERED_RING))) 359562306a36Sopenharmony_ci return -EINVAL; 359662306a36Sopenharmony_ci 359762306a36Sopenharmony_ci /* 359862306a36Sopenharmony_ci * Ring fd has been registered via IORING_REGISTER_RING_FDS, we 359962306a36Sopenharmony_ci * need only dereference our task private array to find it. 360062306a36Sopenharmony_ci */ 360162306a36Sopenharmony_ci if (flags & IORING_ENTER_REGISTERED_RING) { 360262306a36Sopenharmony_ci struct io_uring_task *tctx = current->io_uring; 360362306a36Sopenharmony_ci 360462306a36Sopenharmony_ci if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) 360562306a36Sopenharmony_ci return -EINVAL; 360662306a36Sopenharmony_ci fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); 360762306a36Sopenharmony_ci file = tctx->registered_rings[fd]; 360862306a36Sopenharmony_ci if (unlikely(!file)) 360962306a36Sopenharmony_ci return -EBADF; 361062306a36Sopenharmony_ci } else { 361162306a36Sopenharmony_ci file = fget(fd); 361262306a36Sopenharmony_ci if (unlikely(!file)) 361362306a36Sopenharmony_ci return -EBADF; 361462306a36Sopenharmony_ci ret = -EOPNOTSUPP; 361562306a36Sopenharmony_ci if (unlikely(!io_is_uring_fops(file))) 361662306a36Sopenharmony_ci goto out; 361762306a36Sopenharmony_ci } 361862306a36Sopenharmony_ci 361962306a36Sopenharmony_ci ctx = file->private_data; 362062306a36Sopenharmony_ci ret = -EBADFD; 362162306a36Sopenharmony_ci if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 362262306a36Sopenharmony_ci goto out; 362362306a36Sopenharmony_ci 362462306a36Sopenharmony_ci /* 362562306a36Sopenharmony_ci * For SQ polling, the thread will do all submissions and completions. 362662306a36Sopenharmony_ci * Just return the requested submit count, and wake the thread if 362762306a36Sopenharmony_ci * we were asked to. 
362862306a36Sopenharmony_ci */ 362962306a36Sopenharmony_ci ret = 0; 363062306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_SQPOLL) { 363162306a36Sopenharmony_ci io_cqring_overflow_flush(ctx); 363262306a36Sopenharmony_ci 363362306a36Sopenharmony_ci if (unlikely(ctx->sq_data->thread == NULL)) { 363462306a36Sopenharmony_ci ret = -EOWNERDEAD; 363562306a36Sopenharmony_ci goto out; 363662306a36Sopenharmony_ci } 363762306a36Sopenharmony_ci if (flags & IORING_ENTER_SQ_WAKEUP) 363862306a36Sopenharmony_ci wake_up(&ctx->sq_data->wait); 363962306a36Sopenharmony_ci if (flags & IORING_ENTER_SQ_WAIT) 364062306a36Sopenharmony_ci io_sqpoll_wait_sq(ctx); 364162306a36Sopenharmony_ci 364262306a36Sopenharmony_ci ret = to_submit; 364362306a36Sopenharmony_ci } else if (to_submit) { 364462306a36Sopenharmony_ci ret = io_uring_add_tctx_node(ctx); 364562306a36Sopenharmony_ci if (unlikely(ret)) 364662306a36Sopenharmony_ci goto out; 364762306a36Sopenharmony_ci 364862306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 364962306a36Sopenharmony_ci ret = io_submit_sqes(ctx, to_submit); 365062306a36Sopenharmony_ci if (ret != to_submit) { 365162306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 365262306a36Sopenharmony_ci goto out; 365362306a36Sopenharmony_ci } 365462306a36Sopenharmony_ci if (flags & IORING_ENTER_GETEVENTS) { 365562306a36Sopenharmony_ci if (ctx->syscall_iopoll) 365662306a36Sopenharmony_ci goto iopoll_locked; 365762306a36Sopenharmony_ci /* 365862306a36Sopenharmony_ci * Ignore errors, we'll soon call io_cqring_wait() and 365962306a36Sopenharmony_ci * it should handle ownership problems if any. 366062306a36Sopenharmony_ci */ 366162306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) 366262306a36Sopenharmony_ci (void)io_run_local_work_locked(ctx, min_complete); 366362306a36Sopenharmony_ci } 366462306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 366562306a36Sopenharmony_ci } 366662306a36Sopenharmony_ci 366762306a36Sopenharmony_ci if (flags & IORING_ENTER_GETEVENTS) { 366862306a36Sopenharmony_ci int ret2; 366962306a36Sopenharmony_ci 367062306a36Sopenharmony_ci if (ctx->syscall_iopoll) { 367162306a36Sopenharmony_ci /* 367262306a36Sopenharmony_ci * We disallow the app entering submit/complete with 367362306a36Sopenharmony_ci * polling, but we still need to lock the ring to 367462306a36Sopenharmony_ci * prevent racing with polled issue that got punted to 367562306a36Sopenharmony_ci * a workqueue. 
367662306a36Sopenharmony_ci */ 367762306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 367862306a36Sopenharmony_ciiopoll_locked: 367962306a36Sopenharmony_ci ret2 = io_validate_ext_arg(flags, argp, argsz); 368062306a36Sopenharmony_ci if (likely(!ret2)) { 368162306a36Sopenharmony_ci min_complete = min(min_complete, 368262306a36Sopenharmony_ci ctx->cq_entries); 368362306a36Sopenharmony_ci ret2 = io_iopoll_check(ctx, min_complete); 368462306a36Sopenharmony_ci } 368562306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 368662306a36Sopenharmony_ci } else { 368762306a36Sopenharmony_ci const sigset_t __user *sig; 368862306a36Sopenharmony_ci struct __kernel_timespec __user *ts; 368962306a36Sopenharmony_ci 369062306a36Sopenharmony_ci ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 369162306a36Sopenharmony_ci if (likely(!ret2)) { 369262306a36Sopenharmony_ci min_complete = min(min_complete, 369362306a36Sopenharmony_ci ctx->cq_entries); 369462306a36Sopenharmony_ci ret2 = io_cqring_wait(ctx, min_complete, sig, 369562306a36Sopenharmony_ci argsz, ts); 369662306a36Sopenharmony_ci } 369762306a36Sopenharmony_ci } 369862306a36Sopenharmony_ci 369962306a36Sopenharmony_ci if (!ret) { 370062306a36Sopenharmony_ci ret = ret2; 370162306a36Sopenharmony_ci 370262306a36Sopenharmony_ci /* 370362306a36Sopenharmony_ci * EBADR indicates that one or more CQE were dropped. 370462306a36Sopenharmony_ci * Once the user has been informed we can clear the bit 370562306a36Sopenharmony_ci * as they are obviously ok with those drops. 370662306a36Sopenharmony_ci */ 370762306a36Sopenharmony_ci if (unlikely(ret2 == -EBADR)) 370862306a36Sopenharmony_ci clear_bit(IO_CHECK_CQ_DROPPED_BIT, 370962306a36Sopenharmony_ci &ctx->check_cq); 371062306a36Sopenharmony_ci } 371162306a36Sopenharmony_ci } 371262306a36Sopenharmony_ciout: 371362306a36Sopenharmony_ci if (!(flags & IORING_ENTER_REGISTERED_RING)) 371462306a36Sopenharmony_ci fput(file); 371562306a36Sopenharmony_ci return ret; 371662306a36Sopenharmony_ci} 371762306a36Sopenharmony_ci 371862306a36Sopenharmony_cistatic const struct file_operations io_uring_fops = { 371962306a36Sopenharmony_ci .release = io_uring_release, 372062306a36Sopenharmony_ci .mmap = io_uring_mmap, 372162306a36Sopenharmony_ci#ifndef CONFIG_MMU 372262306a36Sopenharmony_ci .get_unmapped_area = io_uring_nommu_get_unmapped_area, 372362306a36Sopenharmony_ci .mmap_capabilities = io_uring_nommu_mmap_capabilities, 372462306a36Sopenharmony_ci#else 372562306a36Sopenharmony_ci .get_unmapped_area = io_uring_mmu_get_unmapped_area, 372662306a36Sopenharmony_ci#endif 372762306a36Sopenharmony_ci .poll = io_uring_poll, 372862306a36Sopenharmony_ci#ifdef CONFIG_PROC_FS 372962306a36Sopenharmony_ci .show_fdinfo = io_uring_show_fdinfo, 373062306a36Sopenharmony_ci#endif 373162306a36Sopenharmony_ci}; 373262306a36Sopenharmony_ci 373362306a36Sopenharmony_cibool io_is_uring_fops(struct file *file) 373462306a36Sopenharmony_ci{ 373562306a36Sopenharmony_ci return file->f_op == &io_uring_fops; 373662306a36Sopenharmony_ci} 373762306a36Sopenharmony_ci 373862306a36Sopenharmony_cistatic __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, 373962306a36Sopenharmony_ci struct io_uring_params *p) 374062306a36Sopenharmony_ci{ 374162306a36Sopenharmony_ci struct io_rings *rings; 374262306a36Sopenharmony_ci size_t size, sq_array_offset; 374362306a36Sopenharmony_ci void *ptr; 374462306a36Sopenharmony_ci 374562306a36Sopenharmony_ci /* make sure these are sane, as we already accounted them */ 374662306a36Sopenharmony_ci ctx->sq_entries = p->sq_entries; 
374762306a36Sopenharmony_ci ctx->cq_entries = p->cq_entries; 374862306a36Sopenharmony_ci 374962306a36Sopenharmony_ci size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); 375062306a36Sopenharmony_ci if (size == SIZE_MAX) 375162306a36Sopenharmony_ci return -EOVERFLOW; 375262306a36Sopenharmony_ci 375362306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 375462306a36Sopenharmony_ci rings = io_mem_alloc(size); 375562306a36Sopenharmony_ci else 375662306a36Sopenharmony_ci rings = io_rings_map(ctx, p->cq_off.user_addr, size); 375762306a36Sopenharmony_ci 375862306a36Sopenharmony_ci if (IS_ERR(rings)) 375962306a36Sopenharmony_ci return PTR_ERR(rings); 376062306a36Sopenharmony_ci 376162306a36Sopenharmony_ci ctx->rings = rings; 376262306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 376362306a36Sopenharmony_ci ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 376462306a36Sopenharmony_ci rings->sq_ring_mask = p->sq_entries - 1; 376562306a36Sopenharmony_ci rings->cq_ring_mask = p->cq_entries - 1; 376662306a36Sopenharmony_ci rings->sq_ring_entries = p->sq_entries; 376762306a36Sopenharmony_ci rings->cq_ring_entries = p->cq_entries; 376862306a36Sopenharmony_ci 376962306a36Sopenharmony_ci if (p->flags & IORING_SETUP_SQE128) 377062306a36Sopenharmony_ci size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); 377162306a36Sopenharmony_ci else 377262306a36Sopenharmony_ci size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 377362306a36Sopenharmony_ci if (size == SIZE_MAX) { 377462306a36Sopenharmony_ci io_rings_free(ctx); 377562306a36Sopenharmony_ci return -EOVERFLOW; 377662306a36Sopenharmony_ci } 377762306a36Sopenharmony_ci 377862306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 377962306a36Sopenharmony_ci ptr = io_mem_alloc(size); 378062306a36Sopenharmony_ci else 378162306a36Sopenharmony_ci ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); 378262306a36Sopenharmony_ci 378362306a36Sopenharmony_ci if (IS_ERR(ptr)) { 378462306a36Sopenharmony_ci io_rings_free(ctx); 378562306a36Sopenharmony_ci return PTR_ERR(ptr); 378662306a36Sopenharmony_ci } 378762306a36Sopenharmony_ci 378862306a36Sopenharmony_ci ctx->sq_sqes = ptr; 378962306a36Sopenharmony_ci return 0; 379062306a36Sopenharmony_ci} 379162306a36Sopenharmony_ci 379262306a36Sopenharmony_cistatic int io_uring_install_fd(struct file *file) 379362306a36Sopenharmony_ci{ 379462306a36Sopenharmony_ci int fd; 379562306a36Sopenharmony_ci 379662306a36Sopenharmony_ci fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 379762306a36Sopenharmony_ci if (fd < 0) 379862306a36Sopenharmony_ci return fd; 379962306a36Sopenharmony_ci fd_install(fd, file); 380062306a36Sopenharmony_ci return fd; 380162306a36Sopenharmony_ci} 380262306a36Sopenharmony_ci 380362306a36Sopenharmony_ci/* 380462306a36Sopenharmony_ci * Allocate an anonymous fd, this is what constitutes the application 380562306a36Sopenharmony_ci * visible backing of an io_uring instance. The application mmaps this 380662306a36Sopenharmony_ci * fd to gain access to the SQ/CQ ring details. 
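 *
 * From the application side the mapping conventionally looks like the
 * following (illustrative sketch assuming the classic mmap-based setup,
 * i.e. not IORING_SETUP_NO_MMAP; ring_fd and the filled-in params p are
 * assumed, error handling omitted). The CQ ring is reached the same way
 * via IORING_OFF_CQ_RING, or shares the first mapping when
 * IORING_FEAT_SINGLE_MMAP is advertised:
 *
 *	sq_ptr = mmap(NULL, p.sq_off.array + p.sq_entries * sizeof(__u32),
 *		      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		      ring_fd, IORING_OFF_SQ_RING);
 *	sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
 *		    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *		    ring_fd, IORING_OFF_SQES);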
380762306a36Sopenharmony_ci */ 380862306a36Sopenharmony_cistatic struct file *io_uring_get_file(struct io_ring_ctx *ctx) 380962306a36Sopenharmony_ci{ 381062306a36Sopenharmony_ci return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, 381162306a36Sopenharmony_ci O_RDWR | O_CLOEXEC, NULL); 381262306a36Sopenharmony_ci} 381362306a36Sopenharmony_ci 381462306a36Sopenharmony_cistatic __cold int io_uring_create(unsigned entries, struct io_uring_params *p, 381562306a36Sopenharmony_ci struct io_uring_params __user *params) 381662306a36Sopenharmony_ci{ 381762306a36Sopenharmony_ci struct io_ring_ctx *ctx; 381862306a36Sopenharmony_ci struct io_uring_task *tctx; 381962306a36Sopenharmony_ci struct file *file; 382062306a36Sopenharmony_ci int ret; 382162306a36Sopenharmony_ci 382262306a36Sopenharmony_ci if (!entries) 382362306a36Sopenharmony_ci return -EINVAL; 382462306a36Sopenharmony_ci if (entries > IORING_MAX_ENTRIES) { 382562306a36Sopenharmony_ci if (!(p->flags & IORING_SETUP_CLAMP)) 382662306a36Sopenharmony_ci return -EINVAL; 382762306a36Sopenharmony_ci entries = IORING_MAX_ENTRIES; 382862306a36Sopenharmony_ci } 382962306a36Sopenharmony_ci 383062306a36Sopenharmony_ci if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) 383162306a36Sopenharmony_ci && !(p->flags & IORING_SETUP_NO_MMAP)) 383262306a36Sopenharmony_ci return -EINVAL; 383362306a36Sopenharmony_ci 383462306a36Sopenharmony_ci /* 383562306a36Sopenharmony_ci * Use twice as many entries for the CQ ring. It's possible for the 383662306a36Sopenharmony_ci * application to drive a higher depth than the size of the SQ ring, 383762306a36Sopenharmony_ci * since the sqes are only used at submission time. This allows for 383862306a36Sopenharmony_ci * some flexibility in overcommitting a bit. If the application has 383962306a36Sopenharmony_ci * set IORING_SETUP_CQSIZE, it will have passed in the desired number 384062306a36Sopenharmony_ci * of CQ ring entries manually. 384162306a36Sopenharmony_ci */ 384262306a36Sopenharmony_ci p->sq_entries = roundup_pow_of_two(entries); 384362306a36Sopenharmony_ci if (p->flags & IORING_SETUP_CQSIZE) { 384462306a36Sopenharmony_ci /* 384562306a36Sopenharmony_ci * If IORING_SETUP_CQSIZE is set, we do the same roundup 384662306a36Sopenharmony_ci * to a power-of-two, if it isn't already. We do NOT impose 384762306a36Sopenharmony_ci * any cq vs sq ring sizing. 
384862306a36Sopenharmony_ci */ 384962306a36Sopenharmony_ci if (!p->cq_entries) 385062306a36Sopenharmony_ci return -EINVAL; 385162306a36Sopenharmony_ci if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 385262306a36Sopenharmony_ci if (!(p->flags & IORING_SETUP_CLAMP)) 385362306a36Sopenharmony_ci return -EINVAL; 385462306a36Sopenharmony_ci p->cq_entries = IORING_MAX_CQ_ENTRIES; 385562306a36Sopenharmony_ci } 385662306a36Sopenharmony_ci p->cq_entries = roundup_pow_of_two(p->cq_entries); 385762306a36Sopenharmony_ci if (p->cq_entries < p->sq_entries) 385862306a36Sopenharmony_ci return -EINVAL; 385962306a36Sopenharmony_ci } else { 386062306a36Sopenharmony_ci p->cq_entries = 2 * p->sq_entries; 386162306a36Sopenharmony_ci } 386262306a36Sopenharmony_ci 386362306a36Sopenharmony_ci ctx = io_ring_ctx_alloc(p); 386462306a36Sopenharmony_ci if (!ctx) 386562306a36Sopenharmony_ci return -ENOMEM; 386662306a36Sopenharmony_ci 386762306a36Sopenharmony_ci if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 386862306a36Sopenharmony_ci !(ctx->flags & IORING_SETUP_IOPOLL) && 386962306a36Sopenharmony_ci !(ctx->flags & IORING_SETUP_SQPOLL)) 387062306a36Sopenharmony_ci ctx->task_complete = true; 387162306a36Sopenharmony_ci 387262306a36Sopenharmony_ci if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) 387362306a36Sopenharmony_ci ctx->lockless_cq = true; 387462306a36Sopenharmony_ci 387562306a36Sopenharmony_ci /* 387662306a36Sopenharmony_ci * lazy poll_wq activation relies on ->task_complete for synchronisation 387762306a36Sopenharmony_ci * purposes, see io_activate_pollwq() 387862306a36Sopenharmony_ci */ 387962306a36Sopenharmony_ci if (!ctx->task_complete) 388062306a36Sopenharmony_ci ctx->poll_activated = true; 388162306a36Sopenharmony_ci 388262306a36Sopenharmony_ci /* 388362306a36Sopenharmony_ci * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 388462306a36Sopenharmony_ci * space applications don't need to do io completion events 388562306a36Sopenharmony_ci * polling again, they can rely on io_sq_thread to do polling 388662306a36Sopenharmony_ci * work, which can reduce cpu usage and uring_lock contention. 388762306a36Sopenharmony_ci */ 388862306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_IOPOLL && 388962306a36Sopenharmony_ci !(ctx->flags & IORING_SETUP_SQPOLL)) 389062306a36Sopenharmony_ci ctx->syscall_iopoll = 1; 389162306a36Sopenharmony_ci 389262306a36Sopenharmony_ci ctx->compat = in_compat_syscall(); 389362306a36Sopenharmony_ci if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) 389462306a36Sopenharmony_ci ctx->user = get_uid(current_user()); 389562306a36Sopenharmony_ci 389662306a36Sopenharmony_ci /* 389762306a36Sopenharmony_ci * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if 389862306a36Sopenharmony_ci * COOP_TASKRUN is set, then IPIs are never needed by the app. 
389962306a36Sopenharmony_ci */ 390062306a36Sopenharmony_ci ret = -EINVAL; 390162306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_SQPOLL) { 390262306a36Sopenharmony_ci /* IPI related flags don't make sense with SQPOLL */ 390362306a36Sopenharmony_ci if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | 390462306a36Sopenharmony_ci IORING_SETUP_TASKRUN_FLAG | 390562306a36Sopenharmony_ci IORING_SETUP_DEFER_TASKRUN)) 390662306a36Sopenharmony_ci goto err; 390762306a36Sopenharmony_ci ctx->notify_method = TWA_SIGNAL_NO_IPI; 390862306a36Sopenharmony_ci } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { 390962306a36Sopenharmony_ci ctx->notify_method = TWA_SIGNAL_NO_IPI; 391062306a36Sopenharmony_ci } else { 391162306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && 391262306a36Sopenharmony_ci !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 391362306a36Sopenharmony_ci goto err; 391462306a36Sopenharmony_ci ctx->notify_method = TWA_SIGNAL; 391562306a36Sopenharmony_ci } 391662306a36Sopenharmony_ci 391762306a36Sopenharmony_ci /* 391862306a36Sopenharmony_ci * For DEFER_TASKRUN we require the completion task to be the same as the 391962306a36Sopenharmony_ci * submission task. This implies that there is only one submitter, so enforce 392062306a36Sopenharmony_ci * that. 392162306a36Sopenharmony_ci */ 392262306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && 392362306a36Sopenharmony_ci !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { 392462306a36Sopenharmony_ci goto err; 392562306a36Sopenharmony_ci } 392662306a36Sopenharmony_ci 392762306a36Sopenharmony_ci /* 392862306a36Sopenharmony_ci * This is just grabbed for accounting purposes. When a process exits, 392962306a36Sopenharmony_ci * the mm is exited and dropped before the files, hence we need to hang 393062306a36Sopenharmony_ci * on to this mm purely for the purposes of being able to unaccount 393162306a36Sopenharmony_ci * memory (locked/pinned vm). It's not used for anything else. 
393262306a36Sopenharmony_ci */ 393362306a36Sopenharmony_ci mmgrab(current->mm); 393462306a36Sopenharmony_ci ctx->mm_account = current->mm; 393562306a36Sopenharmony_ci 393662306a36Sopenharmony_ci ret = io_allocate_scq_urings(ctx, p); 393762306a36Sopenharmony_ci if (ret) 393862306a36Sopenharmony_ci goto err; 393962306a36Sopenharmony_ci 394062306a36Sopenharmony_ci ret = io_sq_offload_create(ctx, p); 394162306a36Sopenharmony_ci if (ret) 394262306a36Sopenharmony_ci goto err; 394362306a36Sopenharmony_ci 394462306a36Sopenharmony_ci ret = io_rsrc_init(ctx); 394562306a36Sopenharmony_ci if (ret) 394662306a36Sopenharmony_ci goto err; 394762306a36Sopenharmony_ci 394862306a36Sopenharmony_ci p->sq_off.head = offsetof(struct io_rings, sq.head); 394962306a36Sopenharmony_ci p->sq_off.tail = offsetof(struct io_rings, sq.tail); 395062306a36Sopenharmony_ci p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); 395162306a36Sopenharmony_ci p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); 395262306a36Sopenharmony_ci p->sq_off.flags = offsetof(struct io_rings, sq_flags); 395362306a36Sopenharmony_ci p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); 395462306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 395562306a36Sopenharmony_ci p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 395662306a36Sopenharmony_ci p->sq_off.resv1 = 0; 395762306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 395862306a36Sopenharmony_ci p->sq_off.user_addr = 0; 395962306a36Sopenharmony_ci 396062306a36Sopenharmony_ci p->cq_off.head = offsetof(struct io_rings, cq.head); 396162306a36Sopenharmony_ci p->cq_off.tail = offsetof(struct io_rings, cq.tail); 396262306a36Sopenharmony_ci p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); 396362306a36Sopenharmony_ci p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); 396462306a36Sopenharmony_ci p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); 396562306a36Sopenharmony_ci p->cq_off.cqes = offsetof(struct io_rings, cqes); 396662306a36Sopenharmony_ci p->cq_off.flags = offsetof(struct io_rings, cq_flags); 396762306a36Sopenharmony_ci p->cq_off.resv1 = 0; 396862306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 396962306a36Sopenharmony_ci p->cq_off.user_addr = 0; 397062306a36Sopenharmony_ci 397162306a36Sopenharmony_ci p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 397262306a36Sopenharmony_ci IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 397362306a36Sopenharmony_ci IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 397462306a36Sopenharmony_ci IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 397562306a36Sopenharmony_ci IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 397662306a36Sopenharmony_ci IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 397762306a36Sopenharmony_ci IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; 397862306a36Sopenharmony_ci 397962306a36Sopenharmony_ci if (copy_to_user(params, p, sizeof(*p))) { 398062306a36Sopenharmony_ci ret = -EFAULT; 398162306a36Sopenharmony_ci goto err; 398262306a36Sopenharmony_ci } 398362306a36Sopenharmony_ci 398462306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_SINGLE_ISSUER 398562306a36Sopenharmony_ci && !(ctx->flags & IORING_SETUP_R_DISABLED)) 398662306a36Sopenharmony_ci WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 398762306a36Sopenharmony_ci 398862306a36Sopenharmony_ci file = io_uring_get_file(ctx); 398962306a36Sopenharmony_ci if (IS_ERR(file)) { 399062306a36Sopenharmony_ci ret = 
PTR_ERR(file); 399162306a36Sopenharmony_ci goto err; 399262306a36Sopenharmony_ci } 399362306a36Sopenharmony_ci 399462306a36Sopenharmony_ci ret = __io_uring_add_tctx_node(ctx); 399562306a36Sopenharmony_ci if (ret) 399662306a36Sopenharmony_ci goto err_fput; 399762306a36Sopenharmony_ci tctx = current->io_uring; 399862306a36Sopenharmony_ci 399962306a36Sopenharmony_ci /* 400062306a36Sopenharmony_ci * Install ring fd as the very last thing, so we don't risk someone 400162306a36Sopenharmony_ci * having closed it before we finish setup 400262306a36Sopenharmony_ci */ 400362306a36Sopenharmony_ci if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) 400462306a36Sopenharmony_ci ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); 400562306a36Sopenharmony_ci else 400662306a36Sopenharmony_ci ret = io_uring_install_fd(file); 400762306a36Sopenharmony_ci if (ret < 0) 400862306a36Sopenharmony_ci goto err_fput; 400962306a36Sopenharmony_ci 401062306a36Sopenharmony_ci trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 401162306a36Sopenharmony_ci return ret; 401262306a36Sopenharmony_cierr: 401362306a36Sopenharmony_ci io_ring_ctx_wait_and_kill(ctx); 401462306a36Sopenharmony_ci return ret; 401562306a36Sopenharmony_cierr_fput: 401662306a36Sopenharmony_ci fput(file); 401762306a36Sopenharmony_ci return ret; 401862306a36Sopenharmony_ci} 401962306a36Sopenharmony_ci 402062306a36Sopenharmony_ci/* 402162306a36Sopenharmony_ci * Sets up an aio uring context, and returns the fd. Applications asks for a 402262306a36Sopenharmony_ci * ring size, we return the actual sq/cq ring sizes (among other things) in the 402362306a36Sopenharmony_ci * params structure passed in. 402462306a36Sopenharmony_ci */ 402562306a36Sopenharmony_cistatic long io_uring_setup(u32 entries, struct io_uring_params __user *params) 402662306a36Sopenharmony_ci{ 402762306a36Sopenharmony_ci struct io_uring_params p; 402862306a36Sopenharmony_ci int i; 402962306a36Sopenharmony_ci 403062306a36Sopenharmony_ci if (copy_from_user(&p, params, sizeof(p))) 403162306a36Sopenharmony_ci return -EFAULT; 403262306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 403362306a36Sopenharmony_ci if (p.resv[i]) 403462306a36Sopenharmony_ci return -EINVAL; 403562306a36Sopenharmony_ci } 403662306a36Sopenharmony_ci 403762306a36Sopenharmony_ci if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 403862306a36Sopenharmony_ci IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 403962306a36Sopenharmony_ci IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 404062306a36Sopenharmony_ci IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | 404162306a36Sopenharmony_ci IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | 404262306a36Sopenharmony_ci IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 404362306a36Sopenharmony_ci IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | 404462306a36Sopenharmony_ci IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | 404562306a36Sopenharmony_ci IORING_SETUP_NO_SQARRAY)) 404662306a36Sopenharmony_ci return -EINVAL; 404762306a36Sopenharmony_ci 404862306a36Sopenharmony_ci return io_uring_create(entries, &p, params); 404962306a36Sopenharmony_ci} 405062306a36Sopenharmony_ci 405162306a36Sopenharmony_cistatic inline bool io_uring_allowed(void) 405262306a36Sopenharmony_ci{ 405362306a36Sopenharmony_ci int disabled = READ_ONCE(sysctl_io_uring_disabled); 405462306a36Sopenharmony_ci kgid_t io_uring_group; 405562306a36Sopenharmony_ci 405662306a36Sopenharmony_ci if (disabled == 2) 405762306a36Sopenharmony_ci return false; 
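	/*
	 * 0 means io_uring is open to everyone; with 1, only CAP_SYS_ADMIN
	 * and members of the group configured via sysctl_io_uring_group
	 * (checked below) may create new rings.
	 */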
405862306a36Sopenharmony_ci 405962306a36Sopenharmony_ci if (disabled == 0 || capable(CAP_SYS_ADMIN)) 406062306a36Sopenharmony_ci return true; 406162306a36Sopenharmony_ci 406262306a36Sopenharmony_ci io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); 406362306a36Sopenharmony_ci if (!gid_valid(io_uring_group)) 406462306a36Sopenharmony_ci return false; 406562306a36Sopenharmony_ci 406662306a36Sopenharmony_ci return in_group_p(io_uring_group); 406762306a36Sopenharmony_ci} 406862306a36Sopenharmony_ci 406962306a36Sopenharmony_ciSYSCALL_DEFINE2(io_uring_setup, u32, entries, 407062306a36Sopenharmony_ci struct io_uring_params __user *, params) 407162306a36Sopenharmony_ci{ 407262306a36Sopenharmony_ci if (!io_uring_allowed()) 407362306a36Sopenharmony_ci return -EPERM; 407462306a36Sopenharmony_ci 407562306a36Sopenharmony_ci return io_uring_setup(entries, params); 407662306a36Sopenharmony_ci} 407762306a36Sopenharmony_ci 407862306a36Sopenharmony_cistatic __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, 407962306a36Sopenharmony_ci unsigned nr_args) 408062306a36Sopenharmony_ci{ 408162306a36Sopenharmony_ci struct io_uring_probe *p; 408262306a36Sopenharmony_ci size_t size; 408362306a36Sopenharmony_ci int i, ret; 408462306a36Sopenharmony_ci 408562306a36Sopenharmony_ci size = struct_size(p, ops, nr_args); 408662306a36Sopenharmony_ci if (size == SIZE_MAX) 408762306a36Sopenharmony_ci return -EOVERFLOW; 408862306a36Sopenharmony_ci p = kzalloc(size, GFP_KERNEL); 408962306a36Sopenharmony_ci if (!p) 409062306a36Sopenharmony_ci return -ENOMEM; 409162306a36Sopenharmony_ci 409262306a36Sopenharmony_ci ret = -EFAULT; 409362306a36Sopenharmony_ci if (copy_from_user(p, arg, size)) 409462306a36Sopenharmony_ci goto out; 409562306a36Sopenharmony_ci ret = -EINVAL; 409662306a36Sopenharmony_ci if (memchr_inv(p, 0, size)) 409762306a36Sopenharmony_ci goto out; 409862306a36Sopenharmony_ci 409962306a36Sopenharmony_ci p->last_op = IORING_OP_LAST - 1; 410062306a36Sopenharmony_ci if (nr_args > IORING_OP_LAST) 410162306a36Sopenharmony_ci nr_args = IORING_OP_LAST; 410262306a36Sopenharmony_ci 410362306a36Sopenharmony_ci for (i = 0; i < nr_args; i++) { 410462306a36Sopenharmony_ci p->ops[i].op = i; 410562306a36Sopenharmony_ci if (!io_issue_defs[i].not_supported) 410662306a36Sopenharmony_ci p->ops[i].flags = IO_URING_OP_SUPPORTED; 410762306a36Sopenharmony_ci } 410862306a36Sopenharmony_ci p->ops_len = i; 410962306a36Sopenharmony_ci 411062306a36Sopenharmony_ci ret = 0; 411162306a36Sopenharmony_ci if (copy_to_user(arg, p, size)) 411262306a36Sopenharmony_ci ret = -EFAULT; 411362306a36Sopenharmony_ciout: 411462306a36Sopenharmony_ci kfree(p); 411562306a36Sopenharmony_ci return ret; 411662306a36Sopenharmony_ci} 411762306a36Sopenharmony_ci 411862306a36Sopenharmony_cistatic int io_register_personality(struct io_ring_ctx *ctx) 411962306a36Sopenharmony_ci{ 412062306a36Sopenharmony_ci const struct cred *creds; 412162306a36Sopenharmony_ci u32 id; 412262306a36Sopenharmony_ci int ret; 412362306a36Sopenharmony_ci 412462306a36Sopenharmony_ci creds = get_current_cred(); 412562306a36Sopenharmony_ci 412662306a36Sopenharmony_ci ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 412762306a36Sopenharmony_ci XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 412862306a36Sopenharmony_ci if (ret < 0) { 412962306a36Sopenharmony_ci put_cred(creds); 413062306a36Sopenharmony_ci return ret; 413162306a36Sopenharmony_ci } 413262306a36Sopenharmony_ci return id; 413362306a36Sopenharmony_ci} 413462306a36Sopenharmony_ci 
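/*
 * Registration example (illustrative sketch, not exercised by the kernel
 * itself): a ring created with IORING_SETUP_R_DISABLED can have a
 * restriction set applied and then be enabled via the opcodes handled
 * below. ring_fd is assumed to be such a ring; error handling is omitted.
 *
 *	struct io_uring_restriction res[2] = { };
 *
 *	res[0].opcode	 = IORING_RESTRICTION_SQE_OP;
 *	res[0].sqe_op	 = IORING_OP_READ;
 *	res[1].opcode	 = IORING_RESTRICTION_SQE_FLAGS_ALLOWED;
 *	res[1].sqe_flags = IOSQE_FIXED_FILE;
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 *
 * Once enabled, only IORING_OP_READ submissions whose SQE flags stay
 * within IOSQE_FIXED_FILE are accepted on this ring.
 */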
413562306a36Sopenharmony_cistatic __cold int io_register_restrictions(struct io_ring_ctx *ctx, 413662306a36Sopenharmony_ci void __user *arg, unsigned int nr_args) 413762306a36Sopenharmony_ci{ 413862306a36Sopenharmony_ci struct io_uring_restriction *res; 413962306a36Sopenharmony_ci size_t size; 414062306a36Sopenharmony_ci int i, ret; 414162306a36Sopenharmony_ci 414262306a36Sopenharmony_ci /* Restrictions allowed only if rings started disabled */ 414362306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 414462306a36Sopenharmony_ci return -EBADFD; 414562306a36Sopenharmony_ci 414662306a36Sopenharmony_ci /* We allow only a single restrictions registration */ 414762306a36Sopenharmony_ci if (ctx->restrictions.registered) 414862306a36Sopenharmony_ci return -EBUSY; 414962306a36Sopenharmony_ci 415062306a36Sopenharmony_ci if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 415162306a36Sopenharmony_ci return -EINVAL; 415262306a36Sopenharmony_ci 415362306a36Sopenharmony_ci size = array_size(nr_args, sizeof(*res)); 415462306a36Sopenharmony_ci if (size == SIZE_MAX) 415562306a36Sopenharmony_ci return -EOVERFLOW; 415662306a36Sopenharmony_ci 415762306a36Sopenharmony_ci res = memdup_user(arg, size); 415862306a36Sopenharmony_ci if (IS_ERR(res)) 415962306a36Sopenharmony_ci return PTR_ERR(res); 416062306a36Sopenharmony_ci 416162306a36Sopenharmony_ci ret = 0; 416262306a36Sopenharmony_ci 416362306a36Sopenharmony_ci for (i = 0; i < nr_args; i++) { 416462306a36Sopenharmony_ci switch (res[i].opcode) { 416562306a36Sopenharmony_ci case IORING_RESTRICTION_REGISTER_OP: 416662306a36Sopenharmony_ci if (res[i].register_op >= IORING_REGISTER_LAST) { 416762306a36Sopenharmony_ci ret = -EINVAL; 416862306a36Sopenharmony_ci goto out; 416962306a36Sopenharmony_ci } 417062306a36Sopenharmony_ci 417162306a36Sopenharmony_ci __set_bit(res[i].register_op, 417262306a36Sopenharmony_ci ctx->restrictions.register_op); 417362306a36Sopenharmony_ci break; 417462306a36Sopenharmony_ci case IORING_RESTRICTION_SQE_OP: 417562306a36Sopenharmony_ci if (res[i].sqe_op >= IORING_OP_LAST) { 417662306a36Sopenharmony_ci ret = -EINVAL; 417762306a36Sopenharmony_ci goto out; 417862306a36Sopenharmony_ci } 417962306a36Sopenharmony_ci 418062306a36Sopenharmony_ci __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 418162306a36Sopenharmony_ci break; 418262306a36Sopenharmony_ci case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 418362306a36Sopenharmony_ci ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 418462306a36Sopenharmony_ci break; 418562306a36Sopenharmony_ci case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 418662306a36Sopenharmony_ci ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 418762306a36Sopenharmony_ci break; 418862306a36Sopenharmony_ci default: 418962306a36Sopenharmony_ci ret = -EINVAL; 419062306a36Sopenharmony_ci goto out; 419162306a36Sopenharmony_ci } 419262306a36Sopenharmony_ci } 419362306a36Sopenharmony_ci 419462306a36Sopenharmony_ciout: 419562306a36Sopenharmony_ci /* Reset all restrictions if an error happened */ 419662306a36Sopenharmony_ci if (ret != 0) 419762306a36Sopenharmony_ci memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 419862306a36Sopenharmony_ci else 419962306a36Sopenharmony_ci ctx->restrictions.registered = true; 420062306a36Sopenharmony_ci 420162306a36Sopenharmony_ci kfree(res); 420262306a36Sopenharmony_ci return ret; 420362306a36Sopenharmony_ci} 420462306a36Sopenharmony_ci 420562306a36Sopenharmony_cistatic int io_register_enable_rings(struct io_ring_ctx *ctx) 420662306a36Sopenharmony_ci{ 
420762306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 420862306a36Sopenharmony_ci return -EBADFD; 420962306a36Sopenharmony_ci 421062306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { 421162306a36Sopenharmony_ci WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 421262306a36Sopenharmony_ci /* 421362306a36Sopenharmony_ci * Lazy activation attempts would fail if it was polled before 421462306a36Sopenharmony_ci * submitter_task is set. 421562306a36Sopenharmony_ci */ 421662306a36Sopenharmony_ci if (wq_has_sleeper(&ctx->poll_wq)) 421762306a36Sopenharmony_ci io_activate_pollwq(ctx); 421862306a36Sopenharmony_ci } 421962306a36Sopenharmony_ci 422062306a36Sopenharmony_ci if (ctx->restrictions.registered) 422162306a36Sopenharmony_ci ctx->restricted = 1; 422262306a36Sopenharmony_ci 422362306a36Sopenharmony_ci ctx->flags &= ~IORING_SETUP_R_DISABLED; 422462306a36Sopenharmony_ci if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 422562306a36Sopenharmony_ci wake_up(&ctx->sq_data->wait); 422662306a36Sopenharmony_ci return 0; 422762306a36Sopenharmony_ci} 422862306a36Sopenharmony_ci 422962306a36Sopenharmony_cistatic __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, 423062306a36Sopenharmony_ci cpumask_var_t new_mask) 423162306a36Sopenharmony_ci{ 423262306a36Sopenharmony_ci int ret; 423362306a36Sopenharmony_ci 423462306a36Sopenharmony_ci if (!(ctx->flags & IORING_SETUP_SQPOLL)) { 423562306a36Sopenharmony_ci ret = io_wq_cpu_affinity(current->io_uring, new_mask); 423662306a36Sopenharmony_ci } else { 423762306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 423862306a36Sopenharmony_ci ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); 423962306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 424062306a36Sopenharmony_ci } 424162306a36Sopenharmony_ci 424262306a36Sopenharmony_ci return ret; 424362306a36Sopenharmony_ci} 424462306a36Sopenharmony_ci 424562306a36Sopenharmony_cistatic __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, 424662306a36Sopenharmony_ci void __user *arg, unsigned len) 424762306a36Sopenharmony_ci{ 424862306a36Sopenharmony_ci cpumask_var_t new_mask; 424962306a36Sopenharmony_ci int ret; 425062306a36Sopenharmony_ci 425162306a36Sopenharmony_ci if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 425262306a36Sopenharmony_ci return -ENOMEM; 425362306a36Sopenharmony_ci 425462306a36Sopenharmony_ci cpumask_clear(new_mask); 425562306a36Sopenharmony_ci if (len > cpumask_size()) 425662306a36Sopenharmony_ci len = cpumask_size(); 425762306a36Sopenharmony_ci 425862306a36Sopenharmony_ci if (in_compat_syscall()) { 425962306a36Sopenharmony_ci ret = compat_get_bitmap(cpumask_bits(new_mask), 426062306a36Sopenharmony_ci (const compat_ulong_t __user *)arg, 426162306a36Sopenharmony_ci len * 8 /* CHAR_BIT */); 426262306a36Sopenharmony_ci } else { 426362306a36Sopenharmony_ci ret = copy_from_user(new_mask, arg, len); 426462306a36Sopenharmony_ci } 426562306a36Sopenharmony_ci 426662306a36Sopenharmony_ci if (ret) { 426762306a36Sopenharmony_ci free_cpumask_var(new_mask); 426862306a36Sopenharmony_ci return -EFAULT; 426962306a36Sopenharmony_ci } 427062306a36Sopenharmony_ci 427162306a36Sopenharmony_ci ret = __io_register_iowq_aff(ctx, new_mask); 427262306a36Sopenharmony_ci free_cpumask_var(new_mask); 427362306a36Sopenharmony_ci return ret; 427462306a36Sopenharmony_ci} 427562306a36Sopenharmony_ci 427662306a36Sopenharmony_cistatic __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 427762306a36Sopenharmony_ci{ 427862306a36Sopenharmony_ci 
return __io_register_iowq_aff(ctx, NULL); 427962306a36Sopenharmony_ci} 428062306a36Sopenharmony_ci 428162306a36Sopenharmony_cistatic __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, 428262306a36Sopenharmony_ci void __user *arg) 428362306a36Sopenharmony_ci __must_hold(&ctx->uring_lock) 428462306a36Sopenharmony_ci{ 428562306a36Sopenharmony_ci struct io_tctx_node *node; 428662306a36Sopenharmony_ci struct io_uring_task *tctx = NULL; 428762306a36Sopenharmony_ci struct io_sq_data *sqd = NULL; 428862306a36Sopenharmony_ci __u32 new_count[2]; 428962306a36Sopenharmony_ci int i, ret; 429062306a36Sopenharmony_ci 429162306a36Sopenharmony_ci if (copy_from_user(new_count, arg, sizeof(new_count))) 429262306a36Sopenharmony_ci return -EFAULT; 429362306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(new_count); i++) 429462306a36Sopenharmony_ci if (new_count[i] > INT_MAX) 429562306a36Sopenharmony_ci return -EINVAL; 429662306a36Sopenharmony_ci 429762306a36Sopenharmony_ci if (ctx->flags & IORING_SETUP_SQPOLL) { 429862306a36Sopenharmony_ci sqd = ctx->sq_data; 429962306a36Sopenharmony_ci if (sqd) { 430062306a36Sopenharmony_ci /* 430162306a36Sopenharmony_ci * Observe the correct sqd->lock -> ctx->uring_lock 430262306a36Sopenharmony_ci * ordering. Fine to drop uring_lock here, we hold 430362306a36Sopenharmony_ci * a ref to the ctx. 430462306a36Sopenharmony_ci */ 430562306a36Sopenharmony_ci refcount_inc(&sqd->refs); 430662306a36Sopenharmony_ci mutex_unlock(&ctx->uring_lock); 430762306a36Sopenharmony_ci mutex_lock(&sqd->lock); 430862306a36Sopenharmony_ci mutex_lock(&ctx->uring_lock); 430962306a36Sopenharmony_ci if (sqd->thread) 431062306a36Sopenharmony_ci tctx = sqd->thread->io_uring; 431162306a36Sopenharmony_ci } 431262306a36Sopenharmony_ci } else { 431362306a36Sopenharmony_ci tctx = current->io_uring; 431462306a36Sopenharmony_ci } 431562306a36Sopenharmony_ci 431662306a36Sopenharmony_ci BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 431762306a36Sopenharmony_ci 431862306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(new_count); i++) 431962306a36Sopenharmony_ci if (new_count[i]) 432062306a36Sopenharmony_ci ctx->iowq_limits[i] = new_count[i]; 432162306a36Sopenharmony_ci ctx->iowq_limits_set = true; 432262306a36Sopenharmony_ci 432362306a36Sopenharmony_ci if (tctx && tctx->io_wq) { 432462306a36Sopenharmony_ci ret = io_wq_max_workers(tctx->io_wq, new_count); 432562306a36Sopenharmony_ci if (ret) 432662306a36Sopenharmony_ci goto err; 432762306a36Sopenharmony_ci } else { 432862306a36Sopenharmony_ci memset(new_count, 0, sizeof(new_count)); 432962306a36Sopenharmony_ci } 433062306a36Sopenharmony_ci 433162306a36Sopenharmony_ci if (sqd) { 433262306a36Sopenharmony_ci mutex_unlock(&sqd->lock); 433362306a36Sopenharmony_ci io_put_sq_data(sqd); 433462306a36Sopenharmony_ci } 433562306a36Sopenharmony_ci 433662306a36Sopenharmony_ci if (copy_to_user(arg, new_count, sizeof(new_count))) 433762306a36Sopenharmony_ci return -EFAULT; 433862306a36Sopenharmony_ci 433962306a36Sopenharmony_ci /* that's it for SQPOLL, only the SQPOLL task creates requests */ 434062306a36Sopenharmony_ci if (sqd) 434162306a36Sopenharmony_ci return 0; 434262306a36Sopenharmony_ci 434362306a36Sopenharmony_ci /* now propagate the restriction to all registered users */ 434462306a36Sopenharmony_ci list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 434562306a36Sopenharmony_ci struct io_uring_task *tctx = node->task->io_uring; 434662306a36Sopenharmony_ci 434762306a36Sopenharmony_ci if (WARN_ON_ONCE(!tctx->io_wq)) 434862306a36Sopenharmony_ci 
continue; 434962306a36Sopenharmony_ci 435062306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(new_count); i++) 435162306a36Sopenharmony_ci new_count[i] = ctx->iowq_limits[i]; 435262306a36Sopenharmony_ci /* ignore errors, it always returns zero anyway */ 435362306a36Sopenharmony_ci (void)io_wq_max_workers(tctx->io_wq, new_count); 435462306a36Sopenharmony_ci } 435562306a36Sopenharmony_ci return 0; 435662306a36Sopenharmony_cierr: 435762306a36Sopenharmony_ci if (sqd) { 435862306a36Sopenharmony_ci mutex_unlock(&sqd->lock); 435962306a36Sopenharmony_ci io_put_sq_data(sqd); 436062306a36Sopenharmony_ci } 436162306a36Sopenharmony_ci return ret; 436262306a36Sopenharmony_ci} 436362306a36Sopenharmony_ci 436462306a36Sopenharmony_cistatic int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 436562306a36Sopenharmony_ci void __user *arg, unsigned nr_args) 436662306a36Sopenharmony_ci __releases(ctx->uring_lock) 436762306a36Sopenharmony_ci __acquires(ctx->uring_lock) 436862306a36Sopenharmony_ci{ 436962306a36Sopenharmony_ci int ret; 437062306a36Sopenharmony_ci 437162306a36Sopenharmony_ci /* 437262306a36Sopenharmony_ci * We don't quiesce the refs for register anymore and so it can't be 437362306a36Sopenharmony_ci * dying as we're holding a file ref here. 437462306a36Sopenharmony_ci */ 437562306a36Sopenharmony_ci if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) 437662306a36Sopenharmony_ci return -ENXIO; 437762306a36Sopenharmony_ci 437862306a36Sopenharmony_ci if (ctx->submitter_task && ctx->submitter_task != current) 437962306a36Sopenharmony_ci return -EEXIST; 438062306a36Sopenharmony_ci 438162306a36Sopenharmony_ci if (ctx->restricted) { 438262306a36Sopenharmony_ci opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 438362306a36Sopenharmony_ci if (!test_bit(opcode, ctx->restrictions.register_op)) 438462306a36Sopenharmony_ci return -EACCES; 438562306a36Sopenharmony_ci } 438662306a36Sopenharmony_ci 438762306a36Sopenharmony_ci switch (opcode) { 438862306a36Sopenharmony_ci case IORING_REGISTER_BUFFERS: 438962306a36Sopenharmony_ci ret = -EFAULT; 439062306a36Sopenharmony_ci if (!arg) 439162306a36Sopenharmony_ci break; 439262306a36Sopenharmony_ci ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 439362306a36Sopenharmony_ci break; 439462306a36Sopenharmony_ci case IORING_UNREGISTER_BUFFERS: 439562306a36Sopenharmony_ci ret = -EINVAL; 439662306a36Sopenharmony_ci if (arg || nr_args) 439762306a36Sopenharmony_ci break; 439862306a36Sopenharmony_ci ret = io_sqe_buffers_unregister(ctx); 439962306a36Sopenharmony_ci break; 440062306a36Sopenharmony_ci case IORING_REGISTER_FILES: 440162306a36Sopenharmony_ci ret = -EFAULT; 440262306a36Sopenharmony_ci if (!arg) 440362306a36Sopenharmony_ci break; 440462306a36Sopenharmony_ci ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 440562306a36Sopenharmony_ci break; 440662306a36Sopenharmony_ci case IORING_UNREGISTER_FILES: 440762306a36Sopenharmony_ci ret = -EINVAL; 440862306a36Sopenharmony_ci if (arg || nr_args) 440962306a36Sopenharmony_ci break; 441062306a36Sopenharmony_ci ret = io_sqe_files_unregister(ctx); 441162306a36Sopenharmony_ci break; 441262306a36Sopenharmony_ci case IORING_REGISTER_FILES_UPDATE: 441362306a36Sopenharmony_ci ret = io_register_files_update(ctx, arg, nr_args); 441462306a36Sopenharmony_ci break; 441562306a36Sopenharmony_ci case IORING_REGISTER_EVENTFD: 441662306a36Sopenharmony_ci ret = -EINVAL; 441762306a36Sopenharmony_ci if (nr_args != 1) 441862306a36Sopenharmony_ci break; 441962306a36Sopenharmony_ci ret = io_eventfd_register(ctx, arg, 0); 
442062306a36Sopenharmony_ci break; 442162306a36Sopenharmony_ci case IORING_REGISTER_EVENTFD_ASYNC: 442262306a36Sopenharmony_ci ret = -EINVAL; 442362306a36Sopenharmony_ci if (nr_args != 1) 442462306a36Sopenharmony_ci break; 442562306a36Sopenharmony_ci ret = io_eventfd_register(ctx, arg, 1); 442662306a36Sopenharmony_ci break; 442762306a36Sopenharmony_ci case IORING_UNREGISTER_EVENTFD: 442862306a36Sopenharmony_ci ret = -EINVAL; 442962306a36Sopenharmony_ci if (arg || nr_args) 443062306a36Sopenharmony_ci break; 443162306a36Sopenharmony_ci ret = io_eventfd_unregister(ctx); 443262306a36Sopenharmony_ci break; 443362306a36Sopenharmony_ci case IORING_REGISTER_PROBE: 443462306a36Sopenharmony_ci ret = -EINVAL; 443562306a36Sopenharmony_ci if (!arg || nr_args > 256) 443662306a36Sopenharmony_ci break; 443762306a36Sopenharmony_ci ret = io_probe(ctx, arg, nr_args); 443862306a36Sopenharmony_ci break; 443962306a36Sopenharmony_ci case IORING_REGISTER_PERSONALITY: 444062306a36Sopenharmony_ci ret = -EINVAL; 444162306a36Sopenharmony_ci if (arg || nr_args) 444262306a36Sopenharmony_ci break; 444362306a36Sopenharmony_ci ret = io_register_personality(ctx); 444462306a36Sopenharmony_ci break; 444562306a36Sopenharmony_ci case IORING_UNREGISTER_PERSONALITY: 444662306a36Sopenharmony_ci ret = -EINVAL; 444762306a36Sopenharmony_ci if (arg) 444862306a36Sopenharmony_ci break; 444962306a36Sopenharmony_ci ret = io_unregister_personality(ctx, nr_args); 445062306a36Sopenharmony_ci break; 445162306a36Sopenharmony_ci case IORING_REGISTER_ENABLE_RINGS: 445262306a36Sopenharmony_ci ret = -EINVAL; 445362306a36Sopenharmony_ci if (arg || nr_args) 445462306a36Sopenharmony_ci break; 445562306a36Sopenharmony_ci ret = io_register_enable_rings(ctx); 445662306a36Sopenharmony_ci break; 445762306a36Sopenharmony_ci case IORING_REGISTER_RESTRICTIONS: 445862306a36Sopenharmony_ci ret = io_register_restrictions(ctx, arg, nr_args); 445962306a36Sopenharmony_ci break; 446062306a36Sopenharmony_ci case IORING_REGISTER_FILES2: 446162306a36Sopenharmony_ci ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 446262306a36Sopenharmony_ci break; 446362306a36Sopenharmony_ci case IORING_REGISTER_FILES_UPDATE2: 446462306a36Sopenharmony_ci ret = io_register_rsrc_update(ctx, arg, nr_args, 446562306a36Sopenharmony_ci IORING_RSRC_FILE); 446662306a36Sopenharmony_ci break; 446762306a36Sopenharmony_ci case IORING_REGISTER_BUFFERS2: 446862306a36Sopenharmony_ci ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 446962306a36Sopenharmony_ci break; 447062306a36Sopenharmony_ci case IORING_REGISTER_BUFFERS_UPDATE: 447162306a36Sopenharmony_ci ret = io_register_rsrc_update(ctx, arg, nr_args, 447262306a36Sopenharmony_ci IORING_RSRC_BUFFER); 447362306a36Sopenharmony_ci break; 447462306a36Sopenharmony_ci case IORING_REGISTER_IOWQ_AFF: 447562306a36Sopenharmony_ci ret = -EINVAL; 447662306a36Sopenharmony_ci if (!arg || !nr_args) 447762306a36Sopenharmony_ci break; 447862306a36Sopenharmony_ci ret = io_register_iowq_aff(ctx, arg, nr_args); 447962306a36Sopenharmony_ci break; 448062306a36Sopenharmony_ci case IORING_UNREGISTER_IOWQ_AFF: 448162306a36Sopenharmony_ci ret = -EINVAL; 448262306a36Sopenharmony_ci if (arg || nr_args) 448362306a36Sopenharmony_ci break; 448462306a36Sopenharmony_ci ret = io_unregister_iowq_aff(ctx); 448562306a36Sopenharmony_ci break; 448662306a36Sopenharmony_ci case IORING_REGISTER_IOWQ_MAX_WORKERS: 448762306a36Sopenharmony_ci ret = -EINVAL; 448862306a36Sopenharmony_ci if (!arg || nr_args != 2) 448962306a36Sopenharmony_ci break; 
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
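		 *
		 * The index used as 'fd' here is the offset the application
		 * obtained (or requested) when it registered the ring fd with
		 * IORING_REGISTER_RING_FDS; it opts into this lookup by
		 * setting IORING_REGISTER_USE_REGISTERED_RING in the opcode.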
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
	BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
	BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
	BUILD_BUG_SQE_ELEM(16, __u64, addr);
	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32, len);
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16, personality);
	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
	BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
	BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
	BUILD_BUG_SQE_ELEM(48, __u64, addr3);
	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
	BUILD_BUG_SQE_ELEM(56, __u64, __pad2);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

	io_uring_optable_init();

	/*
	 * Allow user copy in the per-command field, which starts after the
	 * file in io_kiocb and until the opcode field. The openat2 handling
	 * requires copying in user memory into the io_kiocb object in that
	 * range, and HARDENED_USERCOPY will complain if we haven't
	 * correctly annotated this range.
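	 * (For instance, the openat2 prep path copies a struct open_how from
	 * userspace directly into this per-command window.)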
	 */
	req_cachep = kmem_cache_create_usercopy("io_kiocb",
				sizeof(struct io_kiocb), 0,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
				offsetof(struct io_kiocb, cmd.data),
				sizeof_field(struct io_kiocb, cmd.data), NULL);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif

	return 0;
}
__initcall(io_uring_init);
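
/*
 * Illustrative userspace sketch (not part of the kernel source proper): a
 * minimal io_uring_register(2) call matching the IORING_REGISTER_EVENTFD
 * checks above. 'ring_fd' is assumed to be a descriptor returned by
 * io_uring_setup(2).
 *
 *	int efd = eventfd(0, 0);
 *	int ret = syscall(__NR_io_uring_register, ring_fd,
 *			  IORING_REGISTER_EVENTFD, &efd, 1);
 *
 * Any nr_args other than 1 is rejected with EINVAL by __io_uring_register().
 */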