// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/fsnotify.h>
#include <linux/poll.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "opdef.h"
#include "kbuf.h"
#include "rsrc.h"
#include "rw.h"

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u32				len;
	rwf_t				flags;
};

static inline bool io_file_supports_nowait(struct io_kiocb *req)
{
	return req->flags & REQ_F_SUPPORT_NOWAIT;
}

#ifdef CONFIG_COMPAT
static int io_iov_compat_buffer_select_prep(struct io_rw *rw)
{
	struct compat_iovec __user *uiov;
	compat_ssize_t clen;

	uiov = u64_to_user_ptr(rw->addr);
	if (!access_ok(uiov, sizeof(*uiov)))
		return -EFAULT;
	if (__get_user(clen, &uiov->iov_len))
		return -EFAULT;
	if (clen < 0)
		return -EINVAL;

	rw->len = clen;
	return 0;
}
#endif

static int io_iov_buffer_select_prep(struct io_kiocb *req)
{
	struct iovec __user *uiov;
	struct iovec iov;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->len != 1)
		return -EINVAL;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return io_iov_compat_buffer_select_prep(rw);
#endif

	uiov = u64_to_user_ptr(rw->addr);
	if (copy_from_user(&iov, uiov, sizeof(*uiov)))
		return -EFAULT;
	rw->len = iov.iov_len;
	return 0;
}
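/*
 * For orientation, a hedged sketch (not part of this file) of the userspace
 * side that feeds io_prep_rw() below: liburing's prep helpers fill
 * sqe->addr, sqe->len, sqe->off and sqe->rw_flags, which are the fields
 * read here. Names follow liburing; treat this as illustrative only:
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *	io_uring_prep_read(sqe, fd, buf, sizeof(buf), offset);
 *	sqe->rw_flags = RWF_NOWAIT;	// ends up in rw->flags
 *	io_uring_submit(&ring);
 */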
int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned ioprio;
	int ret;

	rw->kiocb.ki_pos = READ_ONCE(sqe->off);
	/* used for fixed read/write too - just read unconditionally */
	req->buf_index = READ_ONCE(sqe->buf_index);

	if (req->opcode == IORING_OP_READ_FIXED ||
	    req->opcode == IORING_OP_WRITE_FIXED) {
		struct io_ring_ctx *ctx = req->ctx;
		u16 index;

		if (unlikely(req->buf_index >= ctx->nr_user_bufs))
			return -EFAULT;
		index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
		req->imu = ctx->user_bufs[index];
		io_req_set_rsrc_node(req, ctx, 0);
	}

	ioprio = READ_ONCE(sqe->ioprio);
	if (ioprio) {
		ret = ioprio_check_cap(ioprio);
		if (ret)
			return ret;

		rw->kiocb.ki_ioprio = ioprio;
	} else {
		rw->kiocb.ki_ioprio = get_current_ioprio();
	}
	rw->kiocb.dio_complete = NULL;

	rw->addr = READ_ONCE(sqe->addr);
	rw->len = READ_ONCE(sqe->len);
	rw->flags = READ_ONCE(sqe->rw_flags);

	/* Have to do this validation here, as in io_read() rw->len might
	 * have changed due to buffer selection
	 */
	if (req->opcode == IORING_OP_READV && req->flags & REQ_F_BUFFER_SELECT) {
		ret = io_iov_buffer_select_prep(req);
		if (ret)
			return ret;
	}

	return 0;
}

void io_readv_writev_cleanup(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	kfree(io->free_iovec);
}
static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
{
	switch (ret) {
	case -EIOCBQUEUED:
		break;
	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/*
		 * We can't just restart the syscall, since previously
		 * submitted sqes may already be in progress. Just fail this
		 * IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		kiocb->ki_complete(kiocb, ret);
	}
}
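/*
 * Usage note (hedged, per the io_uring man pages): an SQE offset of -1
 * means "use (and advance) the file position", matching what
 * preadv2(2)/pwritev2(2) do with offset -1. With liburing that is simply
 * (illustrative):
 *
 *	io_uring_prep_read(sqe, fd, buf, len, -1);
 *
 * io_kiocb_update_pos() below implements this: ki_pos was loaded from
 * sqe->off at prep time, and -1 is replaced with the file's f_pos for
 * non-stream files.
 */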
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_pos != -1)
		return &rw->kiocb.ki_pos;

	if (!(req->file->f_mode & FMODE_STREAM)) {
		req->flags |= REQ_F_CUR_POS;
		rw->kiocb.ki_pos = req->file->f_pos;
		return &rw->kiocb.ki_pos;
	}

	rw->kiocb.ki_pos = 0;
	return NULL;
}

static void io_req_task_queue_reissue(struct io_kiocb *req)
{
	req->io_task_work.func = io_queue_iowq;
	io_req_task_work_add(req);
}

#ifdef CONFIG_BLOCK
static bool io_resubmit_prep(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;

	if (!req_has_async_data(req))
		return !io_req_prep_async(req);
	iov_iter_restore(&io->s.iter, &io->s.iter_state);
	return true;
}

static bool io_rw_should_reissue(struct io_kiocb *req)
{
	umode_t mode = file_inode(req->file)->i_mode;
	struct io_ring_ctx *ctx = req->ctx;

	if (!S_ISBLK(mode) && !S_ISREG(mode))
		return false;
	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
	    !(ctx->flags & IORING_SETUP_IOPOLL)))
		return false;
	/*
	 * If ref is dying, we might be running poll reap from the exit work.
	 * Don't attempt to reissue from that path, just let it fail with
	 * -EAGAIN.
	 */
	if (percpu_ref_is_dying(&ctx->refs))
		return false;
	/*
	 * Play it safe and assume not safe to re-import and reissue if we're
	 * not in the original thread group (or in task context).
	 */
	if (!same_thread_group(req->task, current) || !in_task())
		return false;
	return true;
}
#else
static bool io_resubmit_prep(struct io_kiocb *req)
{
	return false;
}
static bool io_rw_should_reissue(struct io_kiocb *req)
{
	return false;
}
#endif

static void io_req_end_write(struct io_kiocb *req)
{
	if (req->flags & REQ_F_ISREG) {
		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

		kiocb_end_write(&rw->kiocb);
	}
}

/*
 * Trigger the notifications after having done some IO, and finish the write
 * accounting, if any.
 */
static void io_req_io_end(struct io_kiocb *req)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

	if (rw->kiocb.ki_flags & IOCB_WRITE) {
		io_req_end_write(req);
		fsnotify_modify(req->file);
	} else {
		fsnotify_access(req->file);
	}
}
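/*
 * Common completion handling for the non-iopoll path. A result that does
 * not match what was asked for is either a genuine short IO/error, or an
 * -EAGAIN/-EOPNOTSUPP that is worth one reissue attempt per the checks in
 * io_rw_should_reissue(). Returns true if the request was marked for
 * reissue, in which case the caller must not post a completion.
 */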
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
	if (unlikely(res != req->cqe.res)) {
		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
		    io_rw_should_reissue(req)) {
			/*
			 * Reissue will start accounting again, finish the
			 * current cycle.
			 */
			io_req_io_end(req);
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
			return true;
		}
		req_set_fail(req);
		req->cqe.res = res;
	}
	return false;
}

static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
{
	struct io_async_rw *io = req->async_data;

	/* add previously done IO, if any */
	if (req_has_async_data(req) && io->bytes_done > 0) {
		if (res < 0)
			res = io->bytes_done;
		else
			res += io->bytes_done;
	}
	return res;
}
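/*
 * Worked example for io_fixup_rw_res() above: if a read copied 4096 bytes
 * and then hit -EAGAIN, bytes_done holds 4096 and the request is retried.
 * A retry that returns 4096 more posts a CQE result of 8192; a retry that
 * fails outright still posts the 4096 already transferred rather than the
 * error.
 */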
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
		long res = kiocb->dio_complete(rw->kiocb.private);

		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}

	io_req_io_end(req);

	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
		unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;

		req->cqe.flags |= io_put_kbuf(req, issue_flags);
	}
	io_req_task_complete(req, ts);
}

static void io_complete_rw(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
		if (__io_complete_rw_common(req, res))
			return;
		io_req_set_res(req, io_fixup_rw_res(req, res), 0);
	}
	req->io_task_work.func = io_req_rw_complete;
	__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}

static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
{
	struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
	struct io_kiocb *req = cmd_to_io_kiocb(rw);

	if (kiocb->ki_flags & IOCB_WRITE)
		io_req_end_write(req);
	if (unlikely(res != req->cqe.res)) {
		if (res == -EAGAIN && io_rw_should_reissue(req)) {
			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
			return;
		}
		req->cqe.res = res;
	}

	/* order with io_iopoll_complete() checking ->iopoll_completed */
	smp_store_release(&req->iopoll_completed, 1);
}

static int kiocb_done(struct io_kiocb *req, ssize_t ret,
		      unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	unsigned final_ret = io_fixup_rw_res(req, ret);

	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
		req->file->f_pos = rw->kiocb.ki_pos;
	if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
		if (!__io_complete_rw_common(req, ret)) {
			/*
			 * Safe to call io_end from here as we're inline
			 * from the submission path.
			 */
			io_req_io_end(req);
			io_req_set_res(req, final_ret,
				       io_put_kbuf(req, issue_flags));
			return IOU_OK;
		}
	} else {
		io_rw_done(&rw->kiocb, ret);
	}

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		if (io_resubmit_prep(req))
			io_req_task_queue_reissue(req);
		else
			io_req_task_queue_fail(req, final_ret);
	}
	return IOU_ISSUE_SKIP_COMPLETE;
}
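/*
 * Map the user memory described by the SQE into an iov_iter. Three cases:
 * registered (fixed) buffers, a plain address/length pair (including
 * provided buffers picked via io_buffer_select()), and a readv/writev
 * style iovec array. Only the last case can hand back an iovec allocation
 * that the caller must eventually kfree(); the other cases return NULL.
 */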
static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
				       struct io_rw_state *s,
				       unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct iov_iter *iter = &s->iter;
	u8 opcode = req->opcode;
	struct iovec *iovec;
	void __user *buf;
	size_t sqe_len;
	ssize_t ret;

	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	buf = u64_to_user_ptr(rw->addr);
	sqe_len = rw->len;

	if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
	    (req->flags & REQ_F_BUFFER_SELECT)) {
		if (io_do_buffer_select(req)) {
			buf = io_buffer_select(req, &sqe_len, issue_flags);
			if (!buf)
				return ERR_PTR(-ENOBUFS);
			rw->addr = (unsigned long) buf;
			rw->len = sqe_len;
		}

		ret = import_ubuf(ddir, buf, sqe_len, iter);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	iovec = s->fast_iov;
	ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
			     req->ctx->compat);
	if (unlikely(ret < 0))
		return ERR_PTR(ret);
	return iovec;
}

static inline int io_import_iovec(int rw, struct io_kiocb *req,
				  struct iovec **iovec, struct io_rw_state *s,
				  unsigned int issue_flags)
{
	*iovec = __io_import_iovec(rw, req, s, issue_flags);
	if (IS_ERR(*iovec))
		return PTR_ERR(*iovec);

	iov_iter_save_state(&s->iter, &s->iter_state);
	return 0;
}
static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
{
	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
}

/*
 * For files that don't have ->read_iter() and ->write_iter(), handle them
 * by looping over ->read() or ->write() manually.
 */
static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
{
	struct kiocb *kiocb = &rw->kiocb;
	struct file *file = kiocb->ki_filp;
	ssize_t ret = 0;
	loff_t *ppos;

	/*
	 * Don't support polled IO through this interface, and we can't
	 * support non-blocking either. For the latter, this just causes
	 * the kiocb to be handled from an async context.
	 */
	if (kiocb->ki_flags & IOCB_HIPRI)
		return -EOPNOTSUPP;
	if ((kiocb->ki_flags & IOCB_NOWAIT) &&
	    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
		return -EAGAIN;

	ppos = io_kiocb_ppos(kiocb);

	while (iov_iter_count(iter)) {
		void __user *addr;
		size_t len;
		ssize_t nr;

		if (iter_is_ubuf(iter)) {
			addr = iter->ubuf + iter->iov_offset;
			len = iov_iter_count(iter);
		} else if (!iov_iter_is_bvec(iter)) {
			addr = iter_iov_addr(iter);
			len = iter_iov_len(iter);
		} else {
			addr = u64_to_user_ptr(rw->addr);
			len = rw->len;
		}

		if (ddir == READ)
			nr = file->f_op->read(file, addr, len, ppos);
		else
			nr = file->f_op->write(file, addr, len, ppos);

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (!iov_iter_is_bvec(iter)) {
			iov_iter_advance(iter, nr);
		} else {
			rw->addr += nr;
			rw->len -= nr;
			if (!rw->len)
				break;
		}
		if (nr != len)
			break;
	}

	return ret;
}
static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
			  const struct iovec *fast_iov, struct iov_iter *iter)
{
	struct io_async_rw *io = req->async_data;

	memcpy(&io->s.iter, iter, sizeof(*iter));
	io->free_iovec = iovec;
	io->bytes_done = 0;
	/* can only be fixed buffers, no need to do anything */
	if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
		return;
	if (!iovec) {
		unsigned iov_off = 0;

		io->s.iter.__iov = io->s.fast_iov;
		if (iter->__iov != fast_iov) {
			iov_off = iter_iov(iter) - fast_iov;
			io->s.iter.__iov += iov_off;
		}
		if (io->s.fast_iov != fast_iov)
			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
			       sizeof(struct iovec) * iter->nr_segs);
	} else {
		req->flags |= REQ_F_NEED_CLEANUP;
	}
}

static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
			     struct io_rw_state *s, bool force)
{
	if (!force && !io_cold_defs[req->opcode].prep_async)
		return 0;
	if (!req_has_async_data(req)) {
		struct io_async_rw *iorw;

		if (io_alloc_async_data(req)) {
			kfree(iovec);
			return -ENOMEM;
		}

		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
		iorw = req->async_data;
		/* we've copied and mapped the iter, ensure state is saved */
		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
	}
	return 0;
}
static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
{
	struct io_async_rw *iorw = req->async_data;
	struct iovec *iov;
	int ret;

	iorw->bytes_done = 0;
	iorw->free_iovec = NULL;

	/* submission path, ->uring_lock should already be taken */
	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
	if (unlikely(ret < 0))
		return ret;

	if (iov) {
		iorw->free_iovec = iov;
		req->flags |= REQ_F_NEED_CLEANUP;
	}

	return 0;
}

int io_readv_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, ITER_DEST);
}

int io_writev_prep_async(struct io_kiocb *req)
{
	return io_rw_prep_async(req, ITER_SOURCE);
}

/*
 * This is our waitqueue callback handler, registered through __folio_lock_async()
 * when we initially tried to do the IO with the iocb and armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
 * queue a task_work based retry of the operation, attempting to copy the data
 * again. If the latter fails because the page was NOT uptodate, then we will
 * do a thread based blocking retry of the operation. That's the unexpected
 * slow path.
 */
static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
			     int sync, void *arg)
{
	struct wait_page_queue *wpq;
	struct io_kiocb *req = wait->private;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct wait_page_key *key = arg;

	wpq = container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wpq, key))
		return 0;

	rw->kiocb.ki_flags &= ~IOCB_WAITQ;
	list_del_init(&wait->entry);
	io_req_task_queue(req);
	return 1;
}
/*
 * This controls whether a given IO request should be armed for async page
 * based retry. If we return false here, the request is handed to the async
 * worker threads for retry. If we're doing buffered reads on a regular file,
 * we prepare a private wait_page_queue entry and retry the operation. This
 * will either succeed because the page is now uptodate and unlocked, or it
 * will register a callback when the page is unlocked at IO completion. Through
 * that callback, io_uring uses task_work to setup a retry of the operation.
 * That retry will attempt the buffered read again. The retry will generally
 * succeed, or in rare cases where it fails, we then fall back to using the
 * async worker threads for a blocking retry.
 */
static bool io_rw_should_retry(struct io_kiocb *req)
{
	struct io_async_rw *io = req->async_data;
	struct wait_page_queue *wait = &io->wpq;
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;

	/* never retry for NOWAIT, we just complete with -EAGAIN */
	if (req->flags & REQ_F_NOWAIT)
		return false;

	/* Only for buffered IO */
	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI))
		return false;

	/*
	 * just use poll if we can, and don't attempt if the fs doesn't
	 * support callback based unlocks
	 */
	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
		return false;

	wait->wait.func = io_async_buf_func;
	wait->wait.private = req;
	wait->wait.flags = 0;
	INIT_LIST_HEAD(&wait->wait.entry);
	kiocb->ki_flags |= IOCB_WAITQ;
	kiocb->ki_flags &= ~IOCB_NOWAIT;
	kiocb->ki_waitq = wait;
	return true;
}

static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
{
	struct file *file = rw->kiocb.ki_filp;

	if (likely(file->f_op->read_iter))
		return call_read_iter(file, &rw->kiocb, iter);
	else if (file->f_op->read)
		return loop_rw_iter(READ, rw, iter);
	else
		return -EINVAL;
}

static bool need_complete_io(struct io_kiocb *req)
{
	return req->flags & REQ_F_ISREG ||
		S_ISBLK(file_inode(req->file)->i_mode);
}
static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct kiocb *kiocb = &rw->kiocb;
	struct io_ring_ctx *ctx = req->ctx;
	struct file *file = req->file;
	int ret;

	if (unlikely(!file || !(file->f_mode & mode)))
		return -EBADF;

	if (!(req->flags & REQ_F_FIXED_FILE))
		req->flags |= io_file_get_flags(file);

	kiocb->ki_flags = file->f_iocb_flags;
	ret = kiocb_set_rw_flags(kiocb, rw->flags);
	if (unlikely(ret))
		return ret;
	kiocb->ki_flags |= IOCB_ALLOC_CACHE;

	/*
	 * If the file is marked O_NONBLOCK, still allow retry for it if it
	 * supports async. Otherwise it's impossible to use O_NONBLOCK files
	 * reliably. If not, or if IOCB_NOWAIT is set, don't retry.
	 */
	if ((kiocb->ki_flags & IOCB_NOWAIT) ||
	    ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req)))
		req->flags |= REQ_F_NOWAIT;

	if (ctx->flags & IORING_SETUP_IOPOLL) {
		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
			return -EOPNOTSUPP;

		kiocb->private = NULL;
		kiocb->ki_flags |= IOCB_HIPRI;
		kiocb->ki_complete = io_complete_rw_iopoll;
		req->iopoll_completed = 0;
	} else {
		if (kiocb->ki_flags & IOCB_HIPRI)
			return -EINVAL;
		kiocb->ki_complete = io_complete_rw;
	}

	return 0;
}
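/*
 * Hedged usage note for the IOPOLL branch above: on a ring created with
 * IORING_SETUP_IOPOLL, io_rw_init_file() only accepts O_DIRECT files on
 * backends that implement ->iopoll(); anything else fails with
 * -EOPNOTSUPP. Illustrative liburing setup:
 *
 *	struct io_uring ring;
 *	io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
 *	int fd = open("data", O_RDONLY | O_DIRECT);
 */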
int io_read(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	struct io_async_rw *io;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		io = req->async_data;
		s = &io->s;

		/*
		 * Safe and required to re-import if we're using provided
		 * buffers, as we dropped the selected one before retry.
		 */
		if (io_do_buffer_select(req)) {
			ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
			if (unlikely(ret < 0))
				return ret;
		}

		/*
		 * We come here from an earlier attempt, restore our state to
		 * match in case it doesn't. It's cheap enough that we don't
		 * need to make this conditional.
		 */
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_READ);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req))) {
			ret = io_setup_async_rw(req, iovec, s, true);
			return ret ?: -EAGAIN;
		}
		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}

	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	ret = io_iter_do_read(rw, &s->iter);

	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
		req->flags &= ~REQ_F_REISSUE;
		/* if we can poll, just do that */
		if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
			return -EAGAIN;
		/* IOPOLL retry should happen for io-wq threads */
		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
			goto done;
		/* no retry on NONBLOCK nor RWF_NOWAIT */
		if (req->flags & REQ_F_NOWAIT)
			goto done;
		ret = 0;
	} else if (ret == -EIOCBQUEUED) {
		if (iovec)
			kfree(iovec);
		return IOU_ISSUE_SKIP_COMPLETE;
	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
		/* read all, failed, already did sync or don't want to retry */
		goto done;
	}
	/*
	 * Don't depend on the iter state matching what was consumed, or being
	 * untouched in case of error. Restore it and we'll advance it
	 * manually if we need to.
	 */
	iov_iter_restore(&s->iter, &s->iter_state);

	ret2 = io_setup_async_rw(req, iovec, s, true);
	iovec = NULL;
	if (ret2) {
		ret = ret > 0 ? ret : ret2;
		goto done;
	}

	io = req->async_data;
	s = &io->s;
	/*
	 * Now use our persistent iterator and state, if we aren't already.
	 * We've restored and mapped the iter to match.
	 */

	do {
		/*
		 * We end up here because of a partial read, either from
		 * above or inside this loop. Advance the iter by the bytes
		 * that were consumed.
		 */
		iov_iter_advance(&s->iter, ret);
		if (!iov_iter_count(&s->iter))
			break;
		io->bytes_done += ret;
		iov_iter_save_state(&s->iter, &s->iter_state);

		/* if we can retry, do so with the callbacks armed */
		if (!io_rw_should_retry(req)) {
			kiocb->ki_flags &= ~IOCB_WAITQ;
			return -EAGAIN;
		}

		req->cqe.res = iov_iter_count(&s->iter);
		/*
		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
		 * we get -EIOCBQUEUED, then we'll get a notification when the
		 * desired page gets unlocked. We can also get a partial read
		 * here, and if we do, then just retry at the new offset.
		 */
		ret = io_iter_do_read(rw, &s->iter);
		if (ret == -EIOCBQUEUED)
			return IOU_ISSUE_SKIP_COMPLETE;
		/* we got some bytes, but not all. retry. */
		kiocb->ki_flags &= ~IOCB_WAITQ;
		iov_iter_restore(&s->iter, &s->iter_state);
	} while (ret > 0);
done:
	/* it's faster to check here than to delegate to kfree */
	if (iovec)
		kfree(iovec);
	return kiocb_done(req, ret, issue_flags);
}
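/*
 * Hedged userspace counterpart for io_write() below (liburing names,
 * illustrative only):
 *
 *	io_uring_prep_write(sqe, fd, buf, len, offset);
 *
 * Note that a short write on a regular file or block device is not posted
 * short if it can be finished: the need_complete_io() path below routes
 * the remainder to io-wq, with bytes_done carrying what was already
 * written.
 */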
int io_write(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
	struct io_rw_state __s, *s = &__s;
	struct iovec *iovec;
	struct kiocb *kiocb = &rw->kiocb;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	ssize_t ret, ret2;
	loff_t *ppos;

	if (!req_has_async_data(req)) {
		ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags);
		if (unlikely(ret < 0))
			return ret;
	} else {
		struct io_async_rw *io = req->async_data;

		s = &io->s;
		iov_iter_restore(&s->iter, &s->iter_state);
		iovec = NULL;
	}
	ret = io_rw_init_file(req, FMODE_WRITE);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}
	req->cqe.res = iov_iter_count(&s->iter);

	if (force_nonblock) {
		/* If the file doesn't support async, just async punt */
		if (unlikely(!io_file_supports_nowait(req)))
			goto copy_iov;

		/* File path supports NOWAIT for non-direct IO only for block devices. */
		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
		    !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
		    (req->flags & REQ_F_ISREG))
			goto copy_iov;

		kiocb->ki_flags |= IOCB_NOWAIT;
	} else {
		/* Ensure we clear previously set non-block flag */
		kiocb->ki_flags &= ~IOCB_NOWAIT;
	}
	ppos = io_kiocb_update_pos(req);

	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
	if (unlikely(ret)) {
		kfree(iovec);
		return ret;
	}

	if (req->flags & REQ_F_ISREG)
		kiocb_start_write(kiocb);
	kiocb->ki_flags |= IOCB_WRITE;

	if (likely(req->file->f_op->write_iter))
		ret2 = call_write_iter(req->file, kiocb, &s->iter);
	else if (req->file->f_op->write)
		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
	else
		ret2 = -EINVAL;

	if (req->flags & REQ_F_REISSUE) {
		req->flags &= ~REQ_F_REISSUE;
		ret2 = -EAGAIN;
	}

	/*
	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
	 * retry them without IOCB_NOWAIT.
	 */
	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
		ret2 = -EAGAIN;
	/* no retry on NONBLOCK nor RWF_NOWAIT */
	if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT))
		goto done;
	if (!force_nonblock || ret2 != -EAGAIN) {
		/* IOPOLL retry should happen for io-wq threads */
		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
			goto copy_iov;

		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
			struct io_async_rw *io;

			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
						req->cqe.res, ret2);

			/* This is a partial write. The file pos has already been
			 * updated, setup the async struct to complete the request
			 * in the worker. Also update bytes_done to account for
			 * the bytes already written.
			 */
			iov_iter_save_state(&s->iter, &s->iter_state);
			ret = io_setup_async_rw(req, iovec, s, true);

			io = req->async_data;
			if (io)
				io->bytes_done += ret2;

			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return ret ? ret : -EAGAIN;
		}
done:
		ret = kiocb_done(req, ret2, issue_flags);
	} else {
copy_iov:
		iov_iter_restore(&s->iter, &s->iter_state);
		ret = io_setup_async_rw(req, iovec, s, false);
		if (!ret) {
			if (kiocb->ki_flags & IOCB_WRITE)
				io_req_end_write(req);
			return -EAGAIN;
		}
		return ret;
	}
	/* it's reportedly faster than delegating the null check to kfree() */
	if (iovec)
		kfree(iovec);
	return ret;
}

void io_rw_fail(struct io_kiocb *req)
{
	int res;

	res = io_fixup_rw_res(req, req->cqe.res);
	io_req_set_res(req, res, req->cqe.flags);
}
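/*
 * Reap completions on an IORING_SETup_IOPOLL ring by polling the driver
 * instead of sleeping on completion interrupts. Callers are expected to
 * hold ctx->uring_lock; the return value is the number of completion
 * events posted in this pass.
 */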
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
{
	struct io_wq_work_node *pos, *start, *prev;
	unsigned int poll_flags = 0;
	DEFINE_IO_COMP_BATCH(iob);
	int nr_events = 0;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list.
	 */
	if (ctx->poll_multi_queue || force_nonspin)
		poll_flags |= BLK_POLL_ONESHOT;

	wq_list_for_each(pos, start, &ctx->iopoll_list) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
		struct file *file = req->file;
		int ret;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed))
			break;

		if (req->opcode == IORING_OP_URING_CMD) {
			struct io_uring_cmd *ioucmd;

			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
							   poll_flags);
		} else {
			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);

			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
		}
		if (unlikely(ret < 0))
			return ret;
		else if (ret)
			poll_flags |= BLK_POLL_ONESHOT;

		/* iopoll may have completed current req */
		if (!rq_list_empty(iob.req_list) ||
		    READ_ONCE(req->iopoll_completed))
			break;
	}

	if (!rq_list_empty(iob.req_list))
		iob.complete(&iob);
	else if (!pos)
		return 0;

	prev = start;
	wq_list_for_each_resume(pos, prev) {
		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);

		/* order with io_complete_rw_iopoll(), e.g. ->result updates */
		if (!smp_load_acquire(&req->iopoll_completed))
			break;
		nr_events++;
		req->cqe.flags = io_put_kbuf(req, 0);
	}
	if (unlikely(!nr_events))
		return 0;

	pos = start ? start->next : ctx->iopoll_list.first;
	wq_list_cut(&ctx->iopoll_list, prev, start);

	if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
		return 0;
	ctx->submit_state.compl_reqs.first = pos;
	__io_submit_flush_completions(ctx);
	return nr_events;
}