// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done not only
 * for ordering purposes, but also to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
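 *
 * As an illustrative userspace sketch of the CQ side of this pairing, in
 * the style of the liburing examples (the names below are illustrative
 * only and are not defined in this file):
 *
 *	head = *cq_khead;
 *	tail = smp_load_acquire(cq_ktail);   <- pairs with the kernel's
 *	                                        release store of cq.tail
 *	while (head != tail)
 *		handle_cqe(&cqes[head++ & cq_mask]);
 *	smp_store_release(cq_khead, head);   <- orders the CQE loads before
 *	                                        publishing the new head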
38 * 39 * Copyright (C) 2018-2019 Jens Axboe 40 * Copyright (c) 2018-2019 Christoph Hellwig 41 */ 42#include <linux/kernel.h> 43#include <linux/init.h> 44#include <linux/errno.h> 45#include <linux/syscalls.h> 46#include <linux/compat.h> 47#include <net/compat.h> 48#include <linux/refcount.h> 49#include <linux/uio.h> 50#include <linux/bits.h> 51 52#include <linux/sched/signal.h> 53#include <linux/fs.h> 54#include <linux/file.h> 55#include <linux/fdtable.h> 56#include <linux/mm.h> 57#include <linux/mman.h> 58#include <linux/percpu.h> 59#include <linux/slab.h> 60#include <linux/blkdev.h> 61#include <linux/bvec.h> 62#include <linux/net.h> 63#include <net/sock.h> 64#include <net/af_unix.h> 65#include <net/scm.h> 66#include <linux/anon_inodes.h> 67#include <linux/sched/mm.h> 68#include <linux/uaccess.h> 69#include <linux/nospec.h> 70#include <linux/sizes.h> 71#include <linux/hugetlb.h> 72#include <linux/highmem.h> 73#include <linux/namei.h> 74#include <linux/fsnotify.h> 75#include <linux/fadvise.h> 76#include <linux/eventpoll.h> 77#include <linux/splice.h> 78#include <linux/task_work.h> 79#include <linux/pagemap.h> 80#include <linux/io_uring.h> 81#include <linux/tracehook.h> 82 83#define CREATE_TRACE_POINTS 84#include <trace/events/io_uring.h> 85 86#include <uapi/linux/io_uring.h> 87 88#include "../fs/internal.h" 89#include "io-wq.h" 90 91#define IORING_MAX_ENTRIES 32768 92#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 93#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 94 95/* only define max */ 96#define IORING_MAX_FIXED_FILES (1U << 15) 97#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 98 IORING_REGISTER_LAST + IORING_OP_LAST) 99 100#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) 101#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) 102#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) 103 104#define IORING_MAX_REG_BUFFERS (1U << 14) 105 106#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ 107 IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ 108 IOSQE_BUFFER_SELECT) 109#define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ 110 REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) 111 112#define IO_TCTX_REFS_CACHE_NR (1U << 10) 113 114struct io_uring { 115 u32 head ____cacheline_aligned_in_smp; 116 u32 tail ____cacheline_aligned_in_smp; 117}; 118 119/* 120 * This data is shared with the application through the mmap at offsets 121 * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. 122 * 123 * The offsets to the member fields are published through struct 124 * io_sqring_offsets when calling io_uring_setup. 125 */ 126struct io_rings { 127 /* 128 * Head and tail offsets into the ring; the offsets need to be 129 * masked to get valid indices. 130 * 131 * The kernel controls head of the sq ring and the tail of the cq ring, 132 * and the application controls tail of the sq ring and the head of the 133 * cq ring. 134 */ 135 struct io_uring sq, cq; 136 /* 137 * Bitmasks to apply to head and tail offsets (constant, equals 138 * ring_entries - 1) 139 */ 140 u32 sq_ring_mask, cq_ring_mask; 141 /* Ring sizes (constant, power of 2) */ 142 u32 sq_ring_entries, cq_ring_entries; 143 /* 144 * Number of invalid entries dropped by the kernel due to 145 * invalid index stored in array 146 * 147 * Written by the kernel, shouldn't be modified by the 148 * application (i.e. get number of "new events" by comparing to 149 * cached value). 
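	 *
	 * For example (illustrative only), the application can derive how
	 * many of its submissions were newly dropped since it last looked:
	 *
	 *	newly_dropped = *sq_kdropped - app_cached_dropped;
	 *	app_cached_dropped += newly_dropped;
	 *
	 * where sq_kdropped points at this field in the mmap'ed SQ ring.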
150 * 151 * After a new SQ head value was read by the application this 152 * counter includes all submissions that were dropped reaching 153 * the new SQ head (and possibly more). 154 */ 155 u32 sq_dropped; 156 /* 157 * Runtime SQ flags 158 * 159 * Written by the kernel, shouldn't be modified by the 160 * application. 161 * 162 * The application needs a full memory barrier before checking 163 * for IORING_SQ_NEED_WAKEUP after updating the sq tail. 164 */ 165 u32 sq_flags; 166 /* 167 * Runtime CQ flags 168 * 169 * Written by the application, shouldn't be modified by the 170 * kernel. 171 */ 172 u32 cq_flags; 173 /* 174 * Number of completion events lost because the queue was full; 175 * this should be avoided by the application by making sure 176 * there are not more requests pending than there is space in 177 * the completion queue. 178 * 179 * Written by the kernel, shouldn't be modified by the 180 * application (i.e. get number of "new events" by comparing to 181 * cached value). 182 * 183 * As completion events come in out of order this counter is not 184 * ordered with any other data. 185 */ 186 u32 cq_overflow; 187 /* 188 * Ring buffer of completion events. 189 * 190 * The kernel writes completion events fresh every time they are 191 * produced, so the application is allowed to modify pending 192 * entries. 193 */ 194 struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; 195}; 196 197enum io_uring_cmd_flags { 198 IO_URING_F_NONBLOCK = 1, 199 IO_URING_F_COMPLETE_DEFER = 2, 200}; 201 202struct io_mapped_ubuf { 203 u64 ubuf; 204 u64 ubuf_end; 205 unsigned int nr_bvecs; 206 unsigned long acct_pages; 207 struct bio_vec bvec[]; 208}; 209 210struct io_ring_ctx; 211 212struct io_overflow_cqe { 213 struct io_uring_cqe cqe; 214 struct list_head list; 215}; 216 217struct io_fixed_file { 218 /* file * with additional FFS_* flags */ 219 unsigned long file_ptr; 220}; 221 222struct io_rsrc_put { 223 struct list_head list; 224 u64 tag; 225 union { 226 void *rsrc; 227 struct file *file; 228 struct io_mapped_ubuf *buf; 229 }; 230}; 231 232struct io_file_table { 233 struct io_fixed_file *files; 234}; 235 236struct io_rsrc_node { 237 struct percpu_ref refs; 238 struct list_head node; 239 struct list_head rsrc_list; 240 struct io_rsrc_data *rsrc_data; 241 struct llist_node llist; 242 bool done; 243}; 244 245typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); 246 247struct io_rsrc_data { 248 struct io_ring_ctx *ctx; 249 250 u64 **tags; 251 unsigned int nr; 252 rsrc_put_fn *do_put; 253 atomic_t refs; 254 struct completion done; 255 bool quiesce; 256}; 257 258struct io_buffer { 259 struct list_head list; 260 __u64 addr; 261 __u32 len; 262 __u16 bid; 263}; 264 265struct io_restriction { 266 DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); 267 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 268 u8 sqe_flags_allowed; 269 u8 sqe_flags_required; 270 bool registered; 271}; 272 273enum { 274 IO_SQ_THREAD_SHOULD_STOP = 0, 275 IO_SQ_THREAD_SHOULD_PARK, 276}; 277 278struct io_sq_data { 279 refcount_t refs; 280 atomic_t park_pending; 281 struct mutex lock; 282 283 /* ctx's that are using this sqd */ 284 struct list_head ctx_list; 285 286 struct task_struct *thread; 287 struct wait_queue_head wait; 288 289 unsigned sq_thread_idle; 290 int sq_cpu; 291 pid_t task_pid; 292 pid_t task_tgid; 293 294 unsigned long state; 295 struct completion exited; 296}; 297 298#define IO_COMPL_BATCH 32 299#define IO_REQ_CACHE_SIZE 32 300#define IO_REQ_ALLOC_BATCH 8 301 302struct io_submit_link { 303 struct io_kiocb 
*head; 304 struct io_kiocb *last; 305}; 306 307struct io_submit_state { 308 struct blk_plug plug; 309 struct io_submit_link link; 310 311 /* 312 * io_kiocb alloc cache 313 */ 314 void *reqs[IO_REQ_CACHE_SIZE]; 315 unsigned int free_reqs; 316 317 bool plug_started; 318 319 /* 320 * Batch completion logic 321 */ 322 struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; 323 unsigned int compl_nr; 324 /* inline/task_work completion list, under ->uring_lock */ 325 struct list_head free_list; 326 327 unsigned int ios_left; 328}; 329 330struct io_ring_ctx { 331 /* const or read-mostly hot data */ 332 struct { 333 struct percpu_ref refs; 334 335 struct io_rings *rings; 336 unsigned int flags; 337 unsigned int compat: 1; 338 unsigned int drain_next: 1; 339 unsigned int eventfd_async: 1; 340 unsigned int restricted: 1; 341 unsigned int off_timeout_used: 1; 342 unsigned int drain_active: 1; 343 } ____cacheline_aligned_in_smp; 344 345 /* submission data */ 346 struct { 347 struct mutex uring_lock; 348 349 /* 350 * Ring buffer of indices into array of io_uring_sqe, which is 351 * mmapped by the application using the IORING_OFF_SQES offset. 352 * 353 * This indirection could e.g. be used to assign fixed 354 * io_uring_sqe entries to operations and only submit them to 355 * the queue when needed. 356 * 357 * The kernel modifies neither the indices array nor the entries 358 * array. 359 */ 360 u32 *sq_array; 361 struct io_uring_sqe *sq_sqes; 362 unsigned cached_sq_head; 363 unsigned sq_entries; 364 struct list_head defer_list; 365 366 /* 367 * Fixed resources fast path, should be accessed only under 368 * uring_lock, and updated through io_uring_register(2) 369 */ 370 struct io_rsrc_node *rsrc_node; 371 struct io_file_table file_table; 372 unsigned nr_user_files; 373 unsigned nr_user_bufs; 374 struct io_mapped_ubuf **user_bufs; 375 376 struct io_submit_state submit_state; 377 struct list_head timeout_list; 378 struct list_head ltimeout_list; 379 struct list_head cq_overflow_list; 380 struct xarray io_buffers; 381 struct xarray personalities; 382 u32 pers_next; 383 unsigned sq_thread_idle; 384 } ____cacheline_aligned_in_smp; 385 386 /* IRQ completion list, under ->completion_lock */ 387 struct list_head locked_free_list; 388 unsigned int locked_free_nr; 389 390 const struct cred *sq_creds; /* cred used for __io_sq_thread() */ 391 struct io_sq_data *sq_data; /* if using sq thread polling */ 392 393 struct wait_queue_head sqo_sq_wait; 394 struct list_head sqd_list; 395 396 unsigned long check_cq_overflow; 397 398 struct { 399 unsigned cached_cq_tail; 400 unsigned cq_entries; 401 struct eventfd_ctx *cq_ev_fd; 402 struct wait_queue_head poll_wait; 403 struct wait_queue_head cq_wait; 404 unsigned cq_extra; 405 atomic_t cq_timeouts; 406 unsigned cq_last_tm_flush; 407 } ____cacheline_aligned_in_smp; 408 409 struct { 410 spinlock_t completion_lock; 411 412 spinlock_t timeout_lock; 413 414 /* 415 * ->iopoll_list is protected by the ctx->uring_lock for 416 * io_uring instances that don't use IORING_SETUP_SQPOLL. 417 * For SQPOLL, only the single threaded io_sq_thread() will 418 * manipulate the list, hence no extra locking is needed there. 
	 */
		struct list_head	iopoll_list;
		struct hlist_head	*cancel_hash;
		unsigned		cancel_hash_bits;
		bool			poll_multi_queue;
	} ____cacheline_aligned_in_smp;

	struct io_restriction		restrictions;

	/* slow path rsrc auxiliary data, used by update/register */
	struct {
		struct io_rsrc_node		*rsrc_backup_node;
		struct io_mapped_ubuf		*dummy_ubuf;
		struct io_rsrc_data		*file_data;
		struct io_rsrc_data		*buf_data;

		struct delayed_work		rsrc_put_work;
		struct llist_head		rsrc_put_llist;
		struct list_head		rsrc_ref_list;
		spinlock_t			rsrc_ref_lock;
	};

	/* Keep this last, we don't need it for the fast path */
	struct {
		#if defined(CONFIG_UNIX)
			struct socket		*ring_sock;
		#endif
		/* hashed buffered write serialization */
		struct io_wq_hash		*hash_map;

		/* Only used for accounting purposes */
		struct user_struct		*user;
		struct mm_struct		*mm_account;

		/* ctx exit and cancelation */
		struct llist_head		fallback_llist;
		struct delayed_work		fallback_work;
		struct work_struct		exit_work;
		struct list_head		tctx_list;
		struct completion		ref_comp;
		u32				iowq_limits[2];
		bool				iowq_limits_set;
	};
};

struct io_uring_task {
	/* submission side */
	int			cached_refs;
	struct xarray		xa;
	struct wait_queue_head	wait;
	const struct io_ring_ctx *last;
	struct io_wq		*io_wq;
	struct percpu_counter	inflight;
	atomic_t		inflight_tracked;
	atomic_t		in_idle;

	spinlock_t		task_lock;
	struct io_wq_work_list	task_list;
	struct callback_head	task_work;
	bool			task_running;
};

/*
 * First field must be the file pointer in all the
 * iocb unions! See also 'struct kiocb' in <linux/fs.h>
 */
struct io_poll_iocb {
	struct file			*file;
	struct wait_queue_head		*head;
	__poll_t			events;
	int				retries;
	struct wait_queue_entry		wait;
};

struct io_poll_update {
	struct file			*file;
	u64				old_user_data;
	u64				new_user_data;
	__poll_t			events;
	bool				update_events;
	bool				update_user_data;
};

struct io_close {
	struct file			*file;
	int				fd;
	u32				file_slot;
};

struct io_timeout_data {
	struct io_kiocb			*req;
	struct hrtimer			timer;
	struct timespec64		ts;
	enum hrtimer_mode		mode;
	u32				flags;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_sync {
	struct file			*file;
	loff_t				len;
	loff_t				off;
	int				flags;
	int				mode;
};

struct io_cancel {
	struct file			*file;
	u64				addr;
};

struct io_timeout {
	struct file			*file;
	u32				off;
	u32				target_seq;
	struct list_head		list;
	/* head of the link, used by linked timeouts only */
	struct io_kiocb			*head;
	/* for linked completions */
	struct io_kiocb			*prev;
};

struct io_timeout_rem {
	struct file			*file;
	u64				addr;

	/* timeout update */
	struct timespec64		ts;
	u32				flags;
	bool				ltimeout;
};

struct io_rw {
	/* NOTE: kiocb has the file as the first member, so don't do it here */
	struct kiocb			kiocb;
	u64				addr;
	u64				len;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user
*buf; 578 }; 579 int msg_flags; 580 int bgid; 581 size_t len; 582 size_t done_io; 583 struct io_buffer *kbuf; 584 void __user *msg_control; 585}; 586 587struct io_open { 588 struct file *file; 589 int dfd; 590 u32 file_slot; 591 struct filename *filename; 592 struct open_how how; 593 unsigned long nofile; 594}; 595 596struct io_rsrc_update { 597 struct file *file; 598 u64 arg; 599 u32 nr_args; 600 u32 offset; 601}; 602 603struct io_fadvise { 604 struct file *file; 605 u64 offset; 606 u32 len; 607 u32 advice; 608}; 609 610struct io_madvise { 611 struct file *file; 612 u64 addr; 613 u32 len; 614 u32 advice; 615}; 616 617struct io_epoll { 618 struct file *file; 619 int epfd; 620 int op; 621 int fd; 622 struct epoll_event event; 623}; 624 625struct io_splice { 626 struct file *file_out; 627 loff_t off_out; 628 loff_t off_in; 629 u64 len; 630 int splice_fd_in; 631 unsigned int flags; 632}; 633 634struct io_provide_buf { 635 struct file *file; 636 __u64 addr; 637 __u32 len; 638 __u32 bgid; 639 __u16 nbufs; 640 __u16 bid; 641}; 642 643struct io_statx { 644 struct file *file; 645 int dfd; 646 unsigned int mask; 647 unsigned int flags; 648 const char __user *filename; 649 struct statx __user *buffer; 650}; 651 652struct io_shutdown { 653 struct file *file; 654 int how; 655}; 656 657struct io_rename { 658 struct file *file; 659 int old_dfd; 660 int new_dfd; 661 struct filename *oldpath; 662 struct filename *newpath; 663 int flags; 664}; 665 666struct io_unlink { 667 struct file *file; 668 int dfd; 669 int flags; 670 struct filename *filename; 671}; 672 673struct io_mkdir { 674 struct file *file; 675 int dfd; 676 umode_t mode; 677 struct filename *filename; 678}; 679 680struct io_symlink { 681 struct file *file; 682 int new_dfd; 683 struct filename *oldpath; 684 struct filename *newpath; 685}; 686 687struct io_hardlink { 688 struct file *file; 689 int old_dfd; 690 int new_dfd; 691 struct filename *oldpath; 692 struct filename *newpath; 693 int flags; 694}; 695 696struct io_completion { 697 struct file *file; 698 u32 cflags; 699}; 700 701struct io_async_connect { 702 struct sockaddr_storage address; 703}; 704 705struct io_async_msghdr { 706 struct iovec fast_iov[UIO_FASTIOV]; 707 /* points to an allocated iov, if NULL we use fast_iov instead */ 708 struct iovec *free_iov; 709 struct sockaddr __user *uaddr; 710 struct msghdr msg; 711 struct sockaddr_storage addr; 712}; 713 714struct io_async_rw { 715 struct iovec fast_iov[UIO_FASTIOV]; 716 const struct iovec *free_iovec; 717 struct iov_iter iter; 718 struct iov_iter_state iter_state; 719 size_t bytes_done; 720 struct wait_page_queue wpq; 721}; 722 723enum { 724 REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, 725 REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, 726 REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, 727 REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, 728 REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, 729 REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, 730 731 /* first byte is taken by user flags, shift it to not overlap */ 732 REQ_F_FAIL_BIT = 8, 733 REQ_F_INFLIGHT_BIT, 734 REQ_F_CUR_POS_BIT, 735 REQ_F_NOWAIT_BIT, 736 REQ_F_LINK_TIMEOUT_BIT, 737 REQ_F_NEED_CLEANUP_BIT, 738 REQ_F_POLLED_BIT, 739 REQ_F_BUFFER_SELECTED_BIT, 740 REQ_F_COMPLETE_INLINE_BIT, 741 REQ_F_REISSUE_BIT, 742 REQ_F_CREDS_BIT, 743 REQ_F_REFCOUNT_BIT, 744 REQ_F_ARM_LTIMEOUT_BIT, 745 REQ_F_PARTIAL_IO_BIT, 746 /* keep async read/write and isreg together and in order */ 747 REQ_F_NOWAIT_READ_BIT, 748 REQ_F_NOWAIT_WRITE_BIT, 749 REQ_F_ISREG_BIT, 750 751 /* not a real bit, just to check we're not overflowing 
the space */ 752 __REQ_F_LAST_BIT, 753}; 754 755enum { 756 /* ctx owns file */ 757 REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), 758 /* drain existing IO first */ 759 REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), 760 /* linked sqes */ 761 REQ_F_LINK = BIT(REQ_F_LINK_BIT), 762 /* doesn't sever on completion < 0 */ 763 REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), 764 /* IOSQE_ASYNC */ 765 REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), 766 /* IOSQE_BUFFER_SELECT */ 767 REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), 768 769 /* fail rest of links */ 770 REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), 771 /* on inflight list, should be cancelled and waited on exit reliably */ 772 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 773 /* read/write uses file position */ 774 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), 775 /* must not punt to workers */ 776 REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), 777 /* has or had linked timeout */ 778 REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), 779 /* needs cleanup */ 780 REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), 781 /* already went through poll handler */ 782 REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), 783 /* buffer already selected */ 784 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 785 /* completion is deferred through io_comp_state */ 786 REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), 787 /* caller should reissue async */ 788 REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), 789 /* supports async reads */ 790 REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), 791 /* supports async writes */ 792 REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), 793 /* regular file */ 794 REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), 795 /* has creds assigned */ 796 REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), 797 /* skip refcounting if not set */ 798 REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), 799 /* there is a linked timeout that has to be armed */ 800 REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), 801 /* request has already done partial IO */ 802 REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), 803}; 804 805struct async_poll { 806 struct io_poll_iocb poll; 807 struct io_poll_iocb *double_poll; 808}; 809 810typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); 811 812struct io_task_work { 813 union { 814 struct io_wq_work_node node; 815 struct llist_node fallback_node; 816 }; 817 io_req_tw_func_t func; 818}; 819 820enum { 821 IORING_RSRC_FILE = 0, 822 IORING_RSRC_BUFFER = 1, 823}; 824 825/* 826 * NOTE! Each of the iocb union members has the file pointer 827 * as the first entry in their struct definition. So you can 828 * access the file pointer through any of the sub-structs, 829 * or directly as just 'ki_filp' in this struct. 
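 *
 * For example, all of the following name the same slot at offset 0 of
 * the union below:
 *
 *	req->file == req->rw.kiocb.ki_filp == req->poll.file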
830 */ 831struct io_kiocb { 832 union { 833 struct file *file; 834 struct io_rw rw; 835 struct io_poll_iocb poll; 836 struct io_poll_update poll_update; 837 struct io_accept accept; 838 struct io_sync sync; 839 struct io_cancel cancel; 840 struct io_timeout timeout; 841 struct io_timeout_rem timeout_rem; 842 struct io_connect connect; 843 struct io_sr_msg sr_msg; 844 struct io_open open; 845 struct io_close close; 846 struct io_rsrc_update rsrc_update; 847 struct io_fadvise fadvise; 848 struct io_madvise madvise; 849 struct io_epoll epoll; 850 struct io_splice splice; 851 struct io_provide_buf pbuf; 852 struct io_statx statx; 853 struct io_shutdown shutdown; 854 struct io_rename rename; 855 struct io_unlink unlink; 856 struct io_mkdir mkdir; 857 struct io_symlink symlink; 858 struct io_hardlink hardlink; 859 /* use only after cleaning per-op data, see io_clean_op() */ 860 struct io_completion compl; 861 }; 862 863 /* opcode allocated if it needs to store data for async defer */ 864 void *async_data; 865 u8 opcode; 866 /* polled IO has completed */ 867 u8 iopoll_completed; 868 869 u16 buf_index; 870 u32 result; 871 872 struct io_ring_ctx *ctx; 873 unsigned int flags; 874 atomic_t refs; 875 struct task_struct *task; 876 u64 user_data; 877 878 struct io_kiocb *link; 879 struct percpu_ref *fixed_rsrc_refs; 880 881 /* used with ctx->iopoll_list with reads/writes */ 882 struct list_head inflight_entry; 883 struct io_task_work io_task_work; 884 /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ 885 struct hlist_node hash_node; 886 struct async_poll *apoll; 887 struct io_wq_work work; 888 const struct cred *creds; 889 890 /* store used ubuf, so we can prevent reloading */ 891 struct io_mapped_ubuf *imu; 892 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 893 struct io_buffer *kbuf; 894 atomic_t poll_refs; 895}; 896 897struct io_tctx_node { 898 struct list_head ctx_node; 899 struct task_struct *task; 900 struct io_ring_ctx *ctx; 901}; 902 903struct io_defer_entry { 904 struct list_head list; 905 struct io_kiocb *req; 906 u32 seq; 907}; 908 909struct io_op_def { 910 /* needs req->file assigned */ 911 unsigned needs_file : 1; 912 /* hash wq insertion if file is a regular file */ 913 unsigned hash_reg_file : 1; 914 /* unbound wq insertion if file is a non-regular file */ 915 unsigned unbound_nonreg_file : 1; 916 /* opcode is not supported by this kernel */ 917 unsigned not_supported : 1; 918 /* set if opcode supports polled "wait" */ 919 unsigned pollin : 1; 920 unsigned pollout : 1; 921 /* op supports buffer selection */ 922 unsigned buffer_select : 1; 923 /* do prep async if is going to be punted */ 924 unsigned needs_async_setup : 1; 925 /* should block plug */ 926 unsigned plug : 1; 927 /* size of async data needed, if any */ 928 unsigned short async_size; 929}; 930 931static const struct io_op_def io_op_defs[] = { 932 [IORING_OP_NOP] = {}, 933 [IORING_OP_READV] = { 934 .needs_file = 1, 935 .unbound_nonreg_file = 1, 936 .pollin = 1, 937 .buffer_select = 1, 938 .needs_async_setup = 1, 939 .plug = 1, 940 .async_size = sizeof(struct io_async_rw), 941 }, 942 [IORING_OP_WRITEV] = { 943 .needs_file = 1, 944 .hash_reg_file = 1, 945 .unbound_nonreg_file = 1, 946 .pollout = 1, 947 .needs_async_setup = 1, 948 .plug = 1, 949 .async_size = sizeof(struct io_async_rw), 950 }, 951 [IORING_OP_FSYNC] = { 952 .needs_file = 1, 953 }, 954 [IORING_OP_READ_FIXED] = { 955 .needs_file = 1, 956 .unbound_nonreg_file = 1, 957 .pollin = 1, 958 .plug = 1, 959 .async_size = sizeof(struct 
io_async_rw), 960 }, 961 [IORING_OP_WRITE_FIXED] = { 962 .needs_file = 1, 963 .hash_reg_file = 1, 964 .unbound_nonreg_file = 1, 965 .pollout = 1, 966 .plug = 1, 967 .async_size = sizeof(struct io_async_rw), 968 }, 969 [IORING_OP_POLL_ADD] = { 970 .needs_file = 1, 971 .unbound_nonreg_file = 1, 972 }, 973 [IORING_OP_POLL_REMOVE] = {}, 974 [IORING_OP_SYNC_FILE_RANGE] = { 975 .needs_file = 1, 976 }, 977 [IORING_OP_SENDMSG] = { 978 .needs_file = 1, 979 .unbound_nonreg_file = 1, 980 .pollout = 1, 981 .needs_async_setup = 1, 982 .async_size = sizeof(struct io_async_msghdr), 983 }, 984 [IORING_OP_RECVMSG] = { 985 .needs_file = 1, 986 .unbound_nonreg_file = 1, 987 .pollin = 1, 988 .buffer_select = 1, 989 .needs_async_setup = 1, 990 .async_size = sizeof(struct io_async_msghdr), 991 }, 992 [IORING_OP_TIMEOUT] = { 993 .async_size = sizeof(struct io_timeout_data), 994 }, 995 [IORING_OP_TIMEOUT_REMOVE] = { 996 /* used by timeout updates' prep() */ 997 }, 998 [IORING_OP_ACCEPT] = { 999 .needs_file = 1, 1000 .unbound_nonreg_file = 1, 1001 .pollin = 1, 1002 }, 1003 [IORING_OP_ASYNC_CANCEL] = {}, 1004 [IORING_OP_LINK_TIMEOUT] = { 1005 .async_size = sizeof(struct io_timeout_data), 1006 }, 1007 [IORING_OP_CONNECT] = { 1008 .needs_file = 1, 1009 .unbound_nonreg_file = 1, 1010 .pollout = 1, 1011 .needs_async_setup = 1, 1012 .async_size = sizeof(struct io_async_connect), 1013 }, 1014 [IORING_OP_FALLOCATE] = { 1015 .needs_file = 1, 1016 }, 1017 [IORING_OP_OPENAT] = {}, 1018 [IORING_OP_CLOSE] = {}, 1019 [IORING_OP_FILES_UPDATE] = {}, 1020 [IORING_OP_STATX] = {}, 1021 [IORING_OP_READ] = { 1022 .needs_file = 1, 1023 .unbound_nonreg_file = 1, 1024 .pollin = 1, 1025 .buffer_select = 1, 1026 .plug = 1, 1027 .async_size = sizeof(struct io_async_rw), 1028 }, 1029 [IORING_OP_WRITE] = { 1030 .needs_file = 1, 1031 .hash_reg_file = 1, 1032 .unbound_nonreg_file = 1, 1033 .pollout = 1, 1034 .plug = 1, 1035 .async_size = sizeof(struct io_async_rw), 1036 }, 1037 [IORING_OP_FADVISE] = { 1038 .needs_file = 1, 1039 }, 1040 [IORING_OP_MADVISE] = {}, 1041 [IORING_OP_SEND] = { 1042 .needs_file = 1, 1043 .unbound_nonreg_file = 1, 1044 .pollout = 1, 1045 }, 1046 [IORING_OP_RECV] = { 1047 .needs_file = 1, 1048 .unbound_nonreg_file = 1, 1049 .pollin = 1, 1050 .buffer_select = 1, 1051 }, 1052 [IORING_OP_OPENAT2] = { 1053 }, 1054 [IORING_OP_EPOLL_CTL] = { 1055 .unbound_nonreg_file = 1, 1056 }, 1057 [IORING_OP_SPLICE] = { 1058 .needs_file = 1, 1059 .hash_reg_file = 1, 1060 .unbound_nonreg_file = 1, 1061 }, 1062 [IORING_OP_PROVIDE_BUFFERS] = {}, 1063 [IORING_OP_REMOVE_BUFFERS] = {}, 1064 [IORING_OP_TEE] = { 1065 .needs_file = 1, 1066 .hash_reg_file = 1, 1067 .unbound_nonreg_file = 1, 1068 }, 1069 [IORING_OP_SHUTDOWN] = { 1070 .needs_file = 1, 1071 }, 1072 [IORING_OP_RENAMEAT] = {}, 1073 [IORING_OP_UNLINKAT] = {}, 1074}; 1075 1076/* requests with any of those set should undergo io_disarm_next() */ 1077#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 1078 1079static bool io_disarm_next(struct io_kiocb *req); 1080static void io_uring_del_tctx_node(unsigned long index); 1081static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 1082 struct task_struct *task, 1083 bool cancel_all); 1084static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 1085 1086static void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags); 1087 1088static void io_put_req(struct io_kiocb *req); 1089static void io_put_req_deferred(struct io_kiocb *req); 1090static void io_dismantle_req(struct io_kiocb *req); 
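/*
 * Illustrative sketch only (io_sketch_alloc_async_data is not a helper used
 * elsewhere in this file): the per-opcode table above is what drives setup
 * decisions, e.g. an opcode with a non-zero ->async_size needs that much
 * async data allocated before the request can be prepared for punting.
 */
static inline void *io_sketch_alloc_async_data(struct io_kiocb *req)
{
	const struct io_op_def *def = &io_op_defs[req->opcode];

	if (!def->async_size)
		return NULL;
	return kmalloc(def->async_size, GFP_KERNEL);
}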
1091static void io_queue_linked_timeout(struct io_kiocb *req); 1092static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 1093 struct io_uring_rsrc_update2 *up, 1094 unsigned nr_args); 1095static void io_clean_op(struct io_kiocb *req); 1096static struct file *io_file_get(struct io_ring_ctx *ctx, 1097 struct io_kiocb *req, int fd, bool fixed, 1098 unsigned int issue_flags); 1099static void __io_queue_sqe(struct io_kiocb *req); 1100static void io_rsrc_put_work(struct work_struct *work); 1101 1102static void io_req_task_queue(struct io_kiocb *req); 1103static void io_submit_flush_completions(struct io_ring_ctx *ctx); 1104static int io_req_prep_async(struct io_kiocb *req); 1105 1106static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 1107 unsigned int issue_flags, u32 slot_index); 1108static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); 1109 1110static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); 1111 1112static struct kmem_cache *req_cachep; 1113 1114static const struct file_operations io_uring_fops; 1115 1116struct sock *io_uring_get_socket(struct file *file) 1117{ 1118#if defined(CONFIG_UNIX) 1119 if (file->f_op == &io_uring_fops) { 1120 struct io_ring_ctx *ctx = file->private_data; 1121 1122 return ctx->ring_sock->sk; 1123 } 1124#endif 1125 return NULL; 1126} 1127EXPORT_SYMBOL(io_uring_get_socket); 1128 1129static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) 1130{ 1131 if (!*locked) { 1132 mutex_lock(&ctx->uring_lock); 1133 *locked = true; 1134 } 1135} 1136 1137#define io_for_each_link(pos, head) \ 1138 for (pos = (head); pos; pos = pos->link) 1139 1140/* 1141 * Shamelessly stolen from the mm implementation of page reference checking, 1142 * see commit f958d7b528b1 for details. 
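 *
 * The check defined below is true for refs == 0 and for the 127 values
 * just below the wrap point: e.g. 0 + 127 == 127 (caught),
 * 1 + 127 == 128 (fine), and UINT_MAX + 127 wraps to 126 (caught), so
 * both "already freed" and "about to overflow" trip the same cheap
 * comparison.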
1143 */ 1144#define req_ref_zero_or_close_to_overflow(req) \ 1145 ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) 1146 1147static inline bool req_ref_inc_not_zero(struct io_kiocb *req) 1148{ 1149 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1150 return atomic_inc_not_zero(&req->refs); 1151} 1152 1153static inline bool req_ref_put_and_test(struct io_kiocb *req) 1154{ 1155 if (likely(!(req->flags & REQ_F_REFCOUNT))) 1156 return true; 1157 1158 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1159 return atomic_dec_and_test(&req->refs); 1160} 1161 1162static inline void req_ref_get(struct io_kiocb *req) 1163{ 1164 WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); 1165 WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); 1166 atomic_inc(&req->refs); 1167} 1168 1169static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) 1170{ 1171 if (!(req->flags & REQ_F_REFCOUNT)) { 1172 req->flags |= REQ_F_REFCOUNT; 1173 atomic_set(&req->refs, nr); 1174 } 1175} 1176 1177static inline void io_req_set_refcount(struct io_kiocb *req) 1178{ 1179 __io_req_set_refcount(req, 1); 1180} 1181 1182static inline void io_req_set_rsrc_node(struct io_kiocb *req) 1183{ 1184 struct io_ring_ctx *ctx = req->ctx; 1185 1186 if (!req->fixed_rsrc_refs) { 1187 req->fixed_rsrc_refs = &ctx->rsrc_node->refs; 1188 percpu_ref_get(req->fixed_rsrc_refs); 1189 } 1190} 1191 1192static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) 1193{ 1194 bool got = percpu_ref_tryget(ref); 1195 1196 /* already at zero, wait for ->release() */ 1197 if (!got) 1198 wait_for_completion(compl); 1199 percpu_ref_resurrect(ref); 1200 if (got) 1201 percpu_ref_put(ref); 1202} 1203 1204static bool io_match_task(struct io_kiocb *head, struct task_struct *task, 1205 bool cancel_all) 1206 __must_hold(&req->ctx->timeout_lock) 1207{ 1208 struct io_kiocb *req; 1209 1210 if (task && head->task != task) 1211 return false; 1212 if (cancel_all) 1213 return true; 1214 1215 io_for_each_link(req, head) { 1216 if (req->flags & REQ_F_INFLIGHT) 1217 return true; 1218 } 1219 return false; 1220} 1221 1222static bool io_match_linked(struct io_kiocb *head) 1223{ 1224 struct io_kiocb *req; 1225 1226 io_for_each_link(req, head) { 1227 if (req->flags & REQ_F_INFLIGHT) 1228 return true; 1229 } 1230 return false; 1231} 1232 1233/* 1234 * As io_match_task() but protected against racing with linked timeouts. 1235 * User must not hold timeout_lock. 
1236 */ 1237static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 1238 bool cancel_all) 1239{ 1240 bool matched; 1241 1242 if (task && head->task != task) 1243 return false; 1244 if (cancel_all) 1245 return true; 1246 1247 if (head->flags & REQ_F_LINK_TIMEOUT) { 1248 struct io_ring_ctx *ctx = head->ctx; 1249 1250 /* protect against races with linked timeouts */ 1251 spin_lock_irq(&ctx->timeout_lock); 1252 matched = io_match_linked(head); 1253 spin_unlock_irq(&ctx->timeout_lock); 1254 } else { 1255 matched = io_match_linked(head); 1256 } 1257 return matched; 1258} 1259 1260static inline void req_set_fail(struct io_kiocb *req) 1261{ 1262 req->flags |= REQ_F_FAIL; 1263} 1264 1265static inline void req_fail_link_node(struct io_kiocb *req, int res) 1266{ 1267 req_set_fail(req); 1268 req->result = res; 1269} 1270 1271static void io_ring_ctx_ref_free(struct percpu_ref *ref) 1272{ 1273 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 1274 1275 complete(&ctx->ref_comp); 1276} 1277 1278static inline bool io_is_timeout_noseq(struct io_kiocb *req) 1279{ 1280 return !req->timeout.off; 1281} 1282 1283static void io_fallback_req_func(struct work_struct *work) 1284{ 1285 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 1286 fallback_work.work); 1287 struct llist_node *node = llist_del_all(&ctx->fallback_llist); 1288 struct io_kiocb *req, *tmp; 1289 bool locked = false; 1290 1291 percpu_ref_get(&ctx->refs); 1292 llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) 1293 req->io_task_work.func(req, &locked); 1294 1295 if (locked) { 1296 if (ctx->submit_state.compl_nr) 1297 io_submit_flush_completions(ctx); 1298 mutex_unlock(&ctx->uring_lock); 1299 } 1300 percpu_ref_put(&ctx->refs); 1301 1302} 1303 1304static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) 1305{ 1306 struct io_ring_ctx *ctx; 1307 int hash_bits; 1308 1309 ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1310 if (!ctx) 1311 return NULL; 1312 1313 /* 1314 * Use 5 bits less than the max cq entries, that should give us around 1315 * 32 entries per hash list if totally full and uniformly spread. 
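	 *
	 * For example, with p->cq_entries == 4096: ilog2(4096) == 12, minus 5
	 * gives 7 hash bits, i.e. 128 hash lists, and 4096 / 128 == 32 entries
	 * per list when completely full.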
1316 */ 1317 hash_bits = ilog2(p->cq_entries); 1318 hash_bits -= 5; 1319 if (hash_bits <= 0) 1320 hash_bits = 1; 1321 ctx->cancel_hash_bits = hash_bits; 1322 ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), 1323 GFP_KERNEL); 1324 if (!ctx->cancel_hash) 1325 goto err; 1326 __hash_init(ctx->cancel_hash, 1U << hash_bits); 1327 1328 ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); 1329 if (!ctx->dummy_ubuf) 1330 goto err; 1331 /* set invalid range, so io_import_fixed() fails meeting it */ 1332 ctx->dummy_ubuf->ubuf = -1UL; 1333 1334 if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 1335 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) 1336 goto err; 1337 1338 ctx->flags = p->flags; 1339 init_waitqueue_head(&ctx->sqo_sq_wait); 1340 INIT_LIST_HEAD(&ctx->sqd_list); 1341 init_waitqueue_head(&ctx->poll_wait); 1342 INIT_LIST_HEAD(&ctx->cq_overflow_list); 1343 init_completion(&ctx->ref_comp); 1344 xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); 1345 xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); 1346 mutex_init(&ctx->uring_lock); 1347 init_waitqueue_head(&ctx->cq_wait); 1348 spin_lock_init(&ctx->completion_lock); 1349 spin_lock_init(&ctx->timeout_lock); 1350 INIT_LIST_HEAD(&ctx->iopoll_list); 1351 INIT_LIST_HEAD(&ctx->defer_list); 1352 INIT_LIST_HEAD(&ctx->timeout_list); 1353 INIT_LIST_HEAD(&ctx->ltimeout_list); 1354 spin_lock_init(&ctx->rsrc_ref_lock); 1355 INIT_LIST_HEAD(&ctx->rsrc_ref_list); 1356 INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); 1357 init_llist_head(&ctx->rsrc_put_llist); 1358 INIT_LIST_HEAD(&ctx->tctx_list); 1359 INIT_LIST_HEAD(&ctx->submit_state.free_list); 1360 INIT_LIST_HEAD(&ctx->locked_free_list); 1361 INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); 1362 return ctx; 1363err: 1364 kfree(ctx->dummy_ubuf); 1365 kfree(ctx->cancel_hash); 1366 kfree(ctx); 1367 return NULL; 1368} 1369 1370static void io_account_cq_overflow(struct io_ring_ctx *ctx) 1371{ 1372 struct io_rings *r = ctx->rings; 1373 1374 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); 1375 ctx->cq_extra--; 1376} 1377 1378static bool req_need_defer(struct io_kiocb *req, u32 seq) 1379{ 1380 if (unlikely(req->flags & REQ_F_IO_DRAIN)) { 1381 struct io_ring_ctx *ctx = req->ctx; 1382 1383 return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; 1384 } 1385 1386 return false; 1387} 1388 1389#define FFS_ASYNC_READ 0x1UL 1390#define FFS_ASYNC_WRITE 0x2UL 1391#ifdef CONFIG_64BIT 1392#define FFS_ISREG 0x4UL 1393#else 1394#define FFS_ISREG 0x0UL 1395#endif 1396#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) 1397 1398static inline bool io_req_ffs_set(struct io_kiocb *req) 1399{ 1400 return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); 1401} 1402 1403static void io_req_track_inflight(struct io_kiocb *req) 1404{ 1405 if (!(req->flags & REQ_F_INFLIGHT)) { 1406 req->flags |= REQ_F_INFLIGHT; 1407 atomic_inc(&req->task->io_uring->inflight_tracked); 1408 } 1409} 1410 1411static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) 1412{ 1413 if (WARN_ON_ONCE(!req->link)) 1414 return NULL; 1415 1416 req->flags &= ~REQ_F_ARM_LTIMEOUT; 1417 req->flags |= REQ_F_LINK_TIMEOUT; 1418 1419 /* linked timeouts should have two refs once prep'ed */ 1420 io_req_set_refcount(req); 1421 __io_req_set_refcount(req->link, 2); 1422 return req->link; 1423} 1424 1425static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) 1426{ 1427 if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) 1428 return NULL; 1429 return 
__io_prep_linked_timeout(req); 1430} 1431 1432static void io_prep_async_work(struct io_kiocb *req) 1433{ 1434 const struct io_op_def *def = &io_op_defs[req->opcode]; 1435 struct io_ring_ctx *ctx = req->ctx; 1436 1437 if (!(req->flags & REQ_F_CREDS)) { 1438 req->flags |= REQ_F_CREDS; 1439 req->creds = get_current_cred(); 1440 } 1441 1442 req->work.list.next = NULL; 1443 req->work.flags = 0; 1444 if (req->flags & REQ_F_FORCE_ASYNC) 1445 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1446 1447 if (req->flags & REQ_F_ISREG) { 1448 if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) 1449 io_wq_hash_work(&req->work, file_inode(req->file)); 1450 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 1451 if (def->unbound_nonreg_file) 1452 req->work.flags |= IO_WQ_WORK_UNBOUND; 1453 } 1454} 1455 1456static void io_prep_async_link(struct io_kiocb *req) 1457{ 1458 struct io_kiocb *cur; 1459 1460 if (req->flags & REQ_F_LINK_TIMEOUT) { 1461 struct io_ring_ctx *ctx = req->ctx; 1462 1463 spin_lock_irq(&ctx->timeout_lock); 1464 io_for_each_link(cur, req) 1465 io_prep_async_work(cur); 1466 spin_unlock_irq(&ctx->timeout_lock); 1467 } else { 1468 io_for_each_link(cur, req) 1469 io_prep_async_work(cur); 1470 } 1471} 1472 1473static void io_queue_async_work(struct io_kiocb *req, bool *locked) 1474{ 1475 struct io_ring_ctx *ctx = req->ctx; 1476 struct io_kiocb *link = io_prep_linked_timeout(req); 1477 struct io_uring_task *tctx = req->task->io_uring; 1478 1479 /* must not take the lock, NULL it as a precaution */ 1480 locked = NULL; 1481 1482 BUG_ON(!tctx); 1483 BUG_ON(!tctx->io_wq); 1484 1485 /* init ->work of the whole link before punting */ 1486 io_prep_async_link(req); 1487 1488 /* 1489 * Not expected to happen, but if we do have a bug where this _can_ 1490 * happen, catch it here and ensure the request is marked as 1491 * canceled. That will make io-wq go through the usual work cancel 1492 * procedure rather than attempt to run this request (or create a new 1493 * worker for it). 
1494 */ 1495 if (WARN_ON_ONCE(!same_thread_group(req->task, current))) 1496 req->work.flags |= IO_WQ_WORK_CANCEL; 1497 1498 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1499 &req->work, req->flags); 1500 io_wq_enqueue(tctx->io_wq, &req->work); 1501 if (link) 1502 io_queue_linked_timeout(link); 1503} 1504 1505static void io_kill_timeout(struct io_kiocb *req, int status) 1506 __must_hold(&req->ctx->completion_lock) 1507 __must_hold(&req->ctx->timeout_lock) 1508{ 1509 struct io_timeout_data *io = req->async_data; 1510 1511 if (hrtimer_try_to_cancel(&io->timer) != -1) { 1512 if (status) 1513 req_set_fail(req); 1514 atomic_set(&req->ctx->cq_timeouts, 1515 atomic_read(&req->ctx->cq_timeouts) + 1); 1516 list_del_init(&req->timeout.list); 1517 io_fill_cqe_req(req, status, 0); 1518 io_put_req_deferred(req); 1519 } 1520} 1521 1522static void io_queue_deferred(struct io_ring_ctx *ctx) 1523{ 1524 lockdep_assert_held(&ctx->completion_lock); 1525 1526 while (!list_empty(&ctx->defer_list)) { 1527 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 1528 struct io_defer_entry, list); 1529 1530 if (req_need_defer(de->req, de->seq)) 1531 break; 1532 list_del_init(&de->list); 1533 io_req_task_queue(de->req); 1534 kfree(de); 1535 } 1536} 1537 1538static void io_flush_timeouts(struct io_ring_ctx *ctx) 1539 __must_hold(&ctx->completion_lock) 1540{ 1541 u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 1542 struct io_kiocb *req, *tmp; 1543 1544 spin_lock_irq(&ctx->timeout_lock); 1545 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 1546 u32 events_needed, events_got; 1547 1548 if (io_is_timeout_noseq(req)) 1549 break; 1550 1551 /* 1552 * Since seq can easily wrap around over time, subtract 1553 * the last seq at which timeouts were flushed before comparing. 1554 * Assuming not more than 2^31-1 events have happened since, 1555 * these subtractions won't have wrapped, so we can check if 1556 * target is in [last_seq, current_seq] by comparing the two. 
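		 *
		 * For example, with u32 arithmetic, cq_last_tm_flush == 0xfffffff0,
		 * target_seq == 0x10 and seq == 0x8: events_needed == 0x20 and
		 * events_got == 0x18, so the timeout correctly stays armed even
		 * though both sequence values have numerically wrapped past the
		 * last flush point.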
		 */
		events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush;
		events_got = seq - ctx->cq_last_tm_flush;
		if (events_got < events_needed)
			break;

		io_kill_timeout(req, 0);
	}
	ctx->cq_last_tm_flush = seq;
	spin_unlock_irq(&ctx->timeout_lock);
}

static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
	if (ctx->off_timeout_used)
		io_flush_timeouts(ctx);
	if (ctx->drain_active)
		io_queue_deferred(ctx);
}

static inline bool io_commit_needs_flush(struct io_ring_ctx *ctx)
{
	return ctx->off_timeout_used || ctx->drain_active;
}

static inline void __io_commit_cqring(struct io_ring_ctx *ctx)
{
	/* order cqe stores with ring update */
	smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}

static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
	if (unlikely(io_commit_needs_flush(ctx)))
		__io_commit_cqring_flush(ctx);
	__io_commit_cqring(ctx);
}

static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
	struct io_rings *r = ctx->rings;

	return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries;
}

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{
	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
}

static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
	struct io_rings *rings = ctx->rings;
	unsigned tail, mask = ctx->cq_entries - 1;

	/*
	 * writes to the cq entry need to come after reading head; the
	 * control dependency is enough as we're using WRITE_ONCE to
	 * fill the cq entry
	 */
	if (__io_cqring_events(ctx) == ctx->cq_entries)
		return NULL;

	tail = ctx->cached_cq_tail++;
	return &rings->cqes[tail & mask];
}

static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
	if (likely(!ctx->cq_ev_fd))
		return false;
	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
		return false;
	return !ctx->eventfd_async || io_wq_current_is_worker();
}

/*
 * This should only get called when at least one event has been posted.
 * Some applications rely on the eventfd notification count only changing
 * IFF a new CQE has been added to the CQ ring. There's no dependency on
 * a 1:1 relationship between how many times this function is called (and
 * hence the eventfd count) and the number of CQEs posted to the CQ ring.
 */
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
	/*
	 * wake_up_all() may seem excessive, but io_wake_function() and
	 * io_should_wake() handle the termination of the loop and only
	 * wake as many waiters as we need to.
	 */
	if (wq_has_sleeper(&ctx->cq_wait))
		__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
			  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
	if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	if (io_should_trigger_evfd(ctx))
		eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
	if (waitqueue_active(&ctx->poll_wait))
		__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
			  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
	/* see waitqueue_active() comment */
	smp_mb();

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (waitqueue_active(&ctx->cq_wait))
			__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
				  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
	}
	if (io_should_trigger_evfd(ctx))
		eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE);
	if (waitqueue_active(&ctx->poll_wait))
		__wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0,
			  poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}

/* Returns true if there are no backlogged entries after the flush */
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
	bool all_flushed, posted;

	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
		return false;

	posted = false;
	spin_lock(&ctx->completion_lock);
	while (!list_empty(&ctx->cq_overflow_list)) {
		struct io_uring_cqe *cqe = io_get_cqe(ctx);
		struct io_overflow_cqe *ocqe;

		if (!cqe && !force)
			break;
		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
		if (cqe)
			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
		else
			io_account_cq_overflow(ctx);

		posted = true;
		list_del(&ocqe->list);
		kfree(ocqe);
	}

	all_flushed = list_empty(&ctx->cq_overflow_list);
	if (all_flushed) {
		clear_bit(0, &ctx->check_cq_overflow);
		WRITE_ONCE(ctx->rings->sq_flags,
			   ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
	}

	if (posted)
		io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);
	if (posted)
		io_cqring_ev_posted(ctx);
	return all_flushed;
}

static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
{
	bool ret = true;

	if (test_bit(0, &ctx->check_cq_overflow)) {
		/* iopoll syncs against uring_lock, not completion_lock */
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_lock(&ctx->uring_lock);
		ret = __io_cqring_overflow_flush(ctx, false);
		if (ctx->flags & IORING_SETUP_IOPOLL)
			mutex_unlock(&ctx->uring_lock);
	}

	return ret;
}

/* must be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
	struct io_uring_task *tctx = task->io_uring;

	if (likely(task == current)) {
		tctx->cached_refs += nr;
	} else {
		percpu_counter_sub(&tctx->inflight, nr);
		if (unlikely(atomic_read(&tctx->in_idle)))
			wake_up(&tctx->wait);
		put_task_struct_many(task, nr);
	}
}

static void io_task_refs_refill(struct io_uring_task *tctx)
{
	unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;

	percpu_counter_add(&tctx->inflight, refill);
	refcount_add(refill, &current->usage);
	tctx->cached_refs += refill;
}

static inline void io_get_task_refs(int nr)
{
	struct io_uring_task *tctx =
current->io_uring; 1762 1763 tctx->cached_refs -= nr; 1764 if (unlikely(tctx->cached_refs < 0)) 1765 io_task_refs_refill(tctx); 1766} 1767 1768static __cold void io_uring_drop_tctx_refs(struct task_struct *task) 1769{ 1770 struct io_uring_task *tctx = task->io_uring; 1771 unsigned int refs = tctx->cached_refs; 1772 1773 if (refs) { 1774 tctx->cached_refs = 0; 1775 percpu_counter_sub(&tctx->inflight, refs); 1776 put_task_struct_many(task, refs); 1777 } 1778} 1779 1780static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, 1781 s32 res, u32 cflags) 1782{ 1783 struct io_overflow_cqe *ocqe; 1784 1785 ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT); 1786 if (!ocqe) { 1787 /* 1788 * If we're in ring overflow flush mode, or in task cancel mode, 1789 * or cannot allocate an overflow entry, then we need to drop it 1790 * on the floor. 1791 */ 1792 io_account_cq_overflow(ctx); 1793 return false; 1794 } 1795 if (list_empty(&ctx->cq_overflow_list)) { 1796 set_bit(0, &ctx->check_cq_overflow); 1797 WRITE_ONCE(ctx->rings->sq_flags, 1798 ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); 1799 1800 } 1801 ocqe->cqe.user_data = user_data; 1802 ocqe->cqe.res = res; 1803 ocqe->cqe.flags = cflags; 1804 list_add_tail(&ocqe->list, &ctx->cq_overflow_list); 1805 return true; 1806} 1807 1808static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, 1809 s32 res, u32 cflags) 1810{ 1811 struct io_uring_cqe *cqe; 1812 1813 trace_io_uring_complete(ctx, user_data, res, cflags); 1814 1815 /* 1816 * If we can't get a cq entry, userspace overflowed the 1817 * submission (by quite a lot). Increment the overflow count in 1818 * the ring. 1819 */ 1820 cqe = io_get_cqe(ctx); 1821 if (likely(cqe)) { 1822 WRITE_ONCE(cqe->user_data, user_data); 1823 WRITE_ONCE(cqe->res, res); 1824 WRITE_ONCE(cqe->flags, cflags); 1825 return true; 1826 } 1827 return io_cqring_event_overflow(ctx, user_data, res, cflags); 1828} 1829 1830static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) 1831{ 1832 __io_fill_cqe(req->ctx, req->user_data, res, cflags); 1833} 1834 1835static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, 1836 s32 res, u32 cflags) 1837{ 1838 ctx->cq_extra++; 1839 return __io_fill_cqe(ctx, user_data, res, cflags); 1840} 1841 1842static void io_req_complete_post(struct io_kiocb *req, s32 res, 1843 u32 cflags) 1844{ 1845 struct io_ring_ctx *ctx = req->ctx; 1846 1847 spin_lock(&ctx->completion_lock); 1848 __io_fill_cqe(ctx, req->user_data, res, cflags); 1849 /* 1850 * If we're the last reference to this request, add to our locked 1851 * free_list cache. 
	 */
	if (req_ref_put_and_test(req)) {
		if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) {
			if (req->flags & IO_DISARM_MASK)
				io_disarm_next(req);
			if (req->link) {
				io_req_task_queue(req->link);
				req->link = NULL;
			}
		}
		io_dismantle_req(req);
		io_put_task(req->task, 1);
		list_add(&req->inflight_entry, &ctx->locked_free_list);
		ctx->locked_free_nr++;
	} else {
		if (!percpu_ref_tryget(&ctx->refs))
			req = NULL;
	}
	io_commit_cqring(ctx);
	spin_unlock(&ctx->completion_lock);

	if (req) {
		io_cqring_ev_posted(ctx);
		percpu_ref_put(&ctx->refs);
	}
}

static inline bool io_req_needs_clean(struct io_kiocb *req)
{
	return req->flags & IO_REQ_CLEAN_FLAGS;
}

static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
					 u32 cflags)
{
	if (io_req_needs_clean(req))
		io_clean_op(req);
	req->result = res;
	req->compl.cflags = cflags;
	req->flags |= REQ_F_COMPLETE_INLINE;
}

static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
				     s32 res, u32 cflags)
{
	if (issue_flags & IO_URING_F_COMPLETE_DEFER)
		io_req_complete_state(req, res, cflags);
	else
		io_req_complete_post(req, res, cflags);
}

static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
	__io_req_complete(req, 0, res, 0);
}

static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
	req_set_fail(req);
	io_req_complete_post(req, res, 0);
}

static void io_req_complete_fail_submit(struct io_kiocb *req)
{
	/*
	 * We don't submit; fail all requests in the chain. For that, replace
	 * hardlinks with normal links. An extra REQ_F_LINK is tolerated.
	 */
	req->flags &= ~REQ_F_HARDLINK;
	req->flags |= REQ_F_LINK;
	io_req_complete_failed(req, req->result);
}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{
	req->ctx = ctx;
	req->link = NULL;
	req->async_data = NULL;
	/* not necessary, but safer to zero */
	req->result = 0;
}

static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
					struct io_submit_state *state)
{
	spin_lock(&ctx->completion_lock);
	list_splice_init(&ctx->locked_free_list, &state->free_list);
	ctx->locked_free_nr = 0;
	spin_unlock(&ctx->completion_lock);
}

/* Returns true IFF there are requests in the cache */
static bool io_flush_cached_reqs(struct io_ring_ctx *ctx)
{
	struct io_submit_state *state = &ctx->submit_state;
	int nr;

	/*
	 * If we have more than a batch's worth of requests in our IRQ side
	 * locked cache, grab the lock and move them over to our submission
	 * side cache.
1957 */ 1958 if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) 1959 io_flush_cached_locked_reqs(ctx, state); 1960 1961 nr = state->free_reqs; 1962 while (!list_empty(&state->free_list)) { 1963 struct io_kiocb *req = list_first_entry(&state->free_list, 1964 struct io_kiocb, inflight_entry); 1965 1966 list_del(&req->inflight_entry); 1967 state->reqs[nr++] = req; 1968 if (nr == ARRAY_SIZE(state->reqs)) 1969 break; 1970 } 1971 1972 state->free_reqs = nr; 1973 return nr != 0; 1974} 1975 1976/* 1977 * A request might get retired back into the request caches even before opcode 1978 * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 1979 * Because of that, io_alloc_req() should be called only under ->uring_lock 1980 * and with extra caution to not get a request that is still worked on. 1981 */ 1982static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) 1983 __must_hold(&ctx->uring_lock) 1984{ 1985 struct io_submit_state *state = &ctx->submit_state; 1986 gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1987 int ret, i; 1988 1989 BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); 1990 1991 if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) 1992 goto got_req; 1993 1994 ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, 1995 state->reqs); 1996 1997 /* 1998 * Bulk alloc is all-or-nothing. If we fail to get a batch, 1999 * retry single alloc to be on the safe side. 2000 */ 2001 if (unlikely(ret <= 0)) { 2002 state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); 2003 if (!state->reqs[0]) 2004 return NULL; 2005 ret = 1; 2006 } 2007 2008 for (i = 0; i < ret; i++) 2009 io_preinit_req(state->reqs[i], ctx); 2010 state->free_reqs = ret; 2011got_req: 2012 state->free_reqs--; 2013 return state->reqs[state->free_reqs]; 2014} 2015 2016static inline void io_put_file(struct file *file) 2017{ 2018 if (file) 2019 fput(file); 2020} 2021 2022static void io_dismantle_req(struct io_kiocb *req) 2023{ 2024 unsigned int flags = req->flags; 2025 2026 if (io_req_needs_clean(req)) 2027 io_clean_op(req); 2028 if (!(flags & REQ_F_FIXED_FILE)) 2029 io_put_file(req->file); 2030 if (req->fixed_rsrc_refs) 2031 percpu_ref_put(req->fixed_rsrc_refs); 2032 if (req->async_data) { 2033 kfree(req->async_data); 2034 req->async_data = NULL; 2035 } 2036} 2037 2038static void __io_free_req(struct io_kiocb *req) 2039{ 2040 struct io_ring_ctx *ctx = req->ctx; 2041 2042 io_dismantle_req(req); 2043 io_put_task(req->task, 1); 2044 2045 spin_lock(&ctx->completion_lock); 2046 list_add(&req->inflight_entry, &ctx->locked_free_list); 2047 ctx->locked_free_nr++; 2048 spin_unlock(&ctx->completion_lock); 2049 2050 percpu_ref_put(&ctx->refs); 2051} 2052 2053static inline void io_remove_next_linked(struct io_kiocb *req) 2054{ 2055 struct io_kiocb *nxt = req->link; 2056 2057 req->link = nxt->link; 2058 nxt->link = NULL; 2059} 2060 2061static bool io_kill_linked_timeout(struct io_kiocb *req) 2062 __must_hold(&req->ctx->completion_lock) 2063 __must_hold(&req->ctx->timeout_lock) 2064{ 2065 struct io_kiocb *link = req->link; 2066 2067 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2068 struct io_timeout_data *io = link->async_data; 2069 2070 io_remove_next_linked(req); 2071 link->timeout.head = NULL; 2072 if (hrtimer_try_to_cancel(&io->timer) != -1) { 2073 list_del(&link->timeout.list); 2074 io_fill_cqe_req(link, -ECANCELED, 0); 2075 io_put_req_deferred(link); 2076 return true; 2077 } 2078 } 2079 return false; 2080} 2081 2082static void io_fail_links(struct io_kiocb *req) 2083 
__must_hold(&req->ctx->completion_lock) 2084{ 2085 struct io_kiocb *nxt, *link = req->link; 2086 2087 req->link = NULL; 2088 while (link) { 2089 long res = -ECANCELED; 2090 2091 if (link->flags & REQ_F_FAIL) 2092 res = link->result; 2093 2094 nxt = link->link; 2095 link->link = NULL; 2096 2097 trace_io_uring_fail_link(req, link); 2098 io_fill_cqe_req(link, res, 0); 2099 io_put_req_deferred(link); 2100 link = nxt; 2101 } 2102} 2103 2104static bool io_disarm_next(struct io_kiocb *req) 2105 __must_hold(&req->ctx->completion_lock) 2106{ 2107 bool posted = false; 2108 2109 if (req->flags & REQ_F_ARM_LTIMEOUT) { 2110 struct io_kiocb *link = req->link; 2111 2112 req->flags &= ~REQ_F_ARM_LTIMEOUT; 2113 if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { 2114 io_remove_next_linked(req); 2115 io_fill_cqe_req(link, -ECANCELED, 0); 2116 io_put_req_deferred(link); 2117 posted = true; 2118 } 2119 } else if (req->flags & REQ_F_LINK_TIMEOUT) { 2120 struct io_ring_ctx *ctx = req->ctx; 2121 2122 spin_lock_irq(&ctx->timeout_lock); 2123 posted = io_kill_linked_timeout(req); 2124 spin_unlock_irq(&ctx->timeout_lock); 2125 } 2126 if (unlikely((req->flags & REQ_F_FAIL) && 2127 !(req->flags & REQ_F_HARDLINK))) { 2128 posted |= (req->link != NULL); 2129 io_fail_links(req); 2130 } 2131 return posted; 2132} 2133 2134static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) 2135{ 2136 struct io_kiocb *nxt; 2137 2138 /* 2139 * If LINK is set, we have dependent requests in this chain. If we 2140 * didn't fail this request, queue the first one up, moving any other 2141 * dependencies to the next request. In case of failure, fail the rest 2142 * of the chain. 2143 */ 2144 if (req->flags & IO_DISARM_MASK) { 2145 struct io_ring_ctx *ctx = req->ctx; 2146 bool posted; 2147 2148 spin_lock(&ctx->completion_lock); 2149 posted = io_disarm_next(req); 2150 if (posted) 2151 io_commit_cqring(req->ctx); 2152 spin_unlock(&ctx->completion_lock); 2153 if (posted) 2154 io_cqring_ev_posted(ctx); 2155 } 2156 nxt = req->link; 2157 req->link = NULL; 2158 return nxt; 2159} 2160 2161static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) 2162{ 2163 if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) 2164 return NULL; 2165 return __io_req_find_next(req); 2166} 2167 2168static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) 2169{ 2170 if (!ctx) 2171 return; 2172 if (*locked) { 2173 if (ctx->submit_state.compl_nr) 2174 io_submit_flush_completions(ctx); 2175 mutex_unlock(&ctx->uring_lock); 2176 *locked = false; 2177 } 2178 percpu_ref_put(&ctx->refs); 2179} 2180 2181static void tctx_task_work(struct callback_head *cb) 2182{ 2183 bool locked = false; 2184 struct io_ring_ctx *ctx = NULL; 2185 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, 2186 task_work); 2187 2188 while (1) { 2189 struct io_wq_work_node *node; 2190 2191 if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) 2192 io_submit_flush_completions(ctx); 2193 2194 spin_lock_irq(&tctx->task_lock); 2195 node = tctx->task_list.first; 2196 INIT_WQ_LIST(&tctx->task_list); 2197 if (!node) 2198 tctx->task_running = false; 2199 spin_unlock_irq(&tctx->task_lock); 2200 if (!node) 2201 break; 2202 2203 do { 2204 struct io_wq_work_node *next = node->next; 2205 struct io_kiocb *req = container_of(node, struct io_kiocb, 2206 io_task_work.node); 2207 2208 if (req->ctx != ctx) { 2209 ctx_flush_and_put(ctx, &locked); 2210 ctx = req->ctx; 2211 /* if not contended, grab and improve batching */ 2212 locked = 
mutex_trylock(&ctx->uring_lock); 2213 percpu_ref_get(&ctx->refs); 2214 } 2215 req->io_task_work.func(req, &locked); 2216 node = next; 2217 if (unlikely(need_resched())) { 2218 ctx_flush_and_put(ctx, &locked); 2219 ctx = NULL; 2220 cond_resched(); 2221 } 2222 } while (node); 2223 } 2224 2225 ctx_flush_and_put(ctx, &locked); 2226 2227 /* relaxed read is enough as only the task itself sets ->in_idle */ 2228 if (unlikely(atomic_read(&tctx->in_idle))) 2229 io_uring_drop_tctx_refs(current); 2230} 2231 2232static void io_req_task_work_add(struct io_kiocb *req) 2233{ 2234 struct task_struct *tsk = req->task; 2235 struct io_uring_task *tctx = tsk->io_uring; 2236 enum task_work_notify_mode notify; 2237 struct io_wq_work_node *node; 2238 unsigned long flags; 2239 bool running; 2240 2241 WARN_ON_ONCE(!tctx); 2242 2243 spin_lock_irqsave(&tctx->task_lock, flags); 2244 wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); 2245 running = tctx->task_running; 2246 if (!running) 2247 tctx->task_running = true; 2248 spin_unlock_irqrestore(&tctx->task_lock, flags); 2249 2250 /* task_work already pending, we're done */ 2251 if (running) 2252 return; 2253 2254 /* 2255 * SQPOLL kernel thread doesn't need notification, just a wakeup. For 2256 * all other cases, use TWA_SIGNAL unconditionally to ensure we're 2257 * processing task_work. There's no reliable way to tell if TWA_RESUME 2258 * will do the job. 2259 */ 2260 notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; 2261 if (!task_work_add(tsk, &tctx->task_work, notify)) { 2262 wake_up_process(tsk); 2263 return; 2264 } 2265 2266 spin_lock_irqsave(&tctx->task_lock, flags); 2267 tctx->task_running = false; 2268 node = tctx->task_list.first; 2269 INIT_WQ_LIST(&tctx->task_list); 2270 spin_unlock_irqrestore(&tctx->task_lock, flags); 2271 2272 while (node) { 2273 req = container_of(node, struct io_kiocb, io_task_work.node); 2274 node = node->next; 2275 if (llist_add(&req->io_task_work.fallback_node, 2276 &req->ctx->fallback_llist)) 2277 schedule_delayed_work(&req->ctx->fallback_work, 1); 2278 } 2279} 2280 2281static void io_req_task_cancel(struct io_kiocb *req, bool *locked) 2282{ 2283 struct io_ring_ctx *ctx = req->ctx; 2284 2285 /* not needed for normal modes, but SQPOLL depends on it */ 2286 io_tw_lock(ctx, locked); 2287 io_req_complete_failed(req, req->result); 2288} 2289 2290static void io_req_task_submit(struct io_kiocb *req, bool *locked) 2291{ 2292 struct io_ring_ctx *ctx = req->ctx; 2293 2294 io_tw_lock(ctx, locked); 2295 /* req->task == current here, checking PF_EXITING is safe */ 2296 if (likely(!(req->task->flags & PF_EXITING))) 2297 __io_queue_sqe(req); 2298 else 2299 io_req_complete_failed(req, -EFAULT); 2300} 2301 2302static void io_req_task_queue_fail(struct io_kiocb *req, int ret) 2303{ 2304 req->result = ret; 2305 req->io_task_work.func = io_req_task_cancel; 2306 io_req_task_work_add(req); 2307} 2308 2309static void io_req_task_queue(struct io_kiocb *req) 2310{ 2311 req->io_task_work.func = io_req_task_submit; 2312 io_req_task_work_add(req); 2313} 2314 2315static void io_req_task_queue_reissue(struct io_kiocb *req) 2316{ 2317 req->io_task_work.func = io_queue_async_work; 2318 io_req_task_work_add(req); 2319} 2320 2321static inline void io_queue_next(struct io_kiocb *req) 2322{ 2323 struct io_kiocb *nxt = io_req_find_next(req); 2324 2325 if (nxt) 2326 io_req_task_queue(nxt); 2327} 2328 2329static void io_free_req(struct io_kiocb *req) 2330{ 2331 io_queue_next(req); 2332 __io_free_req(req); 2333} 2334 2335static void 
io_free_req_work(struct io_kiocb *req, bool *locked) 2336{ 2337 io_free_req(req); 2338} 2339 2340struct req_batch { 2341 struct task_struct *task; 2342 int task_refs; 2343 int ctx_refs; 2344}; 2345 2346static inline void io_init_req_batch(struct req_batch *rb) 2347{ 2348 rb->task_refs = 0; 2349 rb->ctx_refs = 0; 2350 rb->task = NULL; 2351} 2352 2353static void io_req_free_batch_finish(struct io_ring_ctx *ctx, 2354 struct req_batch *rb) 2355{ 2356 if (rb->ctx_refs) 2357 percpu_ref_put_many(&ctx->refs, rb->ctx_refs); 2358 if (rb->task) 2359 io_put_task(rb->task, rb->task_refs); 2360} 2361 2362static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, 2363 struct io_submit_state *state) 2364{ 2365 io_queue_next(req); 2366 io_dismantle_req(req); 2367 2368 if (req->task != rb->task) { 2369 if (rb->task) 2370 io_put_task(rb->task, rb->task_refs); 2371 rb->task = req->task; 2372 rb->task_refs = 0; 2373 } 2374 rb->task_refs++; 2375 rb->ctx_refs++; 2376 2377 if (state->free_reqs != ARRAY_SIZE(state->reqs)) 2378 state->reqs[state->free_reqs++] = req; 2379 else 2380 list_add(&req->inflight_entry, &state->free_list); 2381} 2382 2383static void io_submit_flush_completions(struct io_ring_ctx *ctx) 2384 __must_hold(&ctx->uring_lock) 2385{ 2386 struct io_submit_state *state = &ctx->submit_state; 2387 int i, nr = state->compl_nr; 2388 struct req_batch rb; 2389 2390 spin_lock(&ctx->completion_lock); 2391 for (i = 0; i < nr; i++) { 2392 struct io_kiocb *req = state->compl_reqs[i]; 2393 2394 __io_fill_cqe(ctx, req->user_data, req->result, 2395 req->compl.cflags); 2396 } 2397 io_commit_cqring(ctx); 2398 spin_unlock(&ctx->completion_lock); 2399 io_cqring_ev_posted(ctx); 2400 2401 io_init_req_batch(&rb); 2402 for (i = 0; i < nr; i++) { 2403 struct io_kiocb *req = state->compl_reqs[i]; 2404 2405 if (req_ref_put_and_test(req)) 2406 io_req_free_batch(&rb, req, &ctx->submit_state); 2407 } 2408 2409 io_req_free_batch_finish(ctx, &rb); 2410 state->compl_nr = 0; 2411} 2412 2413/* 2414 * Drop reference to request, return next in chain (if there is one) if this 2415 * was the last reference to this request. 
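 *
 * Of the helpers below, io_put_req_find_next() hands that next request back
 * to the caller, io_put_req() lets io_free_req() queue it as task_work, and
 * io_put_req_deferred() defers the whole free itself to task_work via
 * io_free_req_work().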
2416 */ 2417static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) 2418{ 2419 struct io_kiocb *nxt = NULL; 2420 2421 if (req_ref_put_and_test(req)) { 2422 nxt = io_req_find_next(req); 2423 __io_free_req(req); 2424 } 2425 return nxt; 2426} 2427 2428static inline void io_put_req(struct io_kiocb *req) 2429{ 2430 if (req_ref_put_and_test(req)) 2431 io_free_req(req); 2432} 2433 2434static inline void io_put_req_deferred(struct io_kiocb *req) 2435{ 2436 if (req_ref_put_and_test(req)) { 2437 req->io_task_work.func = io_free_req_work; 2438 io_req_task_work_add(req); 2439 } 2440} 2441 2442static unsigned io_cqring_events(struct io_ring_ctx *ctx) 2443{ 2444 /* See comment at the top of this file */ 2445 smp_rmb(); 2446 return __io_cqring_events(ctx); 2447} 2448 2449static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) 2450{ 2451 struct io_rings *rings = ctx->rings; 2452 2453 /* make sure SQ entry isn't read before tail */ 2454 return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; 2455} 2456 2457static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) 2458{ 2459 unsigned int cflags; 2460 2461 cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT; 2462 cflags |= IORING_CQE_F_BUFFER; 2463 req->flags &= ~REQ_F_BUFFER_SELECTED; 2464 kfree(kbuf); 2465 return cflags; 2466} 2467 2468static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) 2469{ 2470 struct io_buffer *kbuf; 2471 2472 if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) 2473 return 0; 2474 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 2475 return io_put_kbuf(req, kbuf); 2476} 2477 2478static inline bool io_run_task_work(void) 2479{ 2480 /* 2481 * PF_IO_WORKER never returns to userspace, so check here if we have 2482 * notify work that needs processing. 
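 *
 * Returns true if there was task_work (or a TIF_NOTIFY_SIGNAL notification)
 * to process, so callers such as io_iopoll_check() below know completions
 * may have been posted and should re-check their conditions rather than
 * keep spinning or sleeping on stale state.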
2483 */ 2484 if (current->flags & PF_IO_WORKER && 2485 test_thread_flag(TIF_NOTIFY_RESUME)) { 2486 __set_current_state(TASK_RUNNING); 2487 tracehook_notify_resume(NULL); 2488 } 2489 if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { 2490 __set_current_state(TASK_RUNNING); 2491 tracehook_notify_signal(); 2492 return true; 2493 } 2494 2495 return false; 2496} 2497 2498/* 2499 * Find and free completed poll iocbs 2500 */ 2501static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, 2502 struct list_head *done) 2503{ 2504 struct req_batch rb; 2505 struct io_kiocb *req; 2506 2507 /* order with ->result store in io_complete_rw_iopoll() */ 2508 smp_rmb(); 2509 2510 io_init_req_batch(&rb); 2511 while (!list_empty(done)) { 2512 struct io_uring_cqe *cqe; 2513 unsigned cflags; 2514 2515 req = list_first_entry(done, struct io_kiocb, inflight_entry); 2516 list_del(&req->inflight_entry); 2517 cflags = io_put_rw_kbuf(req); 2518 (*nr_events)++; 2519 2520 cqe = io_get_cqe(ctx); 2521 if (cqe) { 2522 WRITE_ONCE(cqe->user_data, req->user_data); 2523 WRITE_ONCE(cqe->res, req->result); 2524 WRITE_ONCE(cqe->flags, cflags); 2525 } else { 2526 spin_lock(&ctx->completion_lock); 2527 io_cqring_event_overflow(ctx, req->user_data, 2528 req->result, cflags); 2529 spin_unlock(&ctx->completion_lock); 2530 } 2531 2532 if (req_ref_put_and_test(req)) 2533 io_req_free_batch(&rb, req, &ctx->submit_state); 2534 } 2535 2536 if (io_commit_needs_flush(ctx)) { 2537 spin_lock(&ctx->completion_lock); 2538 __io_commit_cqring_flush(ctx); 2539 spin_unlock(&ctx->completion_lock); 2540 } 2541 __io_commit_cqring(ctx); 2542 io_cqring_ev_posted_iopoll(ctx); 2543 io_req_free_batch_finish(ctx, &rb); 2544} 2545 2546static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, 2547 long min) 2548{ 2549 struct io_kiocb *req, *tmp; 2550 LIST_HEAD(done); 2551 bool spin; 2552 2553 /* 2554 * Only spin for completions if we don't have multiple devices hanging 2555 * off our complete list, and we're under the requested amount. 2556 */ 2557 spin = !ctx->poll_multi_queue && *nr_events < min; 2558 2559 list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { 2560 struct kiocb *kiocb = &req->rw.kiocb; 2561 int ret; 2562 2563 /* 2564 * Move completed and retryable entries to our local lists. 2565 * If we find a request that requires polling, break out 2566 * and complete those lists first, if we have entries there. 2567 */ 2568 if (READ_ONCE(req->iopoll_completed)) { 2569 list_move_tail(&req->inflight_entry, &done); 2570 continue; 2571 } 2572 if (!list_empty(&done)) 2573 break; 2574 2575 ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); 2576 if (unlikely(ret < 0)) 2577 return ret; 2578 else if (ret) 2579 spin = false; 2580 2581 /* iopoll may have completed current req */ 2582 if (READ_ONCE(req->iopoll_completed)) 2583 list_move_tail(&req->inflight_entry, &done); 2584 } 2585 2586 if (!list_empty(&done)) 2587 io_iopoll_complete(ctx, nr_events, &done); 2588 2589 return 0; 2590} 2591 2592/* 2593 * We can't just wait for polled events to come to us, we have to actively 2594 * find and complete them. 
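 *
 * With IORING_SETUP_IOPOLL there is no completion interrupt to rely on:
 * CQEs only appear when ->iopoll() is driven, normally from the
 * IORING_ENTER_GETEVENTS path in io_iopoll_check() below, or from this
 * helper when the ring is going away and leftover requests still need to
 * be reaped.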
2595 */ 2596static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 2597{ 2598 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 2599 return; 2600 2601 mutex_lock(&ctx->uring_lock); 2602 while (!list_empty(&ctx->iopoll_list)) { 2603 unsigned int nr_events = 0; 2604 2605 io_do_iopoll(ctx, &nr_events, 0); 2606 2607 /* let it sleep and repeat later if can't complete a request */ 2608 if (nr_events == 0) 2609 break; 2610 /* 2611 * Ensure we allow local-to-the-cpu processing to take place, 2612 * in this case we need to ensure that we reap all events. 2613 * Also let task_work, etc. to progress by releasing the mutex 2614 */ 2615 if (need_resched()) { 2616 mutex_unlock(&ctx->uring_lock); 2617 cond_resched(); 2618 mutex_lock(&ctx->uring_lock); 2619 } 2620 } 2621 mutex_unlock(&ctx->uring_lock); 2622} 2623 2624static int io_iopoll_check(struct io_ring_ctx *ctx, long min) 2625{ 2626 unsigned int nr_events = 0; 2627 int ret = 0; 2628 2629 /* 2630 * We disallow the app entering submit/complete with polling, but we 2631 * still need to lock the ring to prevent racing with polled issue 2632 * that got punted to a workqueue. 2633 */ 2634 mutex_lock(&ctx->uring_lock); 2635 /* 2636 * Don't enter poll loop if we already have events pending. 2637 * If we do, we can potentially be spinning for commands that 2638 * already triggered a CQE (eg in error). 2639 */ 2640 if (test_bit(0, &ctx->check_cq_overflow)) 2641 __io_cqring_overflow_flush(ctx, false); 2642 if (io_cqring_events(ctx)) 2643 goto out; 2644 do { 2645 /* 2646 * If a submit got punted to a workqueue, we can have the 2647 * application entering polling for a command before it gets 2648 * issued. That app will hold the uring_lock for the duration 2649 * of the poll right here, so we need to take a breather every 2650 * now and then to ensure that the issue has a chance to add 2651 * the poll to the issued list. Otherwise we can spin here 2652 * forever, while the workqueue is stuck trying to acquire the 2653 * very same mutex. 2654 */ 2655 if (list_empty(&ctx->iopoll_list)) { 2656 u32 tail = ctx->cached_cq_tail; 2657 2658 mutex_unlock(&ctx->uring_lock); 2659 io_run_task_work(); 2660 mutex_lock(&ctx->uring_lock); 2661 2662 /* some requests don't go through iopoll_list */ 2663 if (tail != ctx->cached_cq_tail || 2664 list_empty(&ctx->iopoll_list)) 2665 break; 2666 } 2667 ret = io_do_iopoll(ctx, &nr_events, min); 2668 2669 if (task_sigpending(current)) { 2670 ret = -EINTR; 2671 goto out; 2672 } 2673 } while (!ret && nr_events < min && !need_resched()); 2674out: 2675 mutex_unlock(&ctx->uring_lock); 2676 return ret; 2677} 2678 2679static void kiocb_end_write(struct io_kiocb *req) 2680{ 2681 /* 2682 * Tell lockdep we inherited freeze protection from submission 2683 * thread. 
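 *
 * This pairs with the open-coded sb_start_write() + __sb_writers_release()
 * in io_write(): the submitting task told lockdep the freeze protection was
 * dropped, so whatever context finishes the write re-acquires it here
 * before calling sb_end_write().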
2684 */ 2685 if (req->flags & REQ_F_ISREG) { 2686 struct super_block *sb = file_inode(req->file)->i_sb; 2687 2688 __sb_writers_acquired(sb, SB_FREEZE_WRITE); 2689 sb_end_write(sb); 2690 } 2691} 2692 2693#ifdef CONFIG_BLOCK 2694static bool io_resubmit_prep(struct io_kiocb *req) 2695{ 2696 struct io_async_rw *rw = req->async_data; 2697 2698 if (!rw) 2699 return !io_req_prep_async(req); 2700 iov_iter_restore(&rw->iter, &rw->iter_state); 2701 return true; 2702} 2703 2704static bool io_rw_should_reissue(struct io_kiocb *req) 2705{ 2706 umode_t mode = file_inode(req->file)->i_mode; 2707 struct io_ring_ctx *ctx = req->ctx; 2708 2709 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2710 return false; 2711 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2712 !(ctx->flags & IORING_SETUP_IOPOLL))) 2713 return false; 2714 /* 2715 * If ref is dying, we might be running poll reap from the exit work. 2716 * Don't attempt to reissue from that path, just let it fail with 2717 * -EAGAIN. 2718 */ 2719 if (percpu_ref_is_dying(&ctx->refs)) 2720 return false; 2721 /* 2722 * Play it safe and assume not safe to re-import and reissue if we're 2723 * not in the original thread group (or in task context). 2724 */ 2725 if (!same_thread_group(req->task, current) || !in_task()) 2726 return false; 2727 return true; 2728} 2729#else 2730static bool io_resubmit_prep(struct io_kiocb *req) 2731{ 2732 return false; 2733} 2734static bool io_rw_should_reissue(struct io_kiocb *req) 2735{ 2736 return false; 2737} 2738#endif 2739 2740/* 2741 * Trigger the notifications after having done some IO, and finish the write 2742 * accounting, if any. 2743 */ 2744static void io_req_io_end(struct io_kiocb *req) 2745{ 2746 struct io_rw *rw = &req->rw; 2747 2748 if (rw->kiocb.ki_flags & IOCB_WRITE) { 2749 kiocb_end_write(req); 2750 fsnotify_modify(req->file); 2751 } else { 2752 fsnotify_access(req->file); 2753 } 2754} 2755 2756static bool __io_complete_rw_common(struct io_kiocb *req, long res) 2757{ 2758 if (res != req->result) { 2759 if ((res == -EAGAIN || res == -EOPNOTSUPP) && 2760 io_rw_should_reissue(req)) { 2761 /* 2762 * Reissue will start accounting again, finish the 2763 * current cycle. 
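 *
 * The re-issue itself happens later: kiocb_done() sees REQ_F_REISSUE,
 * restores the iterator with io_resubmit_prep() and requeues the request
 * via io_req_task_queue_reissue(), while io_read()/io_write() translate
 * the flag into their usual -EAGAIN retry/punt handling.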
2764 */ 2765 io_req_io_end(req); 2766 req->flags |= REQ_F_REISSUE; 2767 return true; 2768 } 2769 req_set_fail(req); 2770 req->result = res; 2771 } 2772 return false; 2773} 2774 2775static inline int io_fixup_rw_res(struct io_kiocb *req, long res) 2776{ 2777 struct io_async_rw *io = req->async_data; 2778 2779 /* add previously done IO, if any */ 2780 if (io && io->bytes_done > 0) { 2781 if (res < 0) 2782 res = io->bytes_done; 2783 else 2784 res += io->bytes_done; 2785 } 2786 return res; 2787} 2788 2789static void io_req_task_complete(struct io_kiocb *req, bool *locked) 2790{ 2791 unsigned int cflags = io_put_rw_kbuf(req); 2792 int res = req->result; 2793 2794 if (*locked) { 2795 struct io_ring_ctx *ctx = req->ctx; 2796 struct io_submit_state *state = &ctx->submit_state; 2797 2798 io_req_complete_state(req, res, cflags); 2799 state->compl_reqs[state->compl_nr++] = req; 2800 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) 2801 io_submit_flush_completions(ctx); 2802 } else { 2803 io_req_complete_post(req, res, cflags); 2804 } 2805} 2806 2807static void io_req_rw_complete(struct io_kiocb *req, bool *locked) 2808{ 2809 io_req_io_end(req); 2810 io_req_task_complete(req, locked); 2811} 2812 2813static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 2814{ 2815 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2816 2817 if (__io_complete_rw_common(req, res)) 2818 return; 2819 req->result = io_fixup_rw_res(req, res); 2820 req->io_task_work.func = io_req_rw_complete; 2821 io_req_task_work_add(req); 2822} 2823 2824static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) 2825{ 2826 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2827 2828 if (kiocb->ki_flags & IOCB_WRITE) 2829 kiocb_end_write(req); 2830 if (unlikely(res != req->result)) { 2831 if (res == -EAGAIN && io_rw_should_reissue(req)) { 2832 req->flags |= REQ_F_REISSUE; 2833 return; 2834 } 2835 } 2836 2837 WRITE_ONCE(req->result, res); 2838 /* order with io_iopoll_complete() checking ->result */ 2839 smp_wmb(); 2840 WRITE_ONCE(req->iopoll_completed, 1); 2841} 2842 2843/* 2844 * After the iocb has been issued, it's safe to be found on the poll list. 2845 * Adding the kiocb to the list AFTER submission ensures that we don't 2846 * find it from a io_do_iopoll() thread before the issuer is done 2847 * accessing the kiocb cookie. 2848 */ 2849static void io_iopoll_req_issued(struct io_kiocb *req) 2850{ 2851 struct io_ring_ctx *ctx = req->ctx; 2852 const bool in_async = io_wq_current_is_worker(); 2853 2854 /* workqueue context doesn't hold uring_lock, grab it now */ 2855 if (unlikely(in_async)) 2856 mutex_lock(&ctx->uring_lock); 2857 2858 /* 2859 * Track whether we have multiple files in our lists. This will impact 2860 * how we do polling eventually, not spinning if we're on potentially 2861 * different devices. 
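 *
 * Concretely, ->poll_multi_queue is set below once two entries on the list
 * target different files or different hardware queues (compared through
 * blk_qc_t_to_queue_num()), and io_do_iopoll() then stops asking the
 * driver to spin on any single completion.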
2862 */ 2863 if (list_empty(&ctx->iopoll_list)) { 2864 ctx->poll_multi_queue = false; 2865 } else if (!ctx->poll_multi_queue) { 2866 struct io_kiocb *list_req; 2867 unsigned int queue_num0, queue_num1; 2868 2869 list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, 2870 inflight_entry); 2871 2872 if (list_req->file != req->file) { 2873 ctx->poll_multi_queue = true; 2874 } else { 2875 queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); 2876 queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); 2877 if (queue_num0 != queue_num1) 2878 ctx->poll_multi_queue = true; 2879 } 2880 } 2881 2882 /* 2883 * For fast devices, IO may have already completed. If it has, add 2884 * it to the front so we find it first. 2885 */ 2886 if (READ_ONCE(req->iopoll_completed)) 2887 list_add(&req->inflight_entry, &ctx->iopoll_list); 2888 else 2889 list_add_tail(&req->inflight_entry, &ctx->iopoll_list); 2890 2891 if (unlikely(in_async)) { 2892 /* 2893 * If IORING_SETUP_SQPOLL is enabled, sqes are either handle 2894 * in sq thread task context or in io worker task context. If 2895 * current task context is sq thread, we don't need to check 2896 * whether should wake up sq thread. 2897 */ 2898 if ((ctx->flags & IORING_SETUP_SQPOLL) && 2899 wq_has_sleeper(&ctx->sq_data->wait)) 2900 wake_up(&ctx->sq_data->wait); 2901 2902 mutex_unlock(&ctx->uring_lock); 2903 } 2904} 2905 2906static bool io_bdev_nowait(struct block_device *bdev) 2907{ 2908 return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); 2909} 2910 2911/* 2912 * If we tracked the file through the SCM inflight mechanism, we could support 2913 * any file. For now, just ensure that anything potentially problematic is done 2914 * inline. 2915 */ 2916static bool __io_file_supports_nowait(struct file *file, int rw) 2917{ 2918 umode_t mode = file_inode(file)->i_mode; 2919 2920 if (S_ISBLK(mode)) { 2921 if (IS_ENABLED(CONFIG_BLOCK) && 2922 io_bdev_nowait(I_BDEV(file->f_mapping->host))) 2923 return true; 2924 return false; 2925 } 2926 if (S_ISSOCK(mode)) 2927 return true; 2928 if (S_ISREG(mode)) { 2929 if (IS_ENABLED(CONFIG_BLOCK) && 2930 io_bdev_nowait(file->f_inode->i_sb->s_bdev) && 2931 file->f_op != &io_uring_fops) 2932 return true; 2933 return false; 2934 } 2935 2936 /* any ->read/write should understand O_NONBLOCK */ 2937 if (file->f_flags & O_NONBLOCK) 2938 return true; 2939 2940 if (!(file->f_mode & FMODE_NOWAIT)) 2941 return false; 2942 2943 if (rw == READ) 2944 return file->f_op->read_iter != NULL; 2945 2946 return file->f_op->write_iter != NULL; 2947} 2948 2949static bool io_file_supports_nowait(struct io_kiocb *req, int rw) 2950{ 2951 if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) 2952 return true; 2953 else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) 2954 return true; 2955 2956 return __io_file_supports_nowait(req->file, rw); 2957} 2958 2959static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, 2960 int rw) 2961{ 2962 struct io_ring_ctx *ctx = req->ctx; 2963 struct kiocb *kiocb = &req->rw.kiocb; 2964 struct file *file = req->file; 2965 unsigned ioprio; 2966 int ret; 2967 2968 if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) 2969 req->flags |= REQ_F_ISREG; 2970 2971 kiocb->ki_pos = READ_ONCE(sqe->off); 2972 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); 2973 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 2974 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 2975 if (unlikely(ret)) 2976 return ret; 2977 2978 /* 2979 * If the file is marked O_NONBLOCK, 
still allow retry for it if it 2980 * supports async. Otherwise it's impossible to use O_NONBLOCK files 2981 * reliably. If not, or it IOCB_NOWAIT is set, don't retry. 2982 */ 2983 if ((kiocb->ki_flags & IOCB_NOWAIT) || 2984 ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req, rw))) 2985 req->flags |= REQ_F_NOWAIT; 2986 2987 ioprio = READ_ONCE(sqe->ioprio); 2988 if (ioprio) { 2989 ret = ioprio_check_cap(ioprio); 2990 if (ret) 2991 return ret; 2992 2993 kiocb->ki_ioprio = ioprio; 2994 } else 2995 kiocb->ki_ioprio = get_current_ioprio(); 2996 2997 if (ctx->flags & IORING_SETUP_IOPOLL) { 2998 if (!(kiocb->ki_flags & IOCB_DIRECT) || 2999 !kiocb->ki_filp->f_op->iopoll) 3000 return -EOPNOTSUPP; 3001 3002 kiocb->ki_flags |= IOCB_HIPRI; 3003 kiocb->ki_complete = io_complete_rw_iopoll; 3004 req->iopoll_completed = 0; 3005 } else { 3006 if (kiocb->ki_flags & IOCB_HIPRI) 3007 return -EINVAL; 3008 kiocb->ki_complete = io_complete_rw; 3009 } 3010 3011 /* used for fixed read/write too - just read unconditionally */ 3012 req->buf_index = READ_ONCE(sqe->buf_index); 3013 req->imu = NULL; 3014 3015 if (req->opcode == IORING_OP_READ_FIXED || 3016 req->opcode == IORING_OP_WRITE_FIXED) { 3017 struct io_ring_ctx *ctx = req->ctx; 3018 u16 index; 3019 3020 if (unlikely(req->buf_index >= ctx->nr_user_bufs)) 3021 return -EFAULT; 3022 index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); 3023 req->imu = ctx->user_bufs[index]; 3024 io_req_set_rsrc_node(req); 3025 } 3026 3027 req->rw.addr = READ_ONCE(sqe->addr); 3028 req->rw.len = READ_ONCE(sqe->len); 3029 return 0; 3030} 3031 3032static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) 3033{ 3034 switch (ret) { 3035 case -EIOCBQUEUED: 3036 break; 3037 case -ERESTARTSYS: 3038 case -ERESTARTNOINTR: 3039 case -ERESTARTNOHAND: 3040 case -ERESTART_RESTARTBLOCK: 3041 /* 3042 * We can't just restart the syscall, since previously 3043 * submitted sqes may already be in progress. Just fail this 3044 * IO with EINTR. 3045 */ 3046 ret = -EINTR; 3047 fallthrough; 3048 default: 3049 kiocb->ki_complete(kiocb, ret, 0); 3050 } 3051} 3052 3053static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) 3054{ 3055 struct kiocb *kiocb = &req->rw.kiocb; 3056 3057 if (kiocb->ki_pos != -1) 3058 return &kiocb->ki_pos; 3059 3060 if (!(req->file->f_mode & FMODE_STREAM)) { 3061 req->flags |= REQ_F_CUR_POS; 3062 kiocb->ki_pos = req->file->f_pos; 3063 return &kiocb->ki_pos; 3064 } 3065 3066 kiocb->ki_pos = 0; 3067 return NULL; 3068} 3069 3070static void kiocb_done(struct kiocb *kiocb, ssize_t ret, 3071 unsigned int issue_flags) 3072{ 3073 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 3074 3075 if (req->flags & REQ_F_CUR_POS) 3076 req->file->f_pos = kiocb->ki_pos; 3077 if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) { 3078 if (!__io_complete_rw_common(req, ret)) { 3079 /* 3080 * Safe to call io_end from here as we're inline 3081 * from the submission path. 
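 *
 * (When the completion arrives through io_complete_rw() instead, the same
 * io_req_io_end() work is deferred to task_work via io_req_rw_complete(),
 * since that callback can run from interrupt context rather than from the
 * submitting task.)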
3082 */ 3083 io_req_io_end(req); 3084 __io_req_complete(req, issue_flags, 3085 io_fixup_rw_res(req, ret), 3086 io_put_rw_kbuf(req)); 3087 } 3088 } else { 3089 io_rw_done(kiocb, ret); 3090 } 3091 3092 if (req->flags & REQ_F_REISSUE) { 3093 req->flags &= ~REQ_F_REISSUE; 3094 if (io_resubmit_prep(req)) { 3095 io_req_task_queue_reissue(req); 3096 } else { 3097 unsigned int cflags = io_put_rw_kbuf(req); 3098 struct io_ring_ctx *ctx = req->ctx; 3099 3100 ret = io_fixup_rw_res(req, ret); 3101 req_set_fail(req); 3102 if (!(issue_flags & IO_URING_F_NONBLOCK)) { 3103 mutex_lock(&ctx->uring_lock); 3104 __io_req_complete(req, issue_flags, ret, cflags); 3105 mutex_unlock(&ctx->uring_lock); 3106 } else { 3107 __io_req_complete(req, issue_flags, ret, cflags); 3108 } 3109 } 3110 } 3111} 3112 3113static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, 3114 struct io_mapped_ubuf *imu) 3115{ 3116 size_t len = req->rw.len; 3117 u64 buf_end, buf_addr = req->rw.addr; 3118 size_t offset; 3119 3120 if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) 3121 return -EFAULT; 3122 /* not inside the mapped region */ 3123 if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) 3124 return -EFAULT; 3125 3126 /* 3127 * May not be a start of buffer, set size appropriately 3128 * and advance us to the beginning. 3129 */ 3130 offset = buf_addr - imu->ubuf; 3131 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); 3132 3133 if (offset) { 3134 /* 3135 * Don't use iov_iter_advance() here, as it's really slow for 3136 * using the latter parts of a big fixed buffer - it iterates 3137 * over each segment manually. We can cheat a bit here, because 3138 * we know that: 3139 * 3140 * 1) it's a BVEC iter, we set it up 3141 * 2) all bvecs are PAGE_SIZE in size, except potentially the 3142 * first and last bvec 3143 * 3144 * So just find our index, and adjust the iterator afterwards. 3145 * If the offset is within the first bvec (or the whole first 3146 * bvec, just use iov_iter_advance(). This makes it easier 3147 * since we can just skip the first segment, which may not 3148 * be PAGE_SIZE aligned. 3149 */ 3150 const struct bio_vec *bvec = imu->bvec; 3151 3152 if (offset < bvec->bv_len) { 3153 iov_iter_advance(iter, offset); 3154 } else { 3155 unsigned long seg_skip; 3156 3157 /* skip first vec */ 3158 offset -= bvec->bv_len; 3159 seg_skip = 1 + (offset >> PAGE_SHIFT); 3160 3161 iter->bvec = bvec + seg_skip; 3162 iter->nr_segs -= seg_skip; 3163 iter->count -= bvec->bv_len + offset; 3164 iter->iov_offset = offset & ~PAGE_MASK; 3165 } 3166 } 3167 3168 return 0; 3169} 3170 3171static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) 3172{ 3173 if (WARN_ON_ONCE(!req->imu)) 3174 return -EFAULT; 3175 return __io_import_fixed(req, rw, iter, req->imu); 3176} 3177 3178static void io_ring_submit_unlock(struct io_ring_ctx *ctx, bool needs_lock) 3179{ 3180 if (needs_lock) 3181 mutex_unlock(&ctx->uring_lock); 3182} 3183 3184static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) 3185{ 3186 /* 3187 * "Normal" inline submissions always hold the uring_lock, since we 3188 * grab it from the system call. Same is true for the SQPOLL offload. 3189 * The only exception is when we've detached the request and issue it 3190 * from an async worker thread, grab the lock for that case. 
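 *
 * io_buffer_select() below depends on this: the provided-buffer xarray
 * (ctx->io_buffers) is only walked under ->uring_lock, hence the
 * lockdep_assert_held() there and the conditional lock/unlock pair around
 * the lookup.
 *
 * For reference, a minimal liburing-style userspace sequence that ends up
 * in that path might look like the sketch below (illustrative only;
 * BUF_LEN, NR_BUFS and BGID are placeholder names):
 *
 *	io_uring_prep_provide_buffers(sqe, bufs, BUF_LEN, NR_BUFS, BGID, 0);
 *	...
 *	io_uring_prep_read(sqe, fd, NULL, BUF_LEN, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = BGID;
 *	...
 *	if (cqe->flags & IORING_CQE_F_BUFFER)
 *		bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;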
3191 */ 3192 if (needs_lock) 3193 mutex_lock(&ctx->uring_lock); 3194} 3195 3196static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, 3197 int bgid, struct io_buffer *kbuf, 3198 bool needs_lock) 3199{ 3200 struct io_buffer *head; 3201 3202 if (req->flags & REQ_F_BUFFER_SELECTED) 3203 return kbuf; 3204 3205 io_ring_submit_lock(req->ctx, needs_lock); 3206 3207 lockdep_assert_held(&req->ctx->uring_lock); 3208 3209 head = xa_load(&req->ctx->io_buffers, bgid); 3210 if (head) { 3211 if (!list_empty(&head->list)) { 3212 kbuf = list_last_entry(&head->list, struct io_buffer, 3213 list); 3214 list_del(&kbuf->list); 3215 } else { 3216 kbuf = head; 3217 xa_erase(&req->ctx->io_buffers, bgid); 3218 } 3219 if (*len > kbuf->len) 3220 *len = kbuf->len; 3221 } else { 3222 kbuf = ERR_PTR(-ENOBUFS); 3223 } 3224 3225 io_ring_submit_unlock(req->ctx, needs_lock); 3226 3227 return kbuf; 3228} 3229 3230static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, 3231 bool needs_lock) 3232{ 3233 struct io_buffer *kbuf; 3234 u16 bgid; 3235 3236 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 3237 bgid = req->buf_index; 3238 kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); 3239 if (IS_ERR(kbuf)) 3240 return kbuf; 3241 req->rw.addr = (u64) (unsigned long) kbuf; 3242 req->flags |= REQ_F_BUFFER_SELECTED; 3243 return u64_to_user_ptr(kbuf->addr); 3244} 3245 3246#ifdef CONFIG_COMPAT 3247static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, 3248 bool needs_lock) 3249{ 3250 struct compat_iovec __user *uiov; 3251 compat_ssize_t clen; 3252 void __user *buf; 3253 ssize_t len; 3254 3255 uiov = u64_to_user_ptr(req->rw.addr); 3256 if (!access_ok(uiov, sizeof(*uiov))) 3257 return -EFAULT; 3258 if (__get_user(clen, &uiov->iov_len)) 3259 return -EFAULT; 3260 if (clen < 0) 3261 return -EINVAL; 3262 3263 len = clen; 3264 buf = io_rw_buffer_select(req, &len, needs_lock); 3265 if (IS_ERR(buf)) 3266 return PTR_ERR(buf); 3267 iov[0].iov_base = buf; 3268 iov[0].iov_len = (compat_size_t) len; 3269 return 0; 3270} 3271#endif 3272 3273static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3274 bool needs_lock) 3275{ 3276 struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); 3277 void __user *buf; 3278 ssize_t len; 3279 3280 if (copy_from_user(iov, uiov, sizeof(*uiov))) 3281 return -EFAULT; 3282 3283 len = iov[0].iov_len; 3284 if (len < 0) 3285 return -EINVAL; 3286 buf = io_rw_buffer_select(req, &len, needs_lock); 3287 if (IS_ERR(buf)) 3288 return PTR_ERR(buf); 3289 iov[0].iov_base = buf; 3290 iov[0].iov_len = len; 3291 return 0; 3292} 3293 3294static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, 3295 bool needs_lock) 3296{ 3297 if (req->flags & REQ_F_BUFFER_SELECTED) { 3298 struct io_buffer *kbuf; 3299 3300 kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; 3301 iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 3302 iov[0].iov_len = kbuf->len; 3303 return 0; 3304 } 3305 if (req->rw.len != 1) 3306 return -EINVAL; 3307 3308#ifdef CONFIG_COMPAT 3309 if (req->ctx->compat) 3310 return io_compat_import(req, iov, needs_lock); 3311#endif 3312 3313 return __io_iov_buffer_select(req, iov, needs_lock); 3314} 3315 3316static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, 3317 struct iov_iter *iter, bool needs_lock) 3318{ 3319 void __user *buf = u64_to_user_ptr(req->rw.addr); 3320 size_t sqe_len = req->rw.len; 3321 u8 opcode = req->opcode; 3322 ssize_t ret; 3323 3324 if (opcode == IORING_OP_READ_FIXED 
|| opcode == IORING_OP_WRITE_FIXED) { 3325 *iovec = NULL; 3326 return io_import_fixed(req, rw, iter); 3327 } 3328 3329 /* buffer index only valid with fixed read/write, or buffer select */ 3330 if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) 3331 return -EINVAL; 3332 3333 if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { 3334 if (req->flags & REQ_F_BUFFER_SELECT) { 3335 buf = io_rw_buffer_select(req, &sqe_len, needs_lock); 3336 if (IS_ERR(buf)) 3337 return PTR_ERR(buf); 3338 req->rw.len = sqe_len; 3339 } 3340 3341 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 3342 *iovec = NULL; 3343 return ret; 3344 } 3345 3346 if (req->flags & REQ_F_BUFFER_SELECT) { 3347 ret = io_iov_buffer_select(req, *iovec, needs_lock); 3348 if (!ret) 3349 iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); 3350 *iovec = NULL; 3351 return ret; 3352 } 3353 3354 return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, 3355 req->ctx->compat); 3356} 3357 3358static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) 3359{ 3360 return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; 3361} 3362 3363/* 3364 * For files that don't have ->read_iter() and ->write_iter(), handle them 3365 * by looping over ->read() or ->write() manually. 3366 */ 3367static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) 3368{ 3369 struct kiocb *kiocb = &req->rw.kiocb; 3370 struct file *file = req->file; 3371 ssize_t ret = 0; 3372 loff_t *ppos; 3373 3374 /* 3375 * Don't support polled IO through this interface, and we can't 3376 * support non-blocking either. For the latter, this just causes 3377 * the kiocb to be handled from an async context. 3378 */ 3379 if (kiocb->ki_flags & IOCB_HIPRI) 3380 return -EOPNOTSUPP; 3381 if (kiocb->ki_flags & IOCB_NOWAIT) 3382 return -EAGAIN; 3383 3384 ppos = io_kiocb_ppos(kiocb); 3385 3386 while (iov_iter_count(iter)) { 3387 struct iovec iovec; 3388 ssize_t nr; 3389 3390 if (!iov_iter_is_bvec(iter)) { 3391 iovec = iov_iter_iovec(iter); 3392 } else { 3393 iovec.iov_base = u64_to_user_ptr(req->rw.addr); 3394 iovec.iov_len = req->rw.len; 3395 } 3396 3397 if (rw == READ) { 3398 nr = file->f_op->read(file, iovec.iov_base, 3399 iovec.iov_len, ppos); 3400 } else { 3401 nr = file->f_op->write(file, iovec.iov_base, 3402 iovec.iov_len, ppos); 3403 } 3404 3405 if (nr < 0) { 3406 if (!ret) 3407 ret = nr; 3408 break; 3409 } 3410 ret += nr; 3411 if (!iov_iter_is_bvec(iter)) { 3412 iov_iter_advance(iter, nr); 3413 } else { 3414 req->rw.addr += nr; 3415 req->rw.len -= nr; 3416 if (!req->rw.len) 3417 break; 3418 } 3419 if (nr != iovec.iov_len) 3420 break; 3421 } 3422 3423 return ret; 3424} 3425 3426static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, 3427 const struct iovec *fast_iov, struct iov_iter *iter) 3428{ 3429 struct io_async_rw *rw = req->async_data; 3430 3431 memcpy(&rw->iter, iter, sizeof(*iter)); 3432 rw->free_iovec = iovec; 3433 rw->bytes_done = 0; 3434 /* can only be fixed buffers, no need to do anything */ 3435 if (iov_iter_is_bvec(iter)) 3436 return; 3437 if (!iovec) { 3438 unsigned iov_off = 0; 3439 3440 rw->iter.iov = rw->fast_iov; 3441 if (iter->iov != fast_iov) { 3442 iov_off = iter->iov - fast_iov; 3443 rw->iter.iov += iov_off; 3444 } 3445 if (rw->fast_iov != fast_iov) 3446 memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, 3447 sizeof(struct iovec) * iter->nr_segs); 3448 } else { 3449 req->flags |= REQ_F_NEED_CLEANUP; 3450 } 3451} 3452 3453static inline int io_alloc_async_data(struct io_kiocb *req) 
3454{ 3455 WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); 3456 req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); 3457 return req->async_data == NULL; 3458} 3459 3460static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, 3461 const struct iovec *fast_iov, 3462 struct iov_iter *iter, bool force) 3463{ 3464 if (!force && !io_op_defs[req->opcode].needs_async_setup) 3465 return 0; 3466 if (!req->async_data) { 3467 struct io_async_rw *iorw; 3468 3469 if (io_alloc_async_data(req)) { 3470 kfree(iovec); 3471 return -ENOMEM; 3472 } 3473 3474 io_req_map_rw(req, iovec, fast_iov, iter); 3475 iorw = req->async_data; 3476 /* we've copied and mapped the iter, ensure state is saved */ 3477 iov_iter_save_state(&iorw->iter, &iorw->iter_state); 3478 } 3479 return 0; 3480} 3481 3482static inline int io_rw_prep_async(struct io_kiocb *req, int rw) 3483{ 3484 struct io_async_rw *iorw = req->async_data; 3485 struct iovec *iov = iorw->fast_iov; 3486 int ret; 3487 3488 iorw->bytes_done = 0; 3489 iorw->free_iovec = NULL; 3490 3491 ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); 3492 if (unlikely(ret < 0)) 3493 return ret; 3494 3495 if (iov) { 3496 iorw->free_iovec = iov; 3497 req->flags |= REQ_F_NEED_CLEANUP; 3498 } 3499 iov_iter_save_state(&iorw->iter, &iorw->iter_state); 3500 return 0; 3501} 3502 3503static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3504{ 3505 if (unlikely(!(req->file->f_mode & FMODE_READ))) 3506 return -EBADF; 3507 return io_prep_rw(req, sqe, READ); 3508} 3509 3510/* 3511 * This is our waitqueue callback handler, registered through lock_page_async() 3512 * when we initially tried to do the IO with the iocb armed our waitqueue. 3513 * This gets called when the page is unlocked, and we generally expect that to 3514 * happen when the page IO is completed and the page is now uptodate. This will 3515 * queue a task_work based retry of the operation, attempting to copy the data 3516 * again. If the latter fails because the page was NOT uptodate, then we will 3517 * do a thread based blocking retry of the operation. That's the unexpected 3518 * slow path. 3519 */ 3520static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, 3521 int sync, void *arg) 3522{ 3523 struct wait_page_queue *wpq; 3524 struct io_kiocb *req = wait->private; 3525 struct wait_page_key *key = arg; 3526 3527 wpq = container_of(wait, struct wait_page_queue, wait); 3528 3529 if (!wake_page_match(wpq, key)) 3530 return 0; 3531 3532 req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; 3533 list_del_init(&wait->entry); 3534 io_req_task_queue(req); 3535 return 1; 3536} 3537 3538/* 3539 * This controls whether a given IO request should be armed for async page 3540 * based retry. If we return false here, the request is handed to the async 3541 * worker threads for retry. If we're doing buffered reads on a regular file, 3542 * we prepare a private wait_page_queue entry and retry the operation. This 3543 * will either succeed because the page is now uptodate and unlocked, or it 3544 * will register a callback when the page is unlocked at IO completion. Through 3545 * that callback, io_uring uses task_work to setup a retry of the operation. 3546 * That retry will attempt the buffered read again. The retry will generally 3547 * succeed, or in rare cases where it fails, we then fall back to using the 3548 * async worker threads for a blocking retry. 
3549 */ 3550static bool io_rw_should_retry(struct io_kiocb *req) 3551{ 3552 struct io_async_rw *rw = req->async_data; 3553 struct wait_page_queue *wait = &rw->wpq; 3554 struct kiocb *kiocb = &req->rw.kiocb; 3555 3556 /* never retry for NOWAIT, we just complete with -EAGAIN */ 3557 if (req->flags & REQ_F_NOWAIT) 3558 return false; 3559 3560 /* Only for buffered IO */ 3561 if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) 3562 return false; 3563 3564 /* 3565 * just use poll if we can, and don't attempt if the fs doesn't 3566 * support callback based unlocks 3567 */ 3568 if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) 3569 return false; 3570 3571 wait->wait.func = io_async_buf_func; 3572 wait->wait.private = req; 3573 wait->wait.flags = 0; 3574 INIT_LIST_HEAD(&wait->wait.entry); 3575 kiocb->ki_flags |= IOCB_WAITQ; 3576 kiocb->ki_flags &= ~IOCB_NOWAIT; 3577 kiocb->ki_waitq = wait; 3578 return true; 3579} 3580 3581static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) 3582{ 3583 if (req->file->f_op->read_iter) 3584 return call_read_iter(req->file, &req->rw.kiocb, iter); 3585 else if (req->file->f_op->read) 3586 return loop_rw_iter(READ, req, iter); 3587 else 3588 return -EINVAL; 3589} 3590 3591static bool need_read_all(struct io_kiocb *req) 3592{ 3593 return req->flags & REQ_F_ISREG || 3594 S_ISBLK(file_inode(req->file)->i_mode); 3595} 3596 3597static int io_read(struct io_kiocb *req, unsigned int issue_flags) 3598{ 3599 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3600 struct kiocb *kiocb = &req->rw.kiocb; 3601 struct iov_iter __iter, *iter = &__iter; 3602 struct io_async_rw *rw = req->async_data; 3603 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3604 struct iov_iter_state __state, *state; 3605 ssize_t ret, ret2; 3606 loff_t *ppos; 3607 3608 if (rw) { 3609 iter = &rw->iter; 3610 state = &rw->iter_state; 3611 /* 3612 * We come here from an earlier attempt, restore our state to 3613 * match in case it doesn't. It's cheap enough that we don't 3614 * need to make this conditional. 
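 *
 * (req->async_data is only set once the request has been prepared for an
 * async retry, via io_setup_async_rw() or io_rw_prep_async(), so this
 * branch is the re-issue path; the else branch below is the first attempt
 * and imports the iovec straight from the SQE.)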
3615 */ 3616 iov_iter_restore(iter, state); 3617 iovec = NULL; 3618 } else { 3619 ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); 3620 if (ret < 0) 3621 return ret; 3622 state = &__state; 3623 iov_iter_save_state(iter, state); 3624 } 3625 req->result = iov_iter_count(iter); 3626 3627 /* Ensure we clear previously set non-block flag */ 3628 if (!force_nonblock) 3629 kiocb->ki_flags &= ~IOCB_NOWAIT; 3630 else 3631 kiocb->ki_flags |= IOCB_NOWAIT; 3632 3633 /* If the file doesn't support async, just async punt */ 3634 if (force_nonblock && !io_file_supports_nowait(req, READ)) { 3635 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3636 return ret ?: -EAGAIN; 3637 } 3638 3639 ppos = io_kiocb_update_pos(req); 3640 3641 ret = rw_verify_area(READ, req->file, ppos, req->result); 3642 if (unlikely(ret)) { 3643 kfree(iovec); 3644 return ret; 3645 } 3646 3647 ret = io_iter_do_read(req, iter); 3648 3649 if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { 3650 req->flags &= ~REQ_F_REISSUE; 3651 /* IOPOLL retry should happen for io-wq threads */ 3652 if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 3653 goto done; 3654 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3655 if (req->flags & REQ_F_NOWAIT) 3656 goto done; 3657 ret = 0; 3658 } else if (ret == -EIOCBQUEUED) { 3659 goto out_free; 3660 } else if (ret <= 0 || ret == req->result || !force_nonblock || 3661 (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { 3662 /* read all, failed, already did sync or don't want to retry */ 3663 goto done; 3664 } 3665 3666 /* 3667 * Don't depend on the iter state matching what was consumed, or being 3668 * untouched in case of error. Restore it and we'll advance it 3669 * manually if we need to. 3670 */ 3671 iov_iter_restore(iter, state); 3672 3673 ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); 3674 if (ret2) 3675 return ret2; 3676 3677 iovec = NULL; 3678 rw = req->async_data; 3679 /* 3680 * Now use our persistent iterator and state, if we aren't already. 3681 * We've restored and mapped the iter to match. 3682 */ 3683 if (iter != &rw->iter) { 3684 iter = &rw->iter; 3685 state = &rw->iter_state; 3686 } 3687 3688 do { 3689 /* 3690 * We end up here because of a partial read, either from 3691 * above or inside this loop. Advance the iter by the bytes 3692 * that were consumed. 3693 */ 3694 iov_iter_advance(iter, ret); 3695 if (!iov_iter_count(iter)) 3696 break; 3697 rw->bytes_done += ret; 3698 iov_iter_save_state(iter, state); 3699 3700 /* if we can retry, do so with the callbacks armed */ 3701 if (!io_rw_should_retry(req)) { 3702 kiocb->ki_flags &= ~IOCB_WAITQ; 3703 return -EAGAIN; 3704 } 3705 3706 req->result = iov_iter_count(iter); 3707 /* 3708 * Now retry read with the IOCB_WAITQ parts set in the iocb. If 3709 * we get -EIOCBQUEUED, then we'll get a notification when the 3710 * desired page gets unlocked. We can also get a partial read 3711 * here, and if we do, then just retry at the new offset. 3712 */ 3713 ret = io_iter_do_read(req, iter); 3714 if (ret == -EIOCBQUEUED) 3715 return 0; 3716 /* we got some bytes, but not all. retry. 
*/ 3717 kiocb->ki_flags &= ~IOCB_WAITQ; 3718 iov_iter_restore(iter, state); 3719 } while (ret > 0); 3720done: 3721 kiocb_done(kiocb, ret, issue_flags); 3722out_free: 3723 /* it's faster to check here then delegate to kfree */ 3724 if (iovec) 3725 kfree(iovec); 3726 return 0; 3727} 3728 3729static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 3730{ 3731 if (unlikely(!(req->file->f_mode & FMODE_WRITE))) 3732 return -EBADF; 3733 return io_prep_rw(req, sqe, WRITE); 3734} 3735 3736static int io_write(struct io_kiocb *req, unsigned int issue_flags) 3737{ 3738 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 3739 struct kiocb *kiocb = &req->rw.kiocb; 3740 struct iov_iter __iter, *iter = &__iter; 3741 struct io_async_rw *rw = req->async_data; 3742 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 3743 struct iov_iter_state __state, *state; 3744 ssize_t ret, ret2; 3745 loff_t *ppos; 3746 3747 if (rw) { 3748 iter = &rw->iter; 3749 state = &rw->iter_state; 3750 iov_iter_restore(iter, state); 3751 iovec = NULL; 3752 } else { 3753 ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); 3754 if (ret < 0) 3755 return ret; 3756 state = &__state; 3757 iov_iter_save_state(iter, state); 3758 } 3759 req->result = iov_iter_count(iter); 3760 3761 /* Ensure we clear previously set non-block flag */ 3762 if (!force_nonblock) 3763 kiocb->ki_flags &= ~IOCB_NOWAIT; 3764 else 3765 kiocb->ki_flags |= IOCB_NOWAIT; 3766 3767 /* If the file doesn't support async, just async punt */ 3768 if (force_nonblock && !io_file_supports_nowait(req, WRITE)) 3769 goto copy_iov; 3770 3771 /* file path doesn't support NOWAIT for non-direct_IO */ 3772 if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && 3773 (req->flags & REQ_F_ISREG)) 3774 goto copy_iov; 3775 3776 ppos = io_kiocb_update_pos(req); 3777 3778 ret = rw_verify_area(WRITE, req->file, ppos, req->result); 3779 if (unlikely(ret)) 3780 goto out_free; 3781 3782 /* 3783 * Open-code file_start_write here to grab freeze protection, 3784 * which will be released by another thread in 3785 * io_complete_rw(). Fool lockdep by telling it the lock got 3786 * released so that it doesn't complain about the held lock when 3787 * we return to userspace. 3788 */ 3789 if (req->flags & REQ_F_ISREG) { 3790 sb_start_write(file_inode(req->file)->i_sb); 3791 __sb_writers_release(file_inode(req->file)->i_sb, 3792 SB_FREEZE_WRITE); 3793 } 3794 kiocb->ki_flags |= IOCB_WRITE; 3795 3796 if (req->file->f_op->write_iter) 3797 ret2 = call_write_iter(req->file, kiocb, iter); 3798 else if (req->file->f_op->write) 3799 ret2 = loop_rw_iter(WRITE, req, iter); 3800 else 3801 ret2 = -EINVAL; 3802 3803 if (req->flags & REQ_F_REISSUE) { 3804 req->flags &= ~REQ_F_REISSUE; 3805 ret2 = -EAGAIN; 3806 } 3807 3808 /* 3809 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just 3810 * retry them without IOCB_NOWAIT. 
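 *
 * "Retry" here means turning the error into -EAGAIN so we fall into the
 * copy_iov path below and the write is re-issued from a context where
 * force_nonblock is false, i.e. without IOCB_NOWAIT set.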
3811 */ 3812 if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) 3813 ret2 = -EAGAIN; 3814 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3815 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3816 goto done; 3817 if (!force_nonblock || ret2 != -EAGAIN) { 3818 /* IOPOLL retry should happen for io-wq threads */ 3819 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) 3820 goto copy_iov; 3821done: 3822 kiocb_done(kiocb, ret2, issue_flags); 3823 } else { 3824copy_iov: 3825 iov_iter_restore(iter, state); 3826 ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); 3827 if (!ret) { 3828 if (kiocb->ki_flags & IOCB_WRITE) 3829 kiocb_end_write(req); 3830 return -EAGAIN; 3831 } 3832 return ret; 3833 } 3834out_free: 3835 /* it's reportedly faster than delegating the null check to kfree() */ 3836 if (iovec) 3837 kfree(iovec); 3838 return ret; 3839} 3840 3841static int io_renameat_prep(struct io_kiocb *req, 3842 const struct io_uring_sqe *sqe) 3843{ 3844 struct io_rename *ren = &req->rename; 3845 const char __user *oldf, *newf; 3846 3847 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3848 return -EINVAL; 3849 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 3850 return -EINVAL; 3851 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3852 return -EBADF; 3853 3854 ren->old_dfd = READ_ONCE(sqe->fd); 3855 oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3856 newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 3857 ren->new_dfd = READ_ONCE(sqe->len); 3858 ren->flags = READ_ONCE(sqe->rename_flags); 3859 3860 ren->oldpath = getname(oldf); 3861 if (IS_ERR(ren->oldpath)) 3862 return PTR_ERR(ren->oldpath); 3863 3864 ren->newpath = getname(newf); 3865 if (IS_ERR(ren->newpath)) { 3866 putname(ren->oldpath); 3867 return PTR_ERR(ren->newpath); 3868 } 3869 3870 req->flags |= REQ_F_NEED_CLEANUP; 3871 return 0; 3872} 3873 3874static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) 3875{ 3876 struct io_rename *ren = &req->rename; 3877 int ret; 3878 3879 if (issue_flags & IO_URING_F_NONBLOCK) 3880 return -EAGAIN; 3881 3882 ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, 3883 ren->newpath, ren->flags); 3884 3885 req->flags &= ~REQ_F_NEED_CLEANUP; 3886 if (ret < 0) 3887 req_set_fail(req); 3888 io_req_complete(req, ret); 3889 return 0; 3890} 3891 3892static int io_unlinkat_prep(struct io_kiocb *req, 3893 const struct io_uring_sqe *sqe) 3894{ 3895 struct io_unlink *un = &req->unlink; 3896 const char __user *fname; 3897 3898 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3899 return -EINVAL; 3900 if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || 3901 sqe->splice_fd_in) 3902 return -EINVAL; 3903 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 3904 return -EBADF; 3905 3906 un->dfd = READ_ONCE(sqe->fd); 3907 3908 un->flags = READ_ONCE(sqe->unlink_flags); 3909 if (un->flags & ~AT_REMOVEDIR) 3910 return -EINVAL; 3911 3912 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3913 un->filename = getname(fname); 3914 if (IS_ERR(un->filename)) 3915 return PTR_ERR(un->filename); 3916 3917 req->flags |= REQ_F_NEED_CLEANUP; 3918 return 0; 3919} 3920 3921static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 3922{ 3923 struct io_unlink *un = &req->unlink; 3924 int ret; 3925 3926 if (issue_flags & IO_URING_F_NONBLOCK) 3927 return -EAGAIN; 3928 3929 if (un->flags & AT_REMOVEDIR) 3930 ret = do_rmdir(un->dfd, un->filename); 3931 else 3932 ret = do_unlinkat(un->dfd, un->filename); 3933 3934 req->flags &= ~REQ_F_NEED_CLEANUP; 3935 if (ret < 0) 3936 req_set_fail(req); 
3937 io_req_complete(req, ret); 3938 return 0; 3939} 3940 3941static int io_shutdown_prep(struct io_kiocb *req, 3942 const struct io_uring_sqe *sqe) 3943{ 3944#if defined(CONFIG_NET) 3945 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3946 return -EINVAL; 3947 if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || 3948 sqe->buf_index || sqe->splice_fd_in)) 3949 return -EINVAL; 3950 3951 req->shutdown.how = READ_ONCE(sqe->len); 3952 return 0; 3953#else 3954 return -EOPNOTSUPP; 3955#endif 3956} 3957 3958static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) 3959{ 3960#if defined(CONFIG_NET) 3961 struct socket *sock; 3962 int ret; 3963 3964 if (issue_flags & IO_URING_F_NONBLOCK) 3965 return -EAGAIN; 3966 3967 sock = sock_from_file(req->file, &ret); 3968 if (unlikely(!sock)) 3969 return ret; 3970 3971 ret = __sys_shutdown_sock(sock, req->shutdown.how); 3972 if (ret < 0) 3973 req_set_fail(req); 3974 io_req_complete(req, ret); 3975 return 0; 3976#else 3977 return -EOPNOTSUPP; 3978#endif 3979} 3980 3981static int __io_splice_prep(struct io_kiocb *req, 3982 const struct io_uring_sqe *sqe) 3983{ 3984 struct io_splice *sp = &req->splice; 3985 unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; 3986 3987 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 3988 return -EINVAL; 3989 3990 sp->len = READ_ONCE(sqe->len); 3991 sp->flags = READ_ONCE(sqe->splice_flags); 3992 if (unlikely(sp->flags & ~valid_flags)) 3993 return -EINVAL; 3994 sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); 3995 return 0; 3996} 3997 3998static int io_tee_prep(struct io_kiocb *req, 3999 const struct io_uring_sqe *sqe) 4000{ 4001 if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) 4002 return -EINVAL; 4003 return __io_splice_prep(req, sqe); 4004} 4005 4006static int io_tee(struct io_kiocb *req, unsigned int issue_flags) 4007{ 4008 struct io_splice *sp = &req->splice; 4009 struct file *out = sp->file_out; 4010 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4011 struct file *in; 4012 long ret = 0; 4013 4014 if (issue_flags & IO_URING_F_NONBLOCK) 4015 return -EAGAIN; 4016 4017 in = io_file_get(req->ctx, req, sp->splice_fd_in, 4018 (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags); 4019 if (!in) { 4020 ret = -EBADF; 4021 goto done; 4022 } 4023 4024 if (sp->len) 4025 ret = do_tee(in, out, sp->len, flags); 4026 4027 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4028 io_put_file(in); 4029done: 4030 if (ret != sp->len) 4031 req_set_fail(req); 4032 io_req_complete(req, ret); 4033 return 0; 4034} 4035 4036static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4037{ 4038 struct io_splice *sp = &req->splice; 4039 4040 sp->off_in = READ_ONCE(sqe->splice_off_in); 4041 sp->off_out = READ_ONCE(sqe->off); 4042 return __io_splice_prep(req, sqe); 4043} 4044 4045static int io_splice(struct io_kiocb *req, unsigned int issue_flags) 4046{ 4047 struct io_splice *sp = &req->splice; 4048 struct file *out = sp->file_out; 4049 unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; 4050 loff_t *poff_in, *poff_out; 4051 struct file *in; 4052 long ret = 0; 4053 4054 if (issue_flags & IO_URING_F_NONBLOCK) 4055 return -EAGAIN; 4056 4057 in = io_file_get(req->ctx, req, sp->splice_fd_in, 4058 (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags); 4059 if (!in) { 4060 ret = -EBADF; 4061 goto done; 4062 } 4063 4064 poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; 4065 poff_out = (sp->off_out == -1) ? 
NULL : &sp->off_out; 4066 4067 if (sp->len) 4068 ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); 4069 4070 if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) 4071 io_put_file(in); 4072done: 4073 if (ret != sp->len) 4074 req_set_fail(req); 4075 io_req_complete(req, ret); 4076 return 0; 4077} 4078 4079/* 4080 * IORING_OP_NOP just posts a completion event, nothing else. 4081 */ 4082static int io_nop(struct io_kiocb *req, unsigned int issue_flags) 4083{ 4084 struct io_ring_ctx *ctx = req->ctx; 4085 4086 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4087 return -EINVAL; 4088 4089 __io_req_complete(req, issue_flags, 0, 0); 4090 return 0; 4091} 4092 4093static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4094{ 4095 struct io_ring_ctx *ctx = req->ctx; 4096 4097 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4098 return -EINVAL; 4099 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4100 sqe->splice_fd_in)) 4101 return -EINVAL; 4102 4103 req->sync.flags = READ_ONCE(sqe->fsync_flags); 4104 if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) 4105 return -EINVAL; 4106 4107 req->sync.off = READ_ONCE(sqe->off); 4108 req->sync.len = READ_ONCE(sqe->len); 4109 return 0; 4110} 4111 4112static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) 4113{ 4114 loff_t end = req->sync.off + req->sync.len; 4115 int ret; 4116 4117 /* fsync always requires a blocking context */ 4118 if (issue_flags & IO_URING_F_NONBLOCK) 4119 return -EAGAIN; 4120 4121 ret = vfs_fsync_range(req->file, req->sync.off, 4122 end > 0 ? end : LLONG_MAX, 4123 req->sync.flags & IORING_FSYNC_DATASYNC); 4124 if (ret < 0) 4125 req_set_fail(req); 4126 io_req_complete(req, ret); 4127 return 0; 4128} 4129 4130static int io_fallocate_prep(struct io_kiocb *req, 4131 const struct io_uring_sqe *sqe) 4132{ 4133 if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || 4134 sqe->splice_fd_in) 4135 return -EINVAL; 4136 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4137 return -EINVAL; 4138 4139 req->sync.off = READ_ONCE(sqe->off); 4140 req->sync.len = READ_ONCE(sqe->addr); 4141 req->sync.mode = READ_ONCE(sqe->len); 4142 return 0; 4143} 4144 4145static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) 4146{ 4147 int ret; 4148 4149 /* fallocate always requiring blocking context */ 4150 if (issue_flags & IO_URING_F_NONBLOCK) 4151 return -EAGAIN; 4152 ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, 4153 req->sync.len); 4154 if (ret < 0) 4155 req_set_fail(req); 4156 else 4157 fsnotify_modify(req->file); 4158 io_req_complete(req, ret); 4159 return 0; 4160} 4161 4162static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4163{ 4164 const char __user *fname; 4165 int ret; 4166 4167 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4168 return -EINVAL; 4169 if (unlikely(sqe->ioprio || sqe->buf_index)) 4170 return -EINVAL; 4171 if (unlikely(req->flags & REQ_F_FIXED_FILE)) 4172 return -EBADF; 4173 4174 /* open.how should be already initialised */ 4175 if (!(req->open.how.flags & O_PATH) && force_o_largefile()) 4176 req->open.how.flags |= O_LARGEFILE; 4177 4178 req->open.dfd = READ_ONCE(sqe->fd); 4179 fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4180 req->open.filename = getname(fname); 4181 if (IS_ERR(req->open.filename)) { 4182 ret = PTR_ERR(req->open.filename); 4183 req->open.filename = NULL; 4184 return ret; 4185 } 4186 4187 req->open.file_slot = READ_ONCE(sqe->file_index); 4188 if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) 4189 
return -EINVAL; 4190 4191 req->open.nofile = rlimit(RLIMIT_NOFILE); 4192 req->flags |= REQ_F_NEED_CLEANUP; 4193 return 0; 4194} 4195 4196static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4197{ 4198 u64 mode = READ_ONCE(sqe->len); 4199 u64 flags = READ_ONCE(sqe->open_flags); 4200 4201 req->open.how = build_open_how(flags, mode); 4202 return __io_openat_prep(req, sqe); 4203} 4204 4205static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4206{ 4207 struct open_how __user *how; 4208 size_t len; 4209 int ret; 4210 4211 how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4212 len = READ_ONCE(sqe->len); 4213 if (len < OPEN_HOW_SIZE_VER0) 4214 return -EINVAL; 4215 4216 ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, 4217 len); 4218 if (ret) 4219 return ret; 4220 4221 return __io_openat_prep(req, sqe); 4222} 4223 4224static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) 4225{ 4226 struct open_flags op; 4227 struct file *file; 4228 bool resolve_nonblock, nonblock_set; 4229 bool fixed = !!req->open.file_slot; 4230 int ret; 4231 4232 ret = build_open_flags(&req->open.how, &op); 4233 if (ret) 4234 goto err; 4235 nonblock_set = op.open_flag & O_NONBLOCK; 4236 resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; 4237 if (issue_flags & IO_URING_F_NONBLOCK) { 4238 /* 4239 * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, 4240 * it'll always -EAGAIN. Note that we test for __O_TMPFILE 4241 * because O_TMPFILE includes O_DIRECTORY, which isn't a flag 4242 * we need to force async for. 4243 */ 4244 if (req->open.how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE)) 4245 return -EAGAIN; 4246 op.lookup_flags |= LOOKUP_CACHED; 4247 op.open_flag |= O_NONBLOCK; 4248 } 4249 4250 if (!fixed) { 4251 ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); 4252 if (ret < 0) 4253 goto err; 4254 } 4255 4256 file = do_filp_open(req->open.dfd, req->open.filename, &op); 4257 if (IS_ERR(file)) { 4258 /* 4259 * We could hang on to this 'fd' on retrying, but seems like 4260 * marginal gain for something that is now known to be a slower 4261 * path. So just put it, and we'll get a new one when we retry. 
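 * The fd has not been published to userspace with fd_install() yet
 * (that only happens further down on success), so dropping it here and
 * reserving a fresh one on the retry is not observable.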
4262 */ 4263 if (!fixed) 4264 put_unused_fd(ret); 4265 4266 ret = PTR_ERR(file); 4267 /* only retry if RESOLVE_CACHED wasn't already set by application */ 4268 if (ret == -EAGAIN && 4269 (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) 4270 return -EAGAIN; 4271 goto err; 4272 } 4273 4274 if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) 4275 file->f_flags &= ~O_NONBLOCK; 4276 fsnotify_open(file); 4277 4278 if (!fixed) 4279 fd_install(ret, file); 4280 else 4281 ret = io_install_fixed_file(req, file, issue_flags, 4282 req->open.file_slot - 1); 4283err: 4284 putname(req->open.filename); 4285 req->flags &= ~REQ_F_NEED_CLEANUP; 4286 if (ret < 0) 4287 req_set_fail(req); 4288 __io_req_complete(req, issue_flags, ret, 0); 4289 return 0; 4290} 4291 4292static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 4293{ 4294 return io_openat2(req, issue_flags); 4295} 4296 4297static int io_remove_buffers_prep(struct io_kiocb *req, 4298 const struct io_uring_sqe *sqe) 4299{ 4300 struct io_provide_buf *p = &req->pbuf; 4301 u64 tmp; 4302 4303 if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || 4304 sqe->splice_fd_in) 4305 return -EINVAL; 4306 4307 tmp = READ_ONCE(sqe->fd); 4308 if (!tmp || tmp > USHRT_MAX) 4309 return -EINVAL; 4310 4311 memset(p, 0, sizeof(*p)); 4312 p->nbufs = tmp; 4313 p->bgid = READ_ONCE(sqe->buf_group); 4314 return 0; 4315} 4316 4317static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, 4318 int bgid, unsigned nbufs) 4319{ 4320 unsigned i = 0; 4321 4322 /* shouldn't happen */ 4323 if (!nbufs) 4324 return 0; 4325 4326 /* the head kbuf is the list itself */ 4327 while (!list_empty(&buf->list)) { 4328 struct io_buffer *nxt; 4329 4330 nxt = list_first_entry(&buf->list, struct io_buffer, list); 4331 list_del(&nxt->list); 4332 kfree(nxt); 4333 if (++i == nbufs) 4334 return i; 4335 cond_resched(); 4336 } 4337 i++; 4338 kfree(buf); 4339 xa_erase(&ctx->io_buffers, bgid); 4340 4341 return i; 4342} 4343 4344static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) 4345{ 4346 struct io_provide_buf *p = &req->pbuf; 4347 struct io_ring_ctx *ctx = req->ctx; 4348 struct io_buffer *head; 4349 int ret = 0; 4350 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4351 4352 io_ring_submit_lock(ctx, !force_nonblock); 4353 4354 lockdep_assert_held(&ctx->uring_lock); 4355 4356 ret = -ENOENT; 4357 head = xa_load(&ctx->io_buffers, p->bgid); 4358 if (head) 4359 ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs); 4360 if (ret < 0) 4361 req_set_fail(req); 4362 4363 /* complete before unlock, IOPOLL may need the lock */ 4364 __io_req_complete(req, issue_flags, ret, 0); 4365 io_ring_submit_unlock(ctx, !force_nonblock); 4366 return 0; 4367} 4368 4369static int io_provide_buffers_prep(struct io_kiocb *req, 4370 const struct io_uring_sqe *sqe) 4371{ 4372 unsigned long size, tmp_check; 4373 struct io_provide_buf *p = &req->pbuf; 4374 u64 tmp; 4375 4376 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 4377 return -EINVAL; 4378 4379 tmp = READ_ONCE(sqe->fd); 4380 if (!tmp || tmp > USHRT_MAX) 4381 return -E2BIG; 4382 p->nbufs = tmp; 4383 p->addr = READ_ONCE(sqe->addr); 4384 p->len = READ_ONCE(sqe->len); 4385 4386 if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, 4387 &size)) 4388 return -EOVERFLOW; 4389 if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) 4390 return -EOVERFLOW; 4391 4392 size = (unsigned long)p->len * p->nbufs; 4393 if (!access_ok(u64_to_user_ptr(p->addr), size)) 
4394 return -EFAULT; 4395 4396 p->bgid = READ_ONCE(sqe->buf_group); 4397 tmp = READ_ONCE(sqe->off); 4398 if (tmp > USHRT_MAX) 4399 return -E2BIG; 4400 p->bid = tmp; 4401 return 0; 4402} 4403 4404static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) 4405{ 4406 struct io_buffer *buf; 4407 u64 addr = pbuf->addr; 4408 int i, bid = pbuf->bid; 4409 4410 for (i = 0; i < pbuf->nbufs; i++) { 4411 buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); 4412 if (!buf) 4413 break; 4414 4415 buf->addr = addr; 4416 buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); 4417 buf->bid = bid; 4418 addr += pbuf->len; 4419 bid++; 4420 if (!*head) { 4421 INIT_LIST_HEAD(&buf->list); 4422 *head = buf; 4423 } else { 4424 list_add_tail(&buf->list, &(*head)->list); 4425 } 4426 cond_resched(); 4427 } 4428 4429 return i ? i : -ENOMEM; 4430} 4431 4432static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) 4433{ 4434 struct io_provide_buf *p = &req->pbuf; 4435 struct io_ring_ctx *ctx = req->ctx; 4436 struct io_buffer *head, *list; 4437 int ret = 0; 4438 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4439 4440 io_ring_submit_lock(ctx, !force_nonblock); 4441 4442 lockdep_assert_held(&ctx->uring_lock); 4443 4444 list = head = xa_load(&ctx->io_buffers, p->bgid); 4445 4446 ret = io_add_buffers(p, &head); 4447 if (ret >= 0 && !list) { 4448 ret = xa_insert(&ctx->io_buffers, p->bgid, head, 4449 GFP_KERNEL_ACCOUNT); 4450 if (ret < 0) 4451 __io_remove_buffers(ctx, head, p->bgid, -1U); 4452 } 4453 if (ret < 0) 4454 req_set_fail(req); 4455 /* complete before unlock, IOPOLL may need the lock */ 4456 __io_req_complete(req, issue_flags, ret, 0); 4457 io_ring_submit_unlock(ctx, !force_nonblock); 4458 return 0; 4459} 4460 4461static int io_epoll_ctl_prep(struct io_kiocb *req, 4462 const struct io_uring_sqe *sqe) 4463{ 4464#if defined(CONFIG_EPOLL) 4465 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4466 return -EINVAL; 4467 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4468 return -EINVAL; 4469 4470 req->epoll.epfd = READ_ONCE(sqe->fd); 4471 req->epoll.op = READ_ONCE(sqe->len); 4472 req->epoll.fd = READ_ONCE(sqe->off); 4473 4474 if (ep_op_has_event(req->epoll.op)) { 4475 struct epoll_event __user *ev; 4476 4477 ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4478 if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) 4479 return -EFAULT; 4480 } 4481 4482 return 0; 4483#else 4484 return -EOPNOTSUPP; 4485#endif 4486} 4487 4488static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) 4489{ 4490#if defined(CONFIG_EPOLL) 4491 struct io_epoll *ie = &req->epoll; 4492 int ret; 4493 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 4494 4495 ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); 4496 if (force_nonblock && ret == -EAGAIN) 4497 return -EAGAIN; 4498 4499 if (ret < 0) 4500 req_set_fail(req); 4501 __io_req_complete(req, issue_flags, ret, 0); 4502 return 0; 4503#else 4504 return -EOPNOTSUPP; 4505#endif 4506} 4507 4508static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4509{ 4510#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4511 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) 4512 return -EINVAL; 4513 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4514 return -EINVAL; 4515 4516 req->madvise.addr = READ_ONCE(sqe->addr); 4517 req->madvise.len = READ_ONCE(sqe->len); 4518 req->madvise.advice = READ_ONCE(sqe->fadvise_advice); 4519 return 0; 4520#else 4521 return -EOPNOTSUPP; 
4522#endif 4523} 4524 4525static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) 4526{ 4527#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) 4528 struct io_madvise *ma = &req->madvise; 4529 int ret; 4530 4531 if (issue_flags & IO_URING_F_NONBLOCK) 4532 return -EAGAIN; 4533 4534 ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); 4535 if (ret < 0) 4536 req_set_fail(req); 4537 io_req_complete(req, ret); 4538 return 0; 4539#else 4540 return -EOPNOTSUPP; 4541#endif 4542} 4543 4544static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4545{ 4546 if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) 4547 return -EINVAL; 4548 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4549 return -EINVAL; 4550 4551 req->fadvise.offset = READ_ONCE(sqe->off); 4552 req->fadvise.len = READ_ONCE(sqe->len); 4553 req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); 4554 return 0; 4555} 4556 4557static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) 4558{ 4559 struct io_fadvise *fa = &req->fadvise; 4560 int ret; 4561 4562 if (issue_flags & IO_URING_F_NONBLOCK) { 4563 switch (fa->advice) { 4564 case POSIX_FADV_NORMAL: 4565 case POSIX_FADV_RANDOM: 4566 case POSIX_FADV_SEQUENTIAL: 4567 break; 4568 default: 4569 return -EAGAIN; 4570 } 4571 } 4572 4573 ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); 4574 if (ret < 0) 4575 req_set_fail(req); 4576 __io_req_complete(req, issue_flags, ret, 0); 4577 return 0; 4578} 4579 4580static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4581{ 4582 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4583 return -EINVAL; 4584 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 4585 return -EINVAL; 4586 if (req->flags & REQ_F_FIXED_FILE) 4587 return -EBADF; 4588 4589 req->statx.dfd = READ_ONCE(sqe->fd); 4590 req->statx.mask = READ_ONCE(sqe->len); 4591 req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4592 req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 4593 req->statx.flags = READ_ONCE(sqe->statx_flags); 4594 4595 return 0; 4596} 4597 4598static int io_statx(struct io_kiocb *req, unsigned int issue_flags) 4599{ 4600 struct io_statx *ctx = &req->statx; 4601 int ret; 4602 4603 if (issue_flags & IO_URING_F_NONBLOCK) 4604 return -EAGAIN; 4605 4606 ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, 4607 ctx->buffer); 4608 4609 if (ret < 0) 4610 req_set_fail(req); 4611 io_req_complete(req, ret); 4612 return 0; 4613} 4614 4615static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4616{ 4617 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4618 return -EINVAL; 4619 if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || 4620 sqe->rw_flags || sqe->buf_index) 4621 return -EINVAL; 4622 if (req->flags & REQ_F_FIXED_FILE) 4623 return -EBADF; 4624 4625 req->close.fd = READ_ONCE(sqe->fd); 4626 req->close.file_slot = READ_ONCE(sqe->file_index); 4627 if (req->close.file_slot && req->close.fd) 4628 return -EINVAL; 4629 4630 return 0; 4631} 4632 4633static int io_close(struct io_kiocb *req, unsigned int issue_flags) 4634{ 4635 struct files_struct *files = current->files; 4636 struct io_close *close = &req->close; 4637 struct fdtable *fdt; 4638 struct file *file = NULL; 4639 int ret = -EBADF; 4640 4641 if (req->close.file_slot) { 4642 ret = io_close_fixed(req, issue_flags); 4643 goto err; 4644 } 4645 4646 spin_lock(&files->file_lock); 4647 fdt = files_fdtable(files); 4648 if (close->fd >= fdt->max_fds) { 
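		/* fd number is past the end of the current fdtable, nothing to close */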
4649 spin_unlock(&files->file_lock); 4650 goto err; 4651 } 4652 file = fdt->fd[close->fd]; 4653 if (!file || file->f_op == &io_uring_fops) { 4654 spin_unlock(&files->file_lock); 4655 file = NULL; 4656 goto err; 4657 } 4658 4659 /* if the file has a flush method, be safe and punt to async */ 4660 if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { 4661 spin_unlock(&files->file_lock); 4662 return -EAGAIN; 4663 } 4664 4665 ret = __close_fd_get_file(close->fd, &file); 4666 spin_unlock(&files->file_lock); 4667 if (ret < 0) { 4668 if (ret == -ENOENT) 4669 ret = -EBADF; 4670 goto err; 4671 } 4672 4673 /* No ->flush() or already async, safely close from here */ 4674 ret = filp_close(file, current->files); 4675err: 4676 if (ret < 0) 4677 req_set_fail(req); 4678 if (file) 4679 fput(file); 4680 __io_req_complete(req, issue_flags, ret, 0); 4681 return 0; 4682} 4683 4684static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 4685{ 4686 struct io_ring_ctx *ctx = req->ctx; 4687 4688 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 4689 return -EINVAL; 4690 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || 4691 sqe->splice_fd_in)) 4692 return -EINVAL; 4693 4694 req->sync.off = READ_ONCE(sqe->off); 4695 req->sync.len = READ_ONCE(sqe->len); 4696 req->sync.flags = READ_ONCE(sqe->sync_range_flags); 4697 return 0; 4698} 4699 4700static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) 4701{ 4702 int ret; 4703 4704 /* sync_file_range always requires a blocking context */ 4705 if (issue_flags & IO_URING_F_NONBLOCK) 4706 return -EAGAIN; 4707 4708 ret = sync_file_range(req->file, req->sync.off, req->sync.len, 4709 req->sync.flags); 4710 if (ret < 0) 4711 req_set_fail(req); 4712 io_req_complete(req, ret); 4713 return 0; 4714} 4715 4716#if defined(CONFIG_NET) 4717static bool io_net_retry(struct socket *sock, int flags) 4718{ 4719 if (!(flags & MSG_WAITALL)) 4720 return false; 4721 return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; 4722} 4723 4724static int io_setup_async_msg(struct io_kiocb *req, 4725 struct io_async_msghdr *kmsg) 4726{ 4727 struct io_async_msghdr *async_msg = req->async_data; 4728 4729 if (async_msg) 4730 return -EAGAIN; 4731 if (io_alloc_async_data(req)) { 4732 kfree(kmsg->free_iov); 4733 return -ENOMEM; 4734 } 4735 async_msg = req->async_data; 4736 req->flags |= REQ_F_NEED_CLEANUP; 4737 memcpy(async_msg, kmsg, sizeof(*kmsg)); 4738 if (async_msg->msg.msg_name) 4739 async_msg->msg.msg_name = &async_msg->addr; 4740 /* if were using fast_iov, set it to the new one */ 4741 if (!kmsg->free_iov) { 4742 size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; 4743 async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; 4744 } 4745 4746 return -EAGAIN; 4747} 4748 4749static int io_sendmsg_copy_hdr(struct io_kiocb *req, 4750 struct io_async_msghdr *iomsg) 4751{ 4752 struct io_sr_msg *sr = &req->sr_msg; 4753 int ret; 4754 4755 iomsg->msg.msg_name = &iomsg->addr; 4756 iomsg->free_iov = iomsg->fast_iov; 4757 ret = sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, 4758 req->sr_msg.msg_flags, &iomsg->free_iov); 4759 /* save msg_control as sys_sendmsg() overwrites it */ 4760 sr->msg_control = iomsg->msg.msg_control; 4761 return ret; 4762} 4763 4764static int io_sendmsg_prep_async(struct io_kiocb *req) 4765{ 4766 int ret; 4767 4768 ret = io_sendmsg_copy_hdr(req, req->async_data); 4769 if (!ret) 4770 req->flags |= REQ_F_NEED_CLEANUP; 4771 return ret; 4772} 4773 4774static int io_sendmsg_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe) 4775{ 4776 struct io_sr_msg *sr = &req->sr_msg; 4777 4778 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 4779 return -EINVAL; 4780 if (unlikely(sqe->addr2 || sqe->file_index)) 4781 return -EINVAL; 4782 if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio)) 4783 return -EINVAL; 4784 4785 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 4786 sr->len = READ_ONCE(sqe->len); 4787 sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; 4788 if (sr->msg_flags & MSG_DONTWAIT) 4789 req->flags |= REQ_F_NOWAIT; 4790 4791#ifdef CONFIG_COMPAT 4792 if (req->ctx->compat) 4793 sr->msg_flags |= MSG_CMSG_COMPAT; 4794#endif 4795 sr->done_io = 0; 4796 return 0; 4797} 4798 4799static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) 4800{ 4801 struct io_async_msghdr iomsg, *kmsg; 4802 struct io_sr_msg *sr = &req->sr_msg; 4803 struct socket *sock; 4804 unsigned flags; 4805 int min_ret = 0; 4806 int ret; 4807 4808 sock = sock_from_file(req->file, &ret); 4809 if (unlikely(!sock)) 4810 return ret; 4811 4812 kmsg = req->async_data; 4813 if (!kmsg) { 4814 ret = io_sendmsg_copy_hdr(req, &iomsg); 4815 if (ret) 4816 return ret; 4817 kmsg = &iomsg; 4818 } else { 4819 kmsg->msg.msg_control = sr->msg_control; 4820 } 4821 4822 flags = req->sr_msg.msg_flags; 4823 if (issue_flags & IO_URING_F_NONBLOCK) 4824 flags |= MSG_DONTWAIT; 4825 if (flags & MSG_WAITALL) 4826 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 4827 4828 ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 4829 4830 if (ret < min_ret) { 4831 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 4832 return io_setup_async_msg(req, kmsg); 4833 if (ret == -ERESTARTSYS) 4834 ret = -EINTR; 4835 if (ret > 0 && io_net_retry(sock, flags)) { 4836 sr->done_io += ret; 4837 req->flags |= REQ_F_PARTIAL_IO; 4838 return io_setup_async_msg(req, kmsg); 4839 } 4840 req_set_fail(req); 4841 } 4842 /* fast path, check for non-NULL to avoid function call */ 4843 if (kmsg->free_iov) 4844 kfree(kmsg->free_iov); 4845 req->flags &= ~REQ_F_NEED_CLEANUP; 4846 if (ret >= 0) 4847 ret += sr->done_io; 4848 else if (sr->done_io) 4849 ret = sr->done_io; 4850 __io_req_complete(req, issue_flags, ret, 0); 4851 return 0; 4852} 4853 4854static int io_send(struct io_kiocb *req, unsigned int issue_flags) 4855{ 4856 struct io_sr_msg *sr = &req->sr_msg; 4857 struct msghdr msg; 4858 struct iovec iov; 4859 struct socket *sock; 4860 unsigned flags; 4861 int min_ret = 0; 4862 int ret; 4863 4864 sock = sock_from_file(req->file, &ret); 4865 if (unlikely(!sock)) 4866 return ret; 4867 4868 ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); 4869 if (unlikely(ret)) 4870 return ret; 4871 4872 msg.msg_name = NULL; 4873 msg.msg_control = NULL; 4874 msg.msg_controllen = 0; 4875 msg.msg_namelen = 0; 4876 4877 flags = req->sr_msg.msg_flags; 4878 if (issue_flags & IO_URING_F_NONBLOCK) 4879 flags |= MSG_DONTWAIT; 4880 if (flags & MSG_WAITALL) 4881 min_ret = iov_iter_count(&msg.msg_iter); 4882 4883 msg.msg_flags = flags; 4884 ret = sock_sendmsg(sock, &msg); 4885 if (ret < min_ret) { 4886 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 4887 return -EAGAIN; 4888 if (ret == -ERESTARTSYS) 4889 ret = -EINTR; 4890 if (ret > 0 && io_net_retry(sock, flags)) { 4891 sr->len -= ret; 4892 sr->buf += ret; 4893 sr->done_io += ret; 4894 req->flags |= REQ_F_PARTIAL_IO; 4895 return -EAGAIN; 4896 } 4897 req_set_fail(req); 4898 } 4899 if (ret >= 0) 4900 ret += sr->done_io; 4901 else if (sr->done_io) 4902 ret = sr->done_io; 4903 __io_req_complete(req, 
issue_flags, ret, 0); 4904 return 0; 4905} 4906 4907static int __io_recvmsg_copy_hdr(struct io_kiocb *req, 4908 struct io_async_msghdr *iomsg) 4909{ 4910 struct io_sr_msg *sr = &req->sr_msg; 4911 struct iovec __user *uiov; 4912 size_t iov_len; 4913 int ret; 4914 4915 ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, 4916 &iomsg->uaddr, &uiov, &iov_len); 4917 if (ret) 4918 return ret; 4919 4920 if (req->flags & REQ_F_BUFFER_SELECT) { 4921 if (iov_len > 1) 4922 return -EINVAL; 4923 if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) 4924 return -EFAULT; 4925 sr->len = iomsg->fast_iov[0].iov_len; 4926 iomsg->free_iov = NULL; 4927 } else { 4928 iomsg->free_iov = iomsg->fast_iov; 4929 ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, 4930 &iomsg->free_iov, &iomsg->msg.msg_iter, 4931 false); 4932 if (ret > 0) 4933 ret = 0; 4934 } 4935 4936 return ret; 4937} 4938 4939#ifdef CONFIG_COMPAT 4940static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, 4941 struct io_async_msghdr *iomsg) 4942{ 4943 struct io_sr_msg *sr = &req->sr_msg; 4944 struct compat_iovec __user *uiov; 4945 compat_uptr_t ptr; 4946 compat_size_t len; 4947 int ret; 4948 4949 ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, 4950 &ptr, &len); 4951 if (ret) 4952 return ret; 4953 4954 uiov = compat_ptr(ptr); 4955 if (req->flags & REQ_F_BUFFER_SELECT) { 4956 compat_ssize_t clen; 4957 4958 if (len > 1) 4959 return -EINVAL; 4960 if (!access_ok(uiov, sizeof(*uiov))) 4961 return -EFAULT; 4962 if (__get_user(clen, &uiov->iov_len)) 4963 return -EFAULT; 4964 if (clen < 0) 4965 return -EINVAL; 4966 sr->len = clen; 4967 iomsg->free_iov = NULL; 4968 } else { 4969 iomsg->free_iov = iomsg->fast_iov; 4970 ret = __import_iovec(READ, (struct iovec __user *)uiov, len, 4971 UIO_FASTIOV, &iomsg->free_iov, 4972 &iomsg->msg.msg_iter, true); 4973 if (ret < 0) 4974 return ret; 4975 } 4976 4977 return 0; 4978} 4979#endif 4980 4981static int io_recvmsg_copy_hdr(struct io_kiocb *req, 4982 struct io_async_msghdr *iomsg) 4983{ 4984 iomsg->msg.msg_name = &iomsg->addr; 4985 4986#ifdef CONFIG_COMPAT 4987 if (req->ctx->compat) 4988 return __io_compat_recvmsg_copy_hdr(req, iomsg); 4989#endif 4990 4991 return __io_recvmsg_copy_hdr(req, iomsg); 4992} 4993 4994static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, 4995 bool needs_lock) 4996{ 4997 struct io_sr_msg *sr = &req->sr_msg; 4998 struct io_buffer *kbuf; 4999 5000 kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); 5001 if (IS_ERR(kbuf)) 5002 return kbuf; 5003 5004 sr->kbuf = kbuf; 5005 req->flags |= REQ_F_BUFFER_SELECTED; 5006 return kbuf; 5007} 5008 5009static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) 5010{ 5011 return io_put_kbuf(req, req->sr_msg.kbuf); 5012} 5013 5014static int io_recvmsg_prep_async(struct io_kiocb *req) 5015{ 5016 int ret; 5017 5018 ret = io_recvmsg_copy_hdr(req, req->async_data); 5019 if (!ret) 5020 req->flags |= REQ_F_NEED_CLEANUP; 5021 return ret; 5022} 5023 5024static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5025{ 5026 struct io_sr_msg *sr = &req->sr_msg; 5027 5028 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5029 return -EINVAL; 5030 if (unlikely(sqe->addr2 || sqe->file_index)) 5031 return -EINVAL; 5032 if (unlikely(sqe->addr2 || sqe->file_index || sqe->ioprio)) 5033 return -EINVAL; 5034 5035 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5036 sr->len = READ_ONCE(sqe->len); 5037 sr->bgid = READ_ONCE(sqe->buf_group); 5038 sr->msg_flags = 
READ_ONCE(sqe->msg_flags); 5039 if (sr->msg_flags & MSG_DONTWAIT) 5040 req->flags |= REQ_F_NOWAIT; 5041 5042#ifdef CONFIG_COMPAT 5043 if (req->ctx->compat) 5044 sr->msg_flags |= MSG_CMSG_COMPAT; 5045#endif 5046 sr->done_io = 0; 5047 return 0; 5048} 5049 5050static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) 5051{ 5052 struct io_async_msghdr iomsg, *kmsg; 5053 struct io_sr_msg *sr = &req->sr_msg; 5054 struct socket *sock; 5055 struct io_buffer *kbuf; 5056 unsigned flags; 5057 int min_ret = 0; 5058 int ret, cflags = 0; 5059 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5060 5061 sock = sock_from_file(req->file, &ret); 5062 if (unlikely(!sock)) 5063 return ret; 5064 5065 kmsg = req->async_data; 5066 if (!kmsg) { 5067 ret = io_recvmsg_copy_hdr(req, &iomsg); 5068 if (ret) 5069 return ret; 5070 kmsg = &iomsg; 5071 } 5072 5073 if (req->flags & REQ_F_BUFFER_SELECT) { 5074 kbuf = io_recv_buffer_select(req, !force_nonblock); 5075 if (IS_ERR(kbuf)) 5076 return PTR_ERR(kbuf); 5077 kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); 5078 kmsg->fast_iov[0].iov_len = req->sr_msg.len; 5079 iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 5080 1, req->sr_msg.len); 5081 } 5082 5083 flags = req->sr_msg.msg_flags; 5084 if (force_nonblock) 5085 flags |= MSG_DONTWAIT; 5086 if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) 5087 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 5088 5089 ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, 5090 kmsg->uaddr, flags); 5091 if (ret < min_ret) { 5092 if (ret == -EAGAIN && force_nonblock) 5093 return io_setup_async_msg(req, kmsg); 5094 if (ret == -ERESTARTSYS) 5095 ret = -EINTR; 5096 if (ret > 0 && io_net_retry(sock, flags)) { 5097 kmsg->msg.msg_controllen = 0; 5098 kmsg->msg.msg_control = NULL; 5099 sr->done_io += ret; 5100 req->flags |= REQ_F_PARTIAL_IO; 5101 return io_setup_async_msg(req, kmsg); 5102 } 5103 req_set_fail(req); 5104 } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 5105 req_set_fail(req); 5106 } 5107 5108 if (req->flags & REQ_F_BUFFER_SELECTED) 5109 cflags = io_put_recv_kbuf(req); 5110 /* fast path, check for non-NULL to avoid function call */ 5111 if (kmsg->free_iov) 5112 kfree(kmsg->free_iov); 5113 req->flags &= ~REQ_F_NEED_CLEANUP; 5114 if (ret >= 0) 5115 ret += sr->done_io; 5116 else if (sr->done_io) 5117 ret = sr->done_io; 5118 __io_req_complete(req, issue_flags, ret, cflags); 5119 return 0; 5120} 5121 5122static int io_recv(struct io_kiocb *req, unsigned int issue_flags) 5123{ 5124 struct io_buffer *kbuf; 5125 struct io_sr_msg *sr = &req->sr_msg; 5126 struct msghdr msg; 5127 void __user *buf = sr->buf; 5128 struct socket *sock; 5129 struct iovec iov; 5130 unsigned flags; 5131 int min_ret = 0; 5132 int ret, cflags = 0; 5133 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5134 5135 sock = sock_from_file(req->file, &ret); 5136 if (unlikely(!sock)) 5137 return ret; 5138 5139 if (req->flags & REQ_F_BUFFER_SELECT) { 5140 kbuf = io_recv_buffer_select(req, !force_nonblock); 5141 if (IS_ERR(kbuf)) 5142 return PTR_ERR(kbuf); 5143 buf = u64_to_user_ptr(kbuf->addr); 5144 } 5145 5146 ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter); 5147 if (unlikely(ret)) 5148 goto out_free; 5149 5150 msg.msg_name = NULL; 5151 msg.msg_control = NULL; 5152 msg.msg_controllen = 0; 5153 msg.msg_namelen = 0; 5154 msg.msg_iocb = NULL; 5155 msg.msg_flags = 0; 5156 5157 flags = req->sr_msg.msg_flags; 5158 if (force_nonblock) 5159 flags |= MSG_DONTWAIT; 5160 if 
(flags & MSG_WAITALL) 5161 min_ret = iov_iter_count(&msg.msg_iter); 5162 5163 ret = sock_recvmsg(sock, &msg, flags); 5164 if (ret < min_ret) { 5165 if (ret == -EAGAIN && force_nonblock) 5166 return -EAGAIN; 5167 if (ret == -ERESTARTSYS) 5168 ret = -EINTR; 5169 if (ret > 0 && io_net_retry(sock, flags)) { 5170 sr->len -= ret; 5171 sr->buf += ret; 5172 sr->done_io += ret; 5173 req->flags |= REQ_F_PARTIAL_IO; 5174 return -EAGAIN; 5175 } 5176 req_set_fail(req); 5177 } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { 5178out_free: 5179 req_set_fail(req); 5180 } 5181 if (req->flags & REQ_F_BUFFER_SELECTED) 5182 cflags = io_put_recv_kbuf(req); 5183 if (ret >= 0) 5184 ret += sr->done_io; 5185 else if (sr->done_io) 5186 ret = sr->done_io; 5187 __io_req_complete(req, issue_flags, ret, cflags); 5188 return 0; 5189} 5190 5191static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5192{ 5193 struct io_accept *accept = &req->accept; 5194 5195 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5196 return -EINVAL; 5197 if (sqe->ioprio || sqe->len || sqe->buf_index) 5198 return -EINVAL; 5199 5200 accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5201 accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 5202 accept->flags = READ_ONCE(sqe->accept_flags); 5203 accept->nofile = rlimit(RLIMIT_NOFILE); 5204 5205 accept->file_slot = READ_ONCE(sqe->file_index); 5206 if (accept->file_slot && (accept->flags & SOCK_CLOEXEC)) 5207 return -EINVAL; 5208 if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) 5209 return -EINVAL; 5210 if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) 5211 accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; 5212 return 0; 5213} 5214 5215static int io_accept(struct io_kiocb *req, unsigned int issue_flags) 5216{ 5217 struct io_accept *accept = &req->accept; 5218 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5219 unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5220 bool fixed = !!accept->file_slot; 5221 struct file *file; 5222 int ret, fd; 5223 5224 if (!fixed) { 5225 fd = __get_unused_fd_flags(accept->flags, accept->nofile); 5226 if (unlikely(fd < 0)) 5227 return fd; 5228 } 5229 file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, 5230 accept->flags); 5231 5232 if (IS_ERR(file)) { 5233 if (!fixed) 5234 put_unused_fd(fd); 5235 ret = PTR_ERR(file); 5236 /* safe to retry */ 5237 req->flags |= REQ_F_PARTIAL_IO; 5238 if (ret == -EAGAIN && force_nonblock) 5239 return -EAGAIN; 5240 if (ret == -ERESTARTSYS) 5241 ret = -EINTR; 5242 req_set_fail(req); 5243 } else if (!fixed) { 5244 fd_install(fd, file); 5245 ret = fd; 5246 } else { 5247 ret = io_install_fixed_file(req, file, issue_flags, 5248 accept->file_slot - 1); 5249 } 5250 __io_req_complete(req, issue_flags, ret, 0); 5251 return 0; 5252} 5253 5254static int io_connect_prep_async(struct io_kiocb *req) 5255{ 5256 struct io_async_connect *io = req->async_data; 5257 struct io_connect *conn = &req->connect; 5258 5259 return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); 5260} 5261 5262static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 5263{ 5264 struct io_connect *conn = &req->connect; 5265 5266 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5267 return -EINVAL; 5268 if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || 5269 sqe->splice_fd_in) 5270 return -EINVAL; 5271 5272 conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 5273 conn->addr_len = READ_ONCE(sqe->addr2); 5274 return 0; 5275} 5276 5277static int io_connect(struct io_kiocb *req, unsigned int issue_flags) 5278{ 5279 struct io_async_connect __io, *io; 5280 unsigned file_flags; 5281 int ret; 5282 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 5283 5284 if (req->async_data) { 5285 io = req->async_data; 5286 } else { 5287 ret = move_addr_to_kernel(req->connect.addr, 5288 req->connect.addr_len, 5289 &__io.address); 5290 if (ret) 5291 goto out; 5292 io = &__io; 5293 } 5294 5295 file_flags = force_nonblock ? 
O_NONBLOCK : 0; 5296 5297 ret = __sys_connect_file(req->file, &io->address, 5298 req->connect.addr_len, file_flags); 5299 if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { 5300 if (req->async_data) 5301 return -EAGAIN; 5302 if (io_alloc_async_data(req)) { 5303 ret = -ENOMEM; 5304 goto out; 5305 } 5306 memcpy(req->async_data, &__io, sizeof(__io)); 5307 return -EAGAIN; 5308 } 5309 if (ret == -ERESTARTSYS) 5310 ret = -EINTR; 5311out: 5312 if (ret < 0) 5313 req_set_fail(req); 5314 __io_req_complete(req, issue_flags, ret, 0); 5315 return 0; 5316} 5317#else /* !CONFIG_NET */ 5318#define IO_NETOP_FN(op) \ 5319static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ 5320{ \ 5321 return -EOPNOTSUPP; \ 5322} 5323 5324#define IO_NETOP_PREP(op) \ 5325IO_NETOP_FN(op) \ 5326static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ 5327{ \ 5328 return -EOPNOTSUPP; \ 5329} \ 5330 5331#define IO_NETOP_PREP_ASYNC(op) \ 5332IO_NETOP_PREP(op) \ 5333static int io_##op##_prep_async(struct io_kiocb *req) \ 5334{ \ 5335 return -EOPNOTSUPP; \ 5336} 5337 5338IO_NETOP_PREP_ASYNC(sendmsg); 5339IO_NETOP_PREP_ASYNC(recvmsg); 5340IO_NETOP_PREP_ASYNC(connect); 5341IO_NETOP_PREP(accept); 5342IO_NETOP_FN(send); 5343IO_NETOP_FN(recv); 5344#endif /* CONFIG_NET */ 5345 5346struct io_poll_table { 5347 struct poll_table_struct pt; 5348 struct io_kiocb *req; 5349 int nr_entries; 5350 int error; 5351}; 5352 5353#define IO_POLL_CANCEL_FLAG BIT(31) 5354#define IO_POLL_RETRY_FLAG BIT(30) 5355#define IO_POLL_REF_MASK GENMASK(29, 0) 5356 5357/* 5358 * We usually have 1-2 refs taken, 128 is more than enough and we want to 5359 * maximise the margin between this amount and the moment when it overflows. 5360 */ 5361#define IO_POLL_REF_BIAS 128 5362 5363static bool io_poll_get_ownership_slowpath(struct io_kiocb *req) 5364{ 5365 int v; 5366 5367 /* 5368 * poll_refs are already elevated and we don't have much hope for 5369 * grabbing the ownership. Instead of incrementing set a retry flag 5370 * to notify the loop that there might have been some change. 5371 */ 5372 v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs); 5373 if (v & IO_POLL_REF_MASK) 5374 return false; 5375 return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); 5376} 5377 5378/* 5379 * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can 5380 * bump it and acquire ownership. It's disallowed to modify requests while not 5381 * owning it, that prevents from races for enqueueing task_work's and b/w 5382 * arming poll and wakeups. 
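 *
 * A rough sketch of how the pieces below fit together (not exact code,
 * cancellation paths omitted):
 *
 *	arming:	atomic_set(&req->poll_refs, 1);	// armer owns the request
 *	wakeup:	if (atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK)
 *			return;			// current owner will notice
 *		__io_poll_execute(req, mask);	// we just became the owner
 *	tw end:	atomic_sub_return(refs, &req->poll_refs);
 *						// leftovers => another loop pass
 *
 * Once the counter gets near IO_POLL_REF_BIAS, io_poll_get_ownership()
 * falls back to the slowpath above, which sets IO_POLL_RETRY_FLAG (and
 * only takes a reference if the count was free) so that the refcount
 * cannot spill into the flag bits.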
5383 */ 5384static inline bool io_poll_get_ownership(struct io_kiocb *req) 5385{ 5386 if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS)) 5387 return io_poll_get_ownership_slowpath(req); 5388 return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); 5389} 5390 5391static void io_poll_mark_cancelled(struct io_kiocb *req) 5392{ 5393 atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); 5394} 5395 5396static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) 5397{ 5398 /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ 5399 if (req->opcode == IORING_OP_POLL_ADD) 5400 return req->async_data; 5401 return req->apoll->double_poll; 5402} 5403 5404static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) 5405{ 5406 if (req->opcode == IORING_OP_POLL_ADD) 5407 return &req->poll; 5408 return &req->apoll->poll; 5409} 5410 5411static void io_poll_req_insert(struct io_kiocb *req) 5412{ 5413 struct io_ring_ctx *ctx = req->ctx; 5414 struct hlist_head *list; 5415 5416 list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; 5417 hlist_add_head(&req->hash_node, list); 5418} 5419 5420static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, 5421 wait_queue_func_t wake_func) 5422{ 5423 poll->head = NULL; 5424#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) 5425 /* mask in events that we always want/need */ 5426 poll->events = events | IO_POLL_UNMASK; 5427 INIT_LIST_HEAD(&poll->wait.entry); 5428 init_waitqueue_func_entry(&poll->wait, wake_func); 5429} 5430 5431static inline void io_poll_remove_entry(struct io_poll_iocb *poll) 5432{ 5433 struct wait_queue_head *head = smp_load_acquire(&poll->head); 5434 5435 if (head) { 5436 spin_lock_irq(&head->lock); 5437 list_del_init(&poll->wait.entry); 5438 poll->head = NULL; 5439 spin_unlock_irq(&head->lock); 5440 } 5441} 5442 5443static void io_poll_remove_entries(struct io_kiocb *req) 5444{ 5445 struct io_poll_iocb *poll = io_poll_get_single(req); 5446 struct io_poll_iocb *poll_double = io_poll_get_double(req); 5447 5448 /* 5449 * While we hold the waitqueue lock and the waitqueue is nonempty, 5450 * wake_up_pollfree() will wait for us. However, taking the waitqueue 5451 * lock in the first place can race with the waitqueue being freed. 5452 * 5453 * We solve this as eventpoll does: by taking advantage of the fact that 5454 * all users of wake_up_pollfree() will RCU-delay the actual free. If 5455 * we enter rcu_read_lock() and see that the pointer to the queue is 5456 * non-NULL, we can then lock it without the memory being freed out from 5457 * under us. 5458 * 5459 * Keep holding rcu_read_lock() as long as we hold the queue lock, in 5460 * case the caller deletes the entry from the queue, leaving it empty. 5461 * In that case, only RCU prevents the queue memory from being freed. 5462 */ 5463 rcu_read_lock(); 5464 io_poll_remove_entry(poll); 5465 if (poll_double) 5466 io_poll_remove_entry(poll_double); 5467 rcu_read_unlock(); 5468} 5469 5470/* 5471 * All poll tw should go through this. Checks for poll events, manages 5472 * references, does rewait, etc. 5473 * 5474 * Returns a negative error on failure. >0 when no action require, which is 5475 * either spurious wakeup or multishot CQE is served. 0 when it's done with 5476 * the request, then the mask is stored in req->result. 
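 *
 * In short: a negative return means the request failed and is completed
 * with that error, 0 means the final poll mask is in req->result and the
 * caller should complete the request, and > 0 means there is nothing
 * left for the caller to do (spurious wakeup, or a multishot CQE has
 * already been posted).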
5477 */ 5478static int io_poll_check_events(struct io_kiocb *req) 5479{ 5480 struct io_ring_ctx *ctx = req->ctx; 5481 struct io_poll_iocb *poll = io_poll_get_single(req); 5482 int v; 5483 5484 /* req->task == current here, checking PF_EXITING is safe */ 5485 if (unlikely(req->task->flags & PF_EXITING)) 5486 io_poll_mark_cancelled(req); 5487 5488 do { 5489 v = atomic_read(&req->poll_refs); 5490 5491 /* tw handler should be the owner, and so have some references */ 5492 if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) 5493 return 0; 5494 if (v & IO_POLL_CANCEL_FLAG) 5495 return -ECANCELED; 5496 /* 5497 * cqe.res contains only events of the first wake up 5498 * and all others are lost. Redo vfs_poll() to get 5499 * up to date state. 5500 */ 5501 if ((v & IO_POLL_REF_MASK) != 1) 5502 req->result = 0; 5503 if (v & IO_POLL_RETRY_FLAG) { 5504 req->result = 0; 5505 /* 5506 * We won't find new events that came in between 5507 * vfs_poll and the ref put unless we clear the 5508 * flag in advance. 5509 */ 5510 atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); 5511 v &= ~IO_POLL_RETRY_FLAG; 5512 } 5513 5514 if (!req->result) { 5515 struct poll_table_struct pt = { ._key = poll->events }; 5516 5517 req->result = vfs_poll(req->file, &pt) & poll->events; 5518 } 5519 5520 /* multishot, just fill a CQE and proceed */ 5521 if (req->result && !(poll->events & EPOLLONESHOT)) { 5522 __poll_t mask = mangle_poll(req->result & poll->events); 5523 bool filled; 5524 5525 spin_lock(&ctx->completion_lock); 5526 filled = io_fill_cqe_aux(ctx, req->user_data, mask, 5527 IORING_CQE_F_MORE); 5528 io_commit_cqring(ctx); 5529 spin_unlock(&ctx->completion_lock); 5530 if (unlikely(!filled)) 5531 return -ECANCELED; 5532 io_cqring_ev_posted(ctx); 5533 } else if (req->result) { 5534 return 0; 5535 } 5536 5537 /* force the next iteration to vfs_poll() */ 5538 req->result = 0; 5539 5540 /* 5541 * Release all references, retry if someone tried to restart 5542 * task_work while we were executing it.
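 * If the subtraction leaves any references behind, a wakeup raced in
 * while we were running; go around the loop once more so its state is
 * not lost.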
5543 */ 5544 } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) & 5545 IO_POLL_REF_MASK); 5546 5547 return 1; 5548} 5549 5550static void io_poll_task_func(struct io_kiocb *req, bool *locked) 5551{ 5552 struct io_ring_ctx *ctx = req->ctx; 5553 int ret; 5554 5555 ret = io_poll_check_events(req); 5556 if (ret > 0) 5557 return; 5558 5559 if (!ret) { 5560 req->result = mangle_poll(req->result & req->poll.events); 5561 } else { 5562 req->result = ret; 5563 req_set_fail(req); 5564 } 5565 5566 io_poll_remove_entries(req); 5567 spin_lock(&ctx->completion_lock); 5568 hash_del(&req->hash_node); 5569 spin_unlock(&ctx->completion_lock); 5570 io_req_complete_post(req, req->result, 0); 5571} 5572 5573static void io_apoll_task_func(struct io_kiocb *req, bool *locked) 5574{ 5575 struct io_ring_ctx *ctx = req->ctx; 5576 int ret; 5577 5578 ret = io_poll_check_events(req); 5579 if (ret > 0) 5580 return; 5581 5582 io_tw_lock(req->ctx, locked); 5583 io_poll_remove_entries(req); 5584 spin_lock(&ctx->completion_lock); 5585 hash_del(&req->hash_node); 5586 spin_unlock(&ctx->completion_lock); 5587 5588 if (!ret) 5589 io_req_task_submit(req, locked); 5590 else 5591 io_req_complete_failed(req, ret); 5592} 5593 5594static void __io_poll_execute(struct io_kiocb *req, int mask) 5595{ 5596 req->result = mask; 5597 if (req->opcode == IORING_OP_POLL_ADD) 5598 req->io_task_work.func = io_poll_task_func; 5599 else 5600 req->io_task_work.func = io_apoll_task_func; 5601 5602 trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask); 5603 io_req_task_work_add(req); 5604} 5605 5606static inline void io_poll_execute(struct io_kiocb *req, int res) 5607{ 5608 if (io_poll_get_ownership(req)) 5609 __io_poll_execute(req, res); 5610} 5611 5612static void io_poll_cancel_req(struct io_kiocb *req) 5613{ 5614 io_poll_mark_cancelled(req); 5615 /* kick tw, which should complete the request */ 5616 io_poll_execute(req, 0); 5617} 5618 5619static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, 5620 void *key) 5621{ 5622 struct io_kiocb *req = wait->private; 5623 struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, 5624 wait); 5625 __poll_t mask = key_to_poll(key); 5626 5627 if (unlikely(mask & POLLFREE)) { 5628 io_poll_mark_cancelled(req); 5629 /* we have to kick tw in case it's not already */ 5630 io_poll_execute(req, 0); 5631 5632 /* 5633 * If the waitqueue is being freed early but someone is already 5634 * holds ownership over it, we have to tear down the request as 5635 * best we can. That means immediately removing the request from 5636 * its waitqueue and preventing all further accesses to the 5637 * waitqueue via the request. 5638 */ 5639 list_del_init(&poll->wait.entry); 5640 5641 /* 5642 * Careful: this *must* be the last step, since as soon 5643 * as req->head is NULL'ed out, the request can be 5644 * completed and freed, since aio_poll_complete_work() 5645 * will no longer need to take the waitqueue lock. 5646 */ 5647 smp_store_release(&poll->head, NULL); 5648 return 1; 5649 } 5650 5651 /* for instances that support it check for an event match first */ 5652 if (mask && !(mask & poll->events)) 5653 return 0; 5654 5655 if (io_poll_get_ownership(req)) { 5656 /* 5657 * If we trigger a multishot poll off our own wakeup path, 5658 * disable multishot as there is a circular dependency between 5659 * CQ posting and triggering the event. 
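 * (EPOLL_URING_WAKE is set in the wakeup key when the wakeup comes from
 * io_uring posting to its own CQ ring, e.g. when the ring fd itself is
 * being polled, which is where that circular dependency arises.)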
5660 */ 5661 if (mask & EPOLL_URING_WAKE) 5662 poll->events |= EPOLLONESHOT; 5663 5664 __io_poll_execute(req, mask); 5665 } 5666 return 1; 5667} 5668 5669static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, 5670 struct wait_queue_head *head, 5671 struct io_poll_iocb **poll_ptr) 5672{ 5673 struct io_kiocb *req = pt->req; 5674 5675 /* 5676 * The file being polled uses multiple waitqueues for poll handling 5677 * (e.g. one for read, one for write). Setup a separate io_poll_iocb 5678 * if this happens. 5679 */ 5680 if (unlikely(pt->nr_entries)) { 5681 struct io_poll_iocb *first = poll; 5682 5683 /* double add on the same waitqueue head, ignore */ 5684 if (first->head == head) 5685 return; 5686 /* already have a 2nd entry, fail a third attempt */ 5687 if (*poll_ptr) { 5688 if ((*poll_ptr)->head == head) 5689 return; 5690 pt->error = -EINVAL; 5691 return; 5692 } 5693 5694 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 5695 if (!poll) { 5696 pt->error = -ENOMEM; 5697 return; 5698 } 5699 io_init_poll_iocb(poll, first->events, first->wait.func); 5700 *poll_ptr = poll; 5701 } 5702 5703 pt->nr_entries++; 5704 poll->head = head; 5705 poll->wait.private = req; 5706 5707 if (poll->events & EPOLLEXCLUSIVE) 5708 add_wait_queue_exclusive(head, &poll->wait); 5709 else 5710 add_wait_queue(head, &poll->wait); 5711} 5712 5713static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, 5714 struct poll_table_struct *p) 5715{ 5716 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5717 5718 __io_queue_proc(&pt->req->poll, pt, head, 5719 (struct io_poll_iocb **) &pt->req->async_data); 5720} 5721 5722static int __io_arm_poll_handler(struct io_kiocb *req, 5723 struct io_poll_iocb *poll, 5724 struct io_poll_table *ipt, __poll_t mask) 5725{ 5726 struct io_ring_ctx *ctx = req->ctx; 5727 5728 INIT_HLIST_NODE(&req->hash_node); 5729 io_init_poll_iocb(poll, mask, io_poll_wake); 5730 poll->file = req->file; 5731 poll->wait.private = req; 5732 5733 ipt->pt._key = mask; 5734 ipt->req = req; 5735 ipt->error = 0; 5736 ipt->nr_entries = 0; 5737 5738 /* 5739 * Take the ownership to delay any tw execution up until we're done 5740 * with poll arming. see io_poll_get_ownership(). 5741 */ 5742 atomic_set(&req->poll_refs, 1); 5743 mask = vfs_poll(req->file, &ipt->pt) & poll->events; 5744 5745 if (mask && (poll->events & EPOLLONESHOT)) { 5746 io_poll_remove_entries(req); 5747 /* no one else has access to the req, forget about the ref */ 5748 return mask; 5749 } 5750 if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { 5751 io_poll_remove_entries(req); 5752 if (!ipt->error) 5753 ipt->error = -EINVAL; 5754 return 0; 5755 } 5756 5757 spin_lock(&ctx->completion_lock); 5758 io_poll_req_insert(req); 5759 spin_unlock(&ctx->completion_lock); 5760 5761 if (mask) { 5762 /* can't multishot if failed, just queue the event we've got */ 5763 if (unlikely(ipt->error || !ipt->nr_entries)) { 5764 poll->events |= EPOLLONESHOT; 5765 ipt->error = 0; 5766 } 5767 __io_poll_execute(req, mask); 5768 return 0; 5769 } 5770 5771 /* 5772 * Try to release ownership. If we see a change of state, e.g. 5773 * poll was waken up, queue up a tw, it'll deal with it. 
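 * (The cmpxchg below only succeeds if poll_refs is still exactly 1, i.e.
 * nothing raced with us while arming; any other value means a wakeup or
 * cancellation grabbed a reference and task_work has to run.)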
5774 */ 5775 if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1) 5776 __io_poll_execute(req, 0); 5777 return 0; 5778} 5779 5780static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, 5781 struct poll_table_struct *p) 5782{ 5783 struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); 5784 struct async_poll *apoll = pt->req->apoll; 5785 5786 __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); 5787} 5788 5789enum { 5790 IO_APOLL_OK, 5791 IO_APOLL_ABORTED, 5792 IO_APOLL_READY 5793}; 5794 5795/* 5796 * We can't reliably detect loops in repeated poll triggers and issue 5797 * subsequently failing. But rather than fail these immediately, allow a 5798 * certain amount of retries before we give up. Given that this condition 5799 * should _rarely_ trigger even once, we should be fine with a larger value. 5800 */ 5801#define APOLL_MAX_RETRY 128 5802 5803static int io_arm_poll_handler(struct io_kiocb *req) 5804{ 5805 const struct io_op_def *def = &io_op_defs[req->opcode]; 5806 struct io_ring_ctx *ctx = req->ctx; 5807 struct async_poll *apoll; 5808 struct io_poll_table ipt; 5809 __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI; 5810 int ret; 5811 5812 if (!req->file || !file_can_poll(req->file)) 5813 return IO_APOLL_ABORTED; 5814 if (!def->pollin && !def->pollout) 5815 return IO_APOLL_ABORTED; 5816 5817 if (def->pollin) { 5818 mask |= POLLIN | POLLRDNORM; 5819 5820 /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ 5821 if ((req->opcode == IORING_OP_RECVMSG) && 5822 (req->sr_msg.msg_flags & MSG_ERRQUEUE)) 5823 mask &= ~POLLIN; 5824 } else { 5825 mask |= POLLOUT | POLLWRNORM; 5826 } 5827 5828 if (req->flags & REQ_F_POLLED) { 5829 apoll = req->apoll; 5830 kfree(apoll->double_poll); 5831 if (unlikely(!--apoll->poll.retries)) { 5832 apoll->double_poll = NULL; 5833 return IO_APOLL_ABORTED; 5834 } 5835 } else { 5836 apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); 5837 if (unlikely(!apoll)) 5838 return IO_APOLL_ABORTED; 5839 apoll->poll.retries = APOLL_MAX_RETRY; 5840 } 5841 apoll->double_poll = NULL; 5842 req->apoll = apoll; 5843 req->flags |= REQ_F_POLLED; 5844 ipt.pt._qproc = io_async_queue_proc; 5845 5846 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); 5847 if (ret || ipt.error) 5848 return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; 5849 5850 trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, 5851 mask, apoll->poll.events); 5852 return IO_APOLL_OK; 5853} 5854 5855/* 5856 * Returns true if we found and killed one or more poll requests 5857 */ 5858static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, 5859 bool cancel_all) 5860{ 5861 struct hlist_node *tmp; 5862 struct io_kiocb *req; 5863 bool found = false; 5864 int i; 5865 5866 spin_lock(&ctx->completion_lock); 5867 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 5868 struct hlist_head *list; 5869 5870 list = &ctx->cancel_hash[i]; 5871 hlist_for_each_entry_safe(req, tmp, list, hash_node) { 5872 if (io_match_task_safe(req, tsk, cancel_all)) { 5873 hlist_del_init(&req->hash_node); 5874 io_poll_cancel_req(req); 5875 found = true; 5876 } 5877 } 5878 } 5879 spin_unlock(&ctx->completion_lock); 5880 return found; 5881} 5882 5883static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, __u64 sqe_addr, 5884 bool poll_only) 5885 __must_hold(&ctx->completion_lock) 5886{ 5887 struct hlist_head *list; 5888 struct io_kiocb *req; 5889 5890 list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; 5891 hlist_for_each_entry(req, list, hash_node) { 5892 if (sqe_addr != req->user_data) 5893 continue; 5894 if (poll_only && req->opcode != IORING_OP_POLL_ADD) 5895 continue; 5896 return req; 5897 } 5898 return NULL; 5899} 5900 5901static bool io_poll_disarm(struct io_kiocb *req) 5902 __must_hold(&ctx->completion_lock) 5903{ 5904 if (!io_poll_get_ownership(req)) 5905 return false; 5906 io_poll_remove_entries(req); 5907 hash_del(&req->hash_node); 5908 return true; 5909} 5910 5911static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr, 5912 bool poll_only) 5913 __must_hold(&ctx->completion_lock) 5914{ 5915 struct io_kiocb *req = io_poll_find(ctx, sqe_addr, poll_only); 5916 5917 if (!req) 5918 return -ENOENT; 5919 io_poll_cancel_req(req); 5920 return 0; 5921} 5922 5923static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, 5924 unsigned int flags) 5925{ 5926 u32 events; 5927 5928 events = READ_ONCE(sqe->poll32_events); 5929#ifdef __BIG_ENDIAN 5930 events = swahw32(events); 5931#endif 5932 if (!(flags & IORING_POLL_ADD_MULTI)) 5933 events |= EPOLLONESHOT; 5934 return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); 5935} 5936 5937static int io_poll_update_prep(struct io_kiocb *req, 5938 const struct io_uring_sqe *sqe) 5939{ 5940 struct io_poll_update *upd = &req->poll_update; 5941 u32 flags; 5942 5943 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5944 return -EINVAL; 5945 if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) 5946 return -EINVAL; 5947 flags = READ_ONCE(sqe->len); 5948 if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | 5949 IORING_POLL_ADD_MULTI)) 5950 return -EINVAL; 5951 /* meaningless without update */ 5952 if (flags == IORING_POLL_ADD_MULTI) 5953 return -EINVAL; 5954 5955 upd->old_user_data = READ_ONCE(sqe->addr); 5956 upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; 5957 upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; 5958 5959 upd->new_user_data = READ_ONCE(sqe->off); 5960 if (!upd->update_user_data && upd->new_user_data) 5961 return -EINVAL; 5962 if (upd->update_events) 5963 upd->events = io_poll_parse_events(sqe, flags); 5964 else if (sqe->poll32_events) 5965 return -EINVAL; 5966 5967 return 0; 5968} 5969 5970static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 
5971{ 5972 struct io_poll_iocb *poll = &req->poll; 5973 u32 flags; 5974 5975 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 5976 return -EINVAL; 5977 if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr) 5978 return -EINVAL; 5979 flags = READ_ONCE(sqe->len); 5980 if (flags & ~IORING_POLL_ADD_MULTI) 5981 return -EINVAL; 5982 5983 io_req_set_refcount(req); 5984 poll->events = io_poll_parse_events(sqe, flags); 5985 return 0; 5986} 5987 5988static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) 5989{ 5990 struct io_poll_iocb *poll = &req->poll; 5991 struct io_poll_table ipt; 5992 int ret; 5993 5994 ipt.pt._qproc = io_poll_queue_proc; 5995 5996 ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); 5997 if (!ret && ipt.error) 5998 req_set_fail(req); 5999 ret = ret ?: ipt.error; 6000 if (ret) 6001 __io_req_complete(req, issue_flags, ret, 0); 6002 return 0; 6003} 6004 6005static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) 6006{ 6007 struct io_ring_ctx *ctx = req->ctx; 6008 struct io_kiocb *preq; 6009 int ret2, ret = 0; 6010 6011 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6012 6013 spin_lock(&ctx->completion_lock); 6014 preq = io_poll_find(ctx, req->poll_update.old_user_data, true); 6015 if (!preq || !io_poll_disarm(preq)) { 6016 spin_unlock(&ctx->completion_lock); 6017 ret = preq ? -EALREADY : -ENOENT; 6018 goto out; 6019 } 6020 spin_unlock(&ctx->completion_lock); 6021 6022 if (req->poll_update.update_events || req->poll_update.update_user_data) { 6023 /* only mask one event flags, keep behavior flags */ 6024 if (req->poll_update.update_events) { 6025 preq->poll.events &= ~0xffff; 6026 preq->poll.events |= req->poll_update.events & 0xffff; 6027 preq->poll.events |= IO_POLL_UNMASK; 6028 } 6029 if (req->poll_update.update_user_data) 6030 preq->user_data = req->poll_update.new_user_data; 6031 6032 ret2 = io_poll_add(preq, issue_flags); 6033 /* successfully updated, don't complete poll request */ 6034 if (!ret2) 6035 goto out; 6036 } 6037 req_set_fail(preq); 6038 io_req_complete(preq, -ECANCELED); 6039out: 6040 if (ret < 0) 6041 req_set_fail(req); 6042 /* complete update request, we're done with it */ 6043 io_req_complete(req, ret); 6044 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6045 return 0; 6046} 6047 6048static void io_req_task_timeout(struct io_kiocb *req, bool *locked) 6049{ 6050 req_set_fail(req); 6051 io_req_complete_post(req, -ETIME, 0); 6052} 6053 6054static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) 6055{ 6056 struct io_timeout_data *data = container_of(timer, 6057 struct io_timeout_data, timer); 6058 struct io_kiocb *req = data->req; 6059 struct io_ring_ctx *ctx = req->ctx; 6060 unsigned long flags; 6061 6062 spin_lock_irqsave(&ctx->timeout_lock, flags); 6063 list_del_init(&req->timeout.list); 6064 atomic_set(&req->ctx->cq_timeouts, 6065 atomic_read(&req->ctx->cq_timeouts) + 1); 6066 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 6067 6068 req->io_task_work.func = io_req_task_timeout; 6069 io_req_task_work_add(req); 6070 return HRTIMER_NORESTART; 6071} 6072 6073static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, 6074 __u64 user_data) 6075 __must_hold(&ctx->timeout_lock) 6076{ 6077 struct io_timeout_data *io; 6078 struct io_kiocb *req; 6079 bool found = false; 6080 6081 list_for_each_entry(req, &ctx->timeout_list, timeout.list) { 6082 found = user_data == req->user_data; 6083 if (found) 6084 break; 6085 } 6086 if (!found) 6087 return 
ERR_PTR(-ENOENT); 6088 6089 io = req->async_data; 6090 if (hrtimer_try_to_cancel(&io->timer) == -1) 6091 return ERR_PTR(-EALREADY); 6092 list_del_init(&req->timeout.list); 6093 return req; 6094} 6095 6096static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) 6097 __must_hold(&ctx->completion_lock) 6098 __must_hold(&ctx->timeout_lock) 6099{ 6100 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6101 6102 if (IS_ERR(req)) 6103 return PTR_ERR(req); 6104 6105 req_set_fail(req); 6106 io_fill_cqe_req(req, -ECANCELED, 0); 6107 io_put_req_deferred(req); 6108 return 0; 6109} 6110 6111static clockid_t io_timeout_get_clock(struct io_timeout_data *data) 6112{ 6113 switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { 6114 case IORING_TIMEOUT_BOOTTIME: 6115 return CLOCK_BOOTTIME; 6116 case IORING_TIMEOUT_REALTIME: 6117 return CLOCK_REALTIME; 6118 default: 6119 /* can't happen, vetted at prep time */ 6120 WARN_ON_ONCE(1); 6121 fallthrough; 6122 case 0: 6123 return CLOCK_MONOTONIC; 6124 } 6125} 6126 6127static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6128 struct timespec64 *ts, enum hrtimer_mode mode) 6129 __must_hold(&ctx->timeout_lock) 6130{ 6131 struct io_timeout_data *io; 6132 struct io_kiocb *req; 6133 bool found = false; 6134 6135 list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { 6136 found = user_data == req->user_data; 6137 if (found) 6138 break; 6139 } 6140 if (!found) 6141 return -ENOENT; 6142 6143 io = req->async_data; 6144 if (hrtimer_try_to_cancel(&io->timer) == -1) 6145 return -EALREADY; 6146 hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); 6147 io->timer.function = io_link_timeout_fn; 6148 hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); 6149 return 0; 6150} 6151 6152static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 6153 struct timespec64 *ts, enum hrtimer_mode mode) 6154 __must_hold(&ctx->timeout_lock) 6155{ 6156 struct io_kiocb *req = io_timeout_extract(ctx, user_data); 6157 struct io_timeout_data *data; 6158 6159 if (IS_ERR(req)) 6160 return PTR_ERR(req); 6161 6162 req->timeout.off = 0; /* noseq */ 6163 data = req->async_data; 6164 list_add_tail(&req->timeout.list, &ctx->timeout_list); 6165 hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); 6166 data->timer.function = io_timeout_fn; 6167 hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); 6168 return 0; 6169} 6170 6171static int io_timeout_remove_prep(struct io_kiocb *req, 6172 const struct io_uring_sqe *sqe) 6173{ 6174 struct io_timeout_rem *tr = &req->timeout_rem; 6175 6176 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6177 return -EINVAL; 6178 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6179 return -EINVAL; 6180 if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) 6181 return -EINVAL; 6182 6183 tr->ltimeout = false; 6184 tr->addr = READ_ONCE(sqe->addr); 6185 tr->flags = READ_ONCE(sqe->timeout_flags); 6186 if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { 6187 if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6188 return -EINVAL; 6189 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) 6190 tr->ltimeout = true; 6191 if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) 6192 return -EINVAL; 6193 if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) 6194 return -EFAULT; 6195 } else if (tr->flags) { 6196 /* timeout removal doesn't support flags */ 6197 return -EINVAL; 6198 } 6199 6200 return 0; 6201} 6202 6203static inline enum hrtimer_mode 
io_translate_timeout_mode(unsigned int flags) 6204{ 6205 return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS 6206 : HRTIMER_MODE_REL; 6207} 6208 6209/* 6210 * Remove or update an existing timeout command 6211 */ 6212static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) 6213{ 6214 struct io_timeout_rem *tr = &req->timeout_rem; 6215 struct io_ring_ctx *ctx = req->ctx; 6216 int ret; 6217 6218 if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { 6219 spin_lock(&ctx->completion_lock); 6220 spin_lock_irq(&ctx->timeout_lock); 6221 ret = io_timeout_cancel(ctx, tr->addr); 6222 spin_unlock_irq(&ctx->timeout_lock); 6223 spin_unlock(&ctx->completion_lock); 6224 } else { 6225 enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); 6226 6227 spin_lock_irq(&ctx->timeout_lock); 6228 if (tr->ltimeout) 6229 ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 6230 else 6231 ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 6232 spin_unlock_irq(&ctx->timeout_lock); 6233 } 6234 6235 if (ret < 0) 6236 req_set_fail(req); 6237 io_req_complete_post(req, ret, 0); 6238 return 0; 6239} 6240 6241static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, 6242 bool is_timeout_link) 6243{ 6244 struct io_timeout_data *data; 6245 unsigned flags; 6246 u32 off = READ_ONCE(sqe->off); 6247 6248 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6249 return -EINVAL; 6250 if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || 6251 sqe->splice_fd_in) 6252 return -EINVAL; 6253 if (off && is_timeout_link) 6254 return -EINVAL; 6255 flags = READ_ONCE(sqe->timeout_flags); 6256 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) 6257 return -EINVAL; 6258 /* more than one clock specified is invalid, obviously */ 6259 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) 6260 return -EINVAL; 6261 6262 INIT_LIST_HEAD(&req->timeout.list); 6263 req->timeout.off = off; 6264 if (unlikely(off && !req->ctx->off_timeout_used)) 6265 req->ctx->off_timeout_used = true; 6266 6267 if (!req->async_data && io_alloc_async_data(req)) 6268 return -ENOMEM; 6269 6270 data = req->async_data; 6271 data->req = req; 6272 data->flags = flags; 6273 6274 if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) 6275 return -EFAULT; 6276 6277 INIT_LIST_HEAD(&req->timeout.list); 6278 data->mode = io_translate_timeout_mode(flags); 6279 hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); 6280 6281 if (is_timeout_link) { 6282 struct io_submit_link *link = &req->ctx->submit_state.link; 6283 6284 if (!link->head) 6285 return -EINVAL; 6286 if (link->last->opcode == IORING_OP_LINK_TIMEOUT) 6287 return -EINVAL; 6288 req->timeout.head = link->last; 6289 link->last->flags |= REQ_F_ARM_LTIMEOUT; 6290 } 6291 return 0; 6292} 6293 6294static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) 6295{ 6296 struct io_ring_ctx *ctx = req->ctx; 6297 struct io_timeout_data *data = req->async_data; 6298 struct list_head *entry; 6299 u32 tail, off = req->timeout.off; 6300 6301 spin_lock_irq(&ctx->timeout_lock); 6302 6303 /* 6304 * sqe->off holds how many events that need to occur for this 6305 * timeout event to be satisfied. If it isn't set, then this is 6306 * a pure timeout request, sequence isn't used. 
6307 */ 6308 if (io_is_timeout_noseq(req)) { 6309 entry = ctx->timeout_list.prev; 6310 goto add; 6311 } 6312 6313 tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 6314 req->timeout.target_seq = tail + off; 6315 6316 /* Update the last seq here in case io_flush_timeouts() hasn't. 6317 * This is safe because ->completion_lock is held, and submissions 6318 * and completions are never mixed in the same ->completion_lock section. 6319 */ 6320 ctx->cq_last_tm_flush = tail; 6321 6322 /* 6323 * Insertion sort, ensuring the first entry in the list is always 6324 * the one we need first. 6325 */ 6326 list_for_each_prev(entry, &ctx->timeout_list) { 6327 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, 6328 timeout.list); 6329 6330 if (io_is_timeout_noseq(nxt)) 6331 continue; 6332 /* nxt.seq is behind @tail, otherwise would've been completed */ 6333 if (off >= nxt->timeout.target_seq - tail) 6334 break; 6335 } 6336add: 6337 list_add(&req->timeout.list, entry); 6338 data->timer.function = io_timeout_fn; 6339 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 6340 spin_unlock_irq(&ctx->timeout_lock); 6341 return 0; 6342} 6343 6344struct io_cancel_data { 6345 struct io_ring_ctx *ctx; 6346 u64 user_data; 6347}; 6348 6349static bool io_cancel_cb(struct io_wq_work *work, void *data) 6350{ 6351 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6352 struct io_cancel_data *cd = data; 6353 6354 return req->ctx == cd->ctx && req->user_data == cd->user_data; 6355} 6356 6357static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, 6358 struct io_ring_ctx *ctx) 6359{ 6360 struct io_cancel_data data = { .ctx = ctx, .user_data = user_data, }; 6361 enum io_wq_cancel cancel_ret; 6362 int ret = 0; 6363 6364 if (!tctx || !tctx->io_wq) 6365 return -ENOENT; 6366 6367 cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, &data, false); 6368 switch (cancel_ret) { 6369 case IO_WQ_CANCEL_OK: 6370 ret = 0; 6371 break; 6372 case IO_WQ_CANCEL_RUNNING: 6373 ret = -EALREADY; 6374 break; 6375 case IO_WQ_CANCEL_NOTFOUND: 6376 ret = -ENOENT; 6377 break; 6378 } 6379 6380 return ret; 6381} 6382 6383static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) 6384{ 6385 struct io_ring_ctx *ctx = req->ctx; 6386 int ret; 6387 6388 WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); 6389 6390 ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); 6391 if (ret != -ENOENT) 6392 return ret; 6393 6394 spin_lock(&ctx->completion_lock); 6395 spin_lock_irq(&ctx->timeout_lock); 6396 ret = io_timeout_cancel(ctx, sqe_addr); 6397 spin_unlock_irq(&ctx->timeout_lock); 6398 if (ret != -ENOENT) 6399 goto out; 6400 ret = io_poll_cancel(ctx, sqe_addr, false); 6401out: 6402 spin_unlock(&ctx->completion_lock); 6403 return ret; 6404} 6405 6406static int io_async_cancel_prep(struct io_kiocb *req, 6407 const struct io_uring_sqe *sqe) 6408{ 6409 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 6410 return -EINVAL; 6411 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6412 return -EINVAL; 6413 if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || 6414 sqe->splice_fd_in) 6415 return -EINVAL; 6416 6417 req->cancel.addr = READ_ONCE(sqe->addr); 6418 return 0; 6419} 6420 6421static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) 6422{ 6423 struct io_ring_ctx *ctx = req->ctx; 6424 u64 sqe_addr = req->cancel.addr; 6425 struct io_tctx_node *node; 6426 int ret; 6427 6428 ret = io_try_cancel_userdata(req, sqe_addr); 
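	/*
	 * io_try_cancel_userdata() only returns -ENOENT if the request was
	 * found neither as a queued io-wq work item for this task, nor as a
	 * pending timeout or poll request on this ring; any other result
	 * (0 or -EALREADY) is final, so skip the slow path below.
	 */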
6429 if (ret != -ENOENT) 6430 goto done; 6431 6432 /* slow path, try all io-wq's */ 6433 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6434 ret = -ENOENT; 6435 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 6436 struct io_uring_task *tctx = node->task->io_uring; 6437 6438 ret = io_async_cancel_one(tctx, req->cancel.addr, ctx); 6439 if (ret != -ENOENT) 6440 break; 6441 } 6442 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6443done: 6444 if (ret < 0) 6445 req_set_fail(req); 6446 io_req_complete_post(req, ret, 0); 6447 return 0; 6448} 6449 6450static int io_rsrc_update_prep(struct io_kiocb *req, 6451 const struct io_uring_sqe *sqe) 6452{ 6453 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 6454 return -EINVAL; 6455 if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) 6456 return -EINVAL; 6457 6458 req->rsrc_update.offset = READ_ONCE(sqe->off); 6459 req->rsrc_update.nr_args = READ_ONCE(sqe->len); 6460 if (!req->rsrc_update.nr_args) 6461 return -EINVAL; 6462 req->rsrc_update.arg = READ_ONCE(sqe->addr); 6463 return 0; 6464} 6465 6466static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) 6467{ 6468 struct io_ring_ctx *ctx = req->ctx; 6469 struct io_uring_rsrc_update2 up; 6470 int ret; 6471 6472 up.offset = req->rsrc_update.offset; 6473 up.data = req->rsrc_update.arg; 6474 up.nr = 0; 6475 up.tags = 0; 6476 up.resv = 0; 6477 up.resv2 = 0; 6478 6479 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6480 ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, 6481 &up, req->rsrc_update.nr_args); 6482 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6483 6484 if (ret < 0) 6485 req_set_fail(req); 6486 __io_req_complete(req, issue_flags, ret, 0); 6487 return 0; 6488} 6489 6490static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 6491{ 6492 switch (req->opcode) { 6493 case IORING_OP_NOP: 6494 return 0; 6495 case IORING_OP_READV: 6496 case IORING_OP_READ_FIXED: 6497 case IORING_OP_READ: 6498 return io_read_prep(req, sqe); 6499 case IORING_OP_WRITEV: 6500 case IORING_OP_WRITE_FIXED: 6501 case IORING_OP_WRITE: 6502 return io_write_prep(req, sqe); 6503 case IORING_OP_POLL_ADD: 6504 return io_poll_add_prep(req, sqe); 6505 case IORING_OP_POLL_REMOVE: 6506 return io_poll_update_prep(req, sqe); 6507 case IORING_OP_FSYNC: 6508 return io_fsync_prep(req, sqe); 6509 case IORING_OP_SYNC_FILE_RANGE: 6510 return io_sfr_prep(req, sqe); 6511 case IORING_OP_SENDMSG: 6512 case IORING_OP_SEND: 6513 return io_sendmsg_prep(req, sqe); 6514 case IORING_OP_RECVMSG: 6515 case IORING_OP_RECV: 6516 return io_recvmsg_prep(req, sqe); 6517 case IORING_OP_CONNECT: 6518 return io_connect_prep(req, sqe); 6519 case IORING_OP_TIMEOUT: 6520 return io_timeout_prep(req, sqe, false); 6521 case IORING_OP_TIMEOUT_REMOVE: 6522 return io_timeout_remove_prep(req, sqe); 6523 case IORING_OP_ASYNC_CANCEL: 6524 return io_async_cancel_prep(req, sqe); 6525 case IORING_OP_LINK_TIMEOUT: 6526 return io_timeout_prep(req, sqe, true); 6527 case IORING_OP_ACCEPT: 6528 return io_accept_prep(req, sqe); 6529 case IORING_OP_FALLOCATE: 6530 return io_fallocate_prep(req, sqe); 6531 case IORING_OP_OPENAT: 6532 return io_openat_prep(req, sqe); 6533 case IORING_OP_CLOSE: 6534 return io_close_prep(req, sqe); 6535 case IORING_OP_FILES_UPDATE: 6536 return io_rsrc_update_prep(req, sqe); 6537 case IORING_OP_STATX: 6538 return io_statx_prep(req, sqe); 6539 case IORING_OP_FADVISE: 6540 return io_fadvise_prep(req, sqe); 6541 case 
IORING_OP_MADVISE: 6542 return io_madvise_prep(req, sqe); 6543 case IORING_OP_OPENAT2: 6544 return io_openat2_prep(req, sqe); 6545 case IORING_OP_EPOLL_CTL: 6546 return io_epoll_ctl_prep(req, sqe); 6547 case IORING_OP_SPLICE: 6548 return io_splice_prep(req, sqe); 6549 case IORING_OP_PROVIDE_BUFFERS: 6550 return io_provide_buffers_prep(req, sqe); 6551 case IORING_OP_REMOVE_BUFFERS: 6552 return io_remove_buffers_prep(req, sqe); 6553 case IORING_OP_TEE: 6554 return io_tee_prep(req, sqe); 6555 case IORING_OP_SHUTDOWN: 6556 return io_shutdown_prep(req, sqe); 6557 case IORING_OP_RENAMEAT: 6558 return io_renameat_prep(req, sqe); 6559 case IORING_OP_UNLINKAT: 6560 return io_unlinkat_prep(req, sqe); 6561 } 6562 6563 printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", 6564 req->opcode); 6565 return -EINVAL; 6566} 6567 6568static int io_req_prep_async(struct io_kiocb *req) 6569{ 6570 if (!io_op_defs[req->opcode].needs_async_setup) 6571 return 0; 6572 if (WARN_ON_ONCE(req->async_data)) 6573 return -EFAULT; 6574 if (io_alloc_async_data(req)) 6575 return -EAGAIN; 6576 6577 switch (req->opcode) { 6578 case IORING_OP_READV: 6579 return io_rw_prep_async(req, READ); 6580 case IORING_OP_WRITEV: 6581 return io_rw_prep_async(req, WRITE); 6582 case IORING_OP_SENDMSG: 6583 return io_sendmsg_prep_async(req); 6584 case IORING_OP_RECVMSG: 6585 return io_recvmsg_prep_async(req); 6586 case IORING_OP_CONNECT: 6587 return io_connect_prep_async(req); 6588 } 6589 printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n", 6590 req->opcode); 6591 return -EFAULT; 6592} 6593 6594static u32 io_get_sequence(struct io_kiocb *req) 6595{ 6596 u32 seq = req->ctx->cached_sq_head; 6597 6598 /* need original cached_sq_head, but it was increased for each req */ 6599 io_for_each_link(req, req) 6600 seq--; 6601 return seq; 6602} 6603 6604static bool io_drain_req(struct io_kiocb *req) 6605{ 6606 struct io_kiocb *pos; 6607 struct io_ring_ctx *ctx = req->ctx; 6608 struct io_defer_entry *de; 6609 int ret; 6610 u32 seq; 6611 6612 if (req->flags & REQ_F_FAIL) { 6613 io_req_complete_fail_submit(req); 6614 return true; 6615 } 6616 6617 /* 6618 * If we need to drain a request in the middle of a link, drain the 6619 * head request and the next request/link after the current link. 6620 * Considering sequential execution of links, IOSQE_IO_DRAIN will be 6621 * maintained for every request of our link. 6622 */ 6623 if (ctx->drain_next) { 6624 req->flags |= REQ_F_IO_DRAIN; 6625 ctx->drain_next = false; 6626 } 6627 /* not interested in head, start from the first linked */ 6628 io_for_each_link(pos, req->link) { 6629 if (pos->flags & REQ_F_IO_DRAIN) { 6630 ctx->drain_next = true; 6631 req->flags |= REQ_F_IO_DRAIN; 6632 break; 6633 } 6634 } 6635 6636 /* Still need defer if there is pending req in defer list. 
*/ 6637 spin_lock(&ctx->completion_lock); 6638 if (likely(list_empty_careful(&ctx->defer_list) && 6639 !(req->flags & REQ_F_IO_DRAIN))) { 6640 spin_unlock(&ctx->completion_lock); 6641 ctx->drain_active = false; 6642 return false; 6643 } 6644 spin_unlock(&ctx->completion_lock); 6645 6646 seq = io_get_sequence(req); 6647 /* Still a chance to pass the sequence check */ 6648 if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) 6649 return false; 6650 6651 ret = io_req_prep_async(req); 6652 if (ret) 6653 goto fail; 6654 io_prep_async_link(req); 6655 de = kmalloc(sizeof(*de), GFP_KERNEL); 6656 if (!de) { 6657 ret = -ENOMEM; 6658fail: 6659 io_req_complete_failed(req, ret); 6660 return true; 6661 } 6662 6663 spin_lock(&ctx->completion_lock); 6664 if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { 6665 spin_unlock(&ctx->completion_lock); 6666 kfree(de); 6667 io_queue_async_work(req, NULL); 6668 return true; 6669 } 6670 6671 trace_io_uring_defer(ctx, req, req->user_data); 6672 de->req = req; 6673 de->seq = seq; 6674 list_add_tail(&de->list, &ctx->defer_list); 6675 spin_unlock(&ctx->completion_lock); 6676 return true; 6677} 6678 6679static void io_clean_op(struct io_kiocb *req) 6680{ 6681 if (req->flags & REQ_F_BUFFER_SELECTED) { 6682 switch (req->opcode) { 6683 case IORING_OP_READV: 6684 case IORING_OP_READ_FIXED: 6685 case IORING_OP_READ: 6686 kfree((void *)(unsigned long)req->rw.addr); 6687 break; 6688 case IORING_OP_RECVMSG: 6689 case IORING_OP_RECV: 6690 kfree(req->sr_msg.kbuf); 6691 break; 6692 } 6693 } 6694 6695 if (req->flags & REQ_F_NEED_CLEANUP) { 6696 switch (req->opcode) { 6697 case IORING_OP_READV: 6698 case IORING_OP_READ_FIXED: 6699 case IORING_OP_READ: 6700 case IORING_OP_WRITEV: 6701 case IORING_OP_WRITE_FIXED: 6702 case IORING_OP_WRITE: { 6703 struct io_async_rw *io = req->async_data; 6704 6705 kfree(io->free_iovec); 6706 break; 6707 } 6708 case IORING_OP_RECVMSG: 6709 case IORING_OP_SENDMSG: { 6710 struct io_async_msghdr *io = req->async_data; 6711 6712 kfree(io->free_iov); 6713 break; 6714 } 6715 case IORING_OP_OPENAT: 6716 case IORING_OP_OPENAT2: 6717 if (req->open.filename) 6718 putname(req->open.filename); 6719 break; 6720 case IORING_OP_RENAMEAT: 6721 putname(req->rename.oldpath); 6722 putname(req->rename.newpath); 6723 break; 6724 case IORING_OP_UNLINKAT: 6725 putname(req->unlink.filename); 6726 break; 6727 } 6728 } 6729 if ((req->flags & REQ_F_POLLED) && req->apoll) { 6730 kfree(req->apoll->double_poll); 6731 kfree(req->apoll); 6732 req->apoll = NULL; 6733 } 6734 if (req->flags & REQ_F_INFLIGHT) { 6735 struct io_uring_task *tctx = req->task->io_uring; 6736 6737 atomic_dec(&tctx->inflight_tracked); 6738 } 6739 if (req->flags & REQ_F_CREDS) 6740 put_cred(req->creds); 6741 6742 req->flags &= ~IO_REQ_CLEAN_FLAGS; 6743} 6744 6745static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 6746{ 6747 struct io_ring_ctx *ctx = req->ctx; 6748 const struct cred *creds = NULL; 6749 int ret; 6750 6751 if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) 6752 creds = override_creds(req->creds); 6753 6754 switch (req->opcode) { 6755 case IORING_OP_NOP: 6756 ret = io_nop(req, issue_flags); 6757 break; 6758 case IORING_OP_READV: 6759 case IORING_OP_READ_FIXED: 6760 case IORING_OP_READ: 6761 ret = io_read(req, issue_flags); 6762 break; 6763 case IORING_OP_WRITEV: 6764 case IORING_OP_WRITE_FIXED: 6765 case IORING_OP_WRITE: 6766 ret = io_write(req, issue_flags); 6767 break; 6768 case IORING_OP_FSYNC: 6769 ret = io_fsync(req, issue_flags); 
6770 break; 6771 case IORING_OP_POLL_ADD: 6772 ret = io_poll_add(req, issue_flags); 6773 break; 6774 case IORING_OP_POLL_REMOVE: 6775 ret = io_poll_update(req, issue_flags); 6776 break; 6777 case IORING_OP_SYNC_FILE_RANGE: 6778 ret = io_sync_file_range(req, issue_flags); 6779 break; 6780 case IORING_OP_SENDMSG: 6781 ret = io_sendmsg(req, issue_flags); 6782 break; 6783 case IORING_OP_SEND: 6784 ret = io_send(req, issue_flags); 6785 break; 6786 case IORING_OP_RECVMSG: 6787 ret = io_recvmsg(req, issue_flags); 6788 break; 6789 case IORING_OP_RECV: 6790 ret = io_recv(req, issue_flags); 6791 break; 6792 case IORING_OP_TIMEOUT: 6793 ret = io_timeout(req, issue_flags); 6794 break; 6795 case IORING_OP_TIMEOUT_REMOVE: 6796 ret = io_timeout_remove(req, issue_flags); 6797 break; 6798 case IORING_OP_ACCEPT: 6799 ret = io_accept(req, issue_flags); 6800 break; 6801 case IORING_OP_CONNECT: 6802 ret = io_connect(req, issue_flags); 6803 break; 6804 case IORING_OP_ASYNC_CANCEL: 6805 ret = io_async_cancel(req, issue_flags); 6806 break; 6807 case IORING_OP_FALLOCATE: 6808 ret = io_fallocate(req, issue_flags); 6809 break; 6810 case IORING_OP_OPENAT: 6811 ret = io_openat(req, issue_flags); 6812 break; 6813 case IORING_OP_CLOSE: 6814 ret = io_close(req, issue_flags); 6815 break; 6816 case IORING_OP_FILES_UPDATE: 6817 ret = io_files_update(req, issue_flags); 6818 break; 6819 case IORING_OP_STATX: 6820 ret = io_statx(req, issue_flags); 6821 break; 6822 case IORING_OP_FADVISE: 6823 ret = io_fadvise(req, issue_flags); 6824 break; 6825 case IORING_OP_MADVISE: 6826 ret = io_madvise(req, issue_flags); 6827 break; 6828 case IORING_OP_OPENAT2: 6829 ret = io_openat2(req, issue_flags); 6830 break; 6831 case IORING_OP_EPOLL_CTL: 6832 ret = io_epoll_ctl(req, issue_flags); 6833 break; 6834 case IORING_OP_SPLICE: 6835 ret = io_splice(req, issue_flags); 6836 break; 6837 case IORING_OP_PROVIDE_BUFFERS: 6838 ret = io_provide_buffers(req, issue_flags); 6839 break; 6840 case IORING_OP_REMOVE_BUFFERS: 6841 ret = io_remove_buffers(req, issue_flags); 6842 break; 6843 case IORING_OP_TEE: 6844 ret = io_tee(req, issue_flags); 6845 break; 6846 case IORING_OP_SHUTDOWN: 6847 ret = io_shutdown(req, issue_flags); 6848 break; 6849 case IORING_OP_RENAMEAT: 6850 ret = io_renameat(req, issue_flags); 6851 break; 6852 case IORING_OP_UNLINKAT: 6853 ret = io_unlinkat(req, issue_flags); 6854 break; 6855 default: 6856 ret = -EINVAL; 6857 break; 6858 } 6859 6860 if (creds) 6861 revert_creds(creds); 6862 if (ret) 6863 return ret; 6864 /* If the op doesn't have a file, we're not polling for it */ 6865 if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) 6866 io_iopoll_req_issued(req); 6867 6868 return 0; 6869} 6870 6871static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) 6872{ 6873 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6874 6875 req = io_put_req_find_next(req); 6876 return req ? 
&req->work : NULL; 6877} 6878 6879static void io_wq_submit_work(struct io_wq_work *work) 6880{ 6881 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 6882 struct io_kiocb *timeout; 6883 int ret = 0; 6884 6885 /* one will be dropped by ->io_free_work() after returning to io-wq */ 6886 if (!(req->flags & REQ_F_REFCOUNT)) 6887 __io_req_set_refcount(req, 2); 6888 else 6889 req_ref_get(req); 6890 6891 timeout = io_prep_linked_timeout(req); 6892 if (timeout) 6893 io_queue_linked_timeout(timeout); 6894 /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ 6895 if (work->flags & IO_WQ_WORK_CANCEL) 6896 ret = -ECANCELED; 6897 6898 if (!ret) { 6899 do { 6900 ret = io_issue_sqe(req, 0); 6901 /* 6902 * We can get EAGAIN for polled IO even though we're 6903 * forcing a sync submission from here, since we can't 6904 * wait for request slots on the block side. 6905 */ 6906 if (ret != -EAGAIN || !(req->ctx->flags & IORING_SETUP_IOPOLL)) 6907 break; 6908 if (io_wq_worker_stopped()) 6909 break; 6910 /* 6911 * If REQ_F_NOWAIT is set, then don't wait or retry with 6912 * poll. -EAGAIN is final for that case. 6913 */ 6914 if (req->flags & REQ_F_NOWAIT) 6915 break; 6916 6917 cond_resched(); 6918 } while (1); 6919 } 6920 6921 /* avoid locking problems by failing it from a clean context */ 6922 if (ret) 6923 io_req_task_queue_fail(req, ret); 6924} 6925 6926static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, 6927 unsigned i) 6928{ 6929 return &table->files[i]; 6930} 6931 6932static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, 6933 int index) 6934{ 6935 struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); 6936 6937 return (struct file *) (slot->file_ptr & FFS_MASK); 6938} 6939 6940static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) 6941{ 6942 unsigned long file_ptr = (unsigned long) file; 6943 6944 if (__io_file_supports_nowait(file, READ)) 6945 file_ptr |= FFS_ASYNC_READ; 6946 if (__io_file_supports_nowait(file, WRITE)) 6947 file_ptr |= FFS_ASYNC_WRITE; 6948 if (S_ISREG(file_inode(file)->i_mode)) 6949 file_ptr |= FFS_ISREG; 6950 file_slot->file_ptr = file_ptr; 6951} 6952 6953static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, 6954 struct io_kiocb *req, int fd, 6955 unsigned int issue_flags) 6956{ 6957 struct file *file = NULL; 6958 unsigned long file_ptr; 6959 6960 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6961 6962 if (unlikely((unsigned int)fd >= ctx->nr_user_files)) 6963 goto out; 6964 fd = array_index_nospec(fd, ctx->nr_user_files); 6965 file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; 6966 file = (struct file *) (file_ptr & FFS_MASK); 6967 file_ptr &= ~FFS_MASK; 6968 /* mask in overlapping REQ_F and FFS bits */ 6969 req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); 6970 io_req_set_rsrc_node(req); 6971out: 6972 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 6973 return file; 6974} 6975 6976static struct file *io_file_get_normal(struct io_ring_ctx *ctx, 6977 struct io_kiocb *req, int fd) 6978{ 6979 struct file *file = fget(fd); 6980 6981 trace_io_uring_file_get(ctx, fd); 6982 6983 /* we don't allow fixed io_uring files */ 6984 if (file && unlikely(file->f_op == &io_uring_fops)) 6985 io_req_track_inflight(req); 6986 return file; 6987} 6988 6989static inline struct file *io_file_get(struct io_ring_ctx *ctx, 6990 struct io_kiocb *req, int fd, bool fixed, 6991 unsigned int issue_flags) 6992{ 6993 if (fixed) 6994 
return io_file_get_fixed(ctx, req, fd, issue_flags); 6995 else 6996 return io_file_get_normal(ctx, req, fd); 6997} 6998 6999static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) 7000{ 7001 struct io_kiocb *prev = req->timeout.prev; 7002 int ret = -ENOENT; 7003 7004 if (prev) { 7005 if (!(req->task->flags & PF_EXITING)) 7006 ret = io_try_cancel_userdata(req, prev->user_data); 7007 io_req_complete_post(req, ret ?: -ETIME, 0); 7008 io_put_req(prev); 7009 } else { 7010 io_req_complete_post(req, -ETIME, 0); 7011 } 7012} 7013 7014static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) 7015{ 7016 struct io_timeout_data *data = container_of(timer, 7017 struct io_timeout_data, timer); 7018 struct io_kiocb *prev, *req = data->req; 7019 struct io_ring_ctx *ctx = req->ctx; 7020 unsigned long flags; 7021 7022 spin_lock_irqsave(&ctx->timeout_lock, flags); 7023 prev = req->timeout.head; 7024 req->timeout.head = NULL; 7025 7026 /* 7027 * We don't expect the list to be empty, that will only happen if we 7028 * race with the completion of the linked work. 7029 */ 7030 if (prev) { 7031 io_remove_next_linked(prev); 7032 if (!req_ref_inc_not_zero(prev)) 7033 prev = NULL; 7034 } 7035 list_del(&req->timeout.list); 7036 req->timeout.prev = prev; 7037 spin_unlock_irqrestore(&ctx->timeout_lock, flags); 7038 7039 req->io_task_work.func = io_req_task_link_timeout; 7040 io_req_task_work_add(req); 7041 return HRTIMER_NORESTART; 7042} 7043 7044static void io_queue_linked_timeout(struct io_kiocb *req) 7045{ 7046 struct io_ring_ctx *ctx = req->ctx; 7047 7048 spin_lock_irq(&ctx->timeout_lock); 7049 /* 7050 * If the back reference is NULL, then our linked request finished 7051 * before we got a chance to setup the timer 7052 */ 7053 if (req->timeout.head) { 7054 struct io_timeout_data *data = req->async_data; 7055 7056 data->timer.function = io_link_timeout_fn; 7057 hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 7058 data->mode); 7059 list_add_tail(&req->timeout.list, &ctx->ltimeout_list); 7060 } 7061 spin_unlock_irq(&ctx->timeout_lock); 7062 /* drop submission reference */ 7063 io_put_req(req); 7064} 7065 7066static void __io_queue_sqe(struct io_kiocb *req) 7067 __must_hold(&req->ctx->uring_lock) 7068{ 7069 struct io_kiocb *linked_timeout; 7070 int ret; 7071 7072issue_sqe: 7073 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 7074 7075 /* 7076 * We async punt it if the file wasn't marked NOWAIT, or if the file 7077 * doesn't support non-blocking read/write attempts 7078 */ 7079 if (likely(!ret)) { 7080 if (req->flags & REQ_F_COMPLETE_INLINE) { 7081 struct io_ring_ctx *ctx = req->ctx; 7082 struct io_submit_state *state = &ctx->submit_state; 7083 7084 state->compl_reqs[state->compl_nr++] = req; 7085 if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) 7086 io_submit_flush_completions(ctx); 7087 return; 7088 } 7089 7090 linked_timeout = io_prep_linked_timeout(req); 7091 if (linked_timeout) 7092 io_queue_linked_timeout(linked_timeout); 7093 } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 7094 linked_timeout = io_prep_linked_timeout(req); 7095 7096 switch (io_arm_poll_handler(req)) { 7097 case IO_APOLL_READY: 7098 if (linked_timeout) 7099 io_queue_linked_timeout(linked_timeout); 7100 goto issue_sqe; 7101 case IO_APOLL_ABORTED: 7102 /* 7103 * Queued up for async execution, worker will release 7104 * submit reference when the iocb is actually submitted. 
7105 */ 7106 io_queue_async_work(req, NULL); 7107 break; 7108 } 7109 7110 if (linked_timeout) 7111 io_queue_linked_timeout(linked_timeout); 7112 } else { 7113 io_req_complete_failed(req, ret); 7114 } 7115} 7116 7117static inline void io_queue_sqe(struct io_kiocb *req) 7118 __must_hold(&req->ctx->uring_lock) 7119{ 7120 if (unlikely(req->ctx->drain_active) && io_drain_req(req)) 7121 return; 7122 7123 if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { 7124 __io_queue_sqe(req); 7125 } else if (req->flags & REQ_F_FAIL) { 7126 io_req_complete_fail_submit(req); 7127 } else { 7128 int ret = io_req_prep_async(req); 7129 7130 if (unlikely(ret)) 7131 io_req_complete_failed(req, ret); 7132 else 7133 io_queue_async_work(req, NULL); 7134 } 7135} 7136 7137/* 7138 * Check SQE restrictions (opcode and flags). 7139 * 7140 * Returns 'true' if SQE is allowed, 'false' otherwise. 7141 */ 7142static inline bool io_check_restriction(struct io_ring_ctx *ctx, 7143 struct io_kiocb *req, 7144 unsigned int sqe_flags) 7145{ 7146 if (likely(!ctx->restricted)) 7147 return true; 7148 7149 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 7150 return false; 7151 7152 if ((sqe_flags & ctx->restrictions.sqe_flags_required) != 7153 ctx->restrictions.sqe_flags_required) 7154 return false; 7155 7156 if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | 7157 ctx->restrictions.sqe_flags_required)) 7158 return false; 7159 7160 return true; 7161} 7162 7163static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 7164 const struct io_uring_sqe *sqe) 7165 __must_hold(&ctx->uring_lock) 7166{ 7167 struct io_submit_state *state; 7168 unsigned int sqe_flags; 7169 int personality, ret = 0; 7170 7171 /* req is partially pre-initialised, see io_preinit_req() */ 7172 req->opcode = READ_ONCE(sqe->opcode); 7173 /* same numerical values with corresponding REQ_F_*, safe to copy */ 7174 req->flags = sqe_flags = READ_ONCE(sqe->flags); 7175 req->user_data = READ_ONCE(sqe->user_data); 7176 req->file = NULL; 7177 req->fixed_rsrc_refs = NULL; 7178 req->task = current; 7179 7180 /* enforce forwards compatibility on users */ 7181 if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) 7182 return -EINVAL; 7183 if (unlikely(req->opcode >= IORING_OP_LAST)) 7184 return -EINVAL; 7185 if (!io_check_restriction(ctx, req, sqe_flags)) 7186 return -EACCES; 7187 7188 if ((sqe_flags & IOSQE_BUFFER_SELECT) && 7189 !io_op_defs[req->opcode].buffer_select) 7190 return -EOPNOTSUPP; 7191 if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) 7192 ctx->drain_active = true; 7193 7194 personality = READ_ONCE(sqe->personality); 7195 if (personality) { 7196 req->creds = xa_load(&ctx->personalities, personality); 7197 if (!req->creds) 7198 return -EINVAL; 7199 get_cred(req->creds); 7200 req->flags |= REQ_F_CREDS; 7201 } 7202 state = &ctx->submit_state; 7203 7204 /* 7205 * Plug now if we have more than 1 IO left after this, and the target 7206 * is potentially a read/write to block based storage. 
7207 */ 7208 if (!state->plug_started && state->ios_left > 1 && 7209 io_op_defs[req->opcode].plug) { 7210 blk_start_plug(&state->plug); 7211 state->plug_started = true; 7212 } 7213 7214 if (io_op_defs[req->opcode].needs_file) { 7215 req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), 7216 (sqe_flags & IOSQE_FIXED_FILE), 7217 IO_URING_F_NONBLOCK); 7218 if (unlikely(!req->file)) 7219 ret = -EBADF; 7220 } 7221 7222 state->ios_left--; 7223 return ret; 7224} 7225 7226static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 7227 const struct io_uring_sqe *sqe) 7228 __must_hold(&ctx->uring_lock) 7229{ 7230 struct io_submit_link *link = &ctx->submit_state.link; 7231 int ret; 7232 7233 ret = io_init_req(ctx, req, sqe); 7234 if (unlikely(ret)) { 7235fail_req: 7236 /* fail even hard links since we don't submit */ 7237 if (link->head) { 7238 /* 7239 * We can tell whether a link req failed or was cancelled by 7240 * whether REQ_F_FAIL is set, but the head is an exception: it 7241 * may have REQ_F_FAIL set because another req in the chain 7242 * failed. Leverage req->result to distinguish a head that 7243 * failed itself from one failed because of another req, so 7244 * that the correct ret code can be set for it. Init the 7245 * result here to avoid affecting the normal path. 7246 */ 7247 if (!(link->head->flags & REQ_F_FAIL)) 7248 req_fail_link_node(link->head, -ECANCELED); 7249 } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7250 /* 7251 * the current req is a normal req; return the 7252 * error and thus break the submission loop. 7253 */ 7254 io_req_complete_failed(req, ret); 7255 return ret; 7256 } 7257 req_fail_link_node(req, ret); 7258 } else { 7259 ret = io_req_prep(req, sqe); 7260 if (unlikely(ret)) 7261 goto fail_req; 7262 } 7263 7264 /* don't need @sqe from now on */ 7265 trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, 7266 req->flags, true, 7267 ctx->flags & IORING_SETUP_SQPOLL); 7268 7269 /* 7270 * If we already have a head request, queue this one for async 7271 * submittal once the head completes. If we don't have a head but 7272 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be 7273 * submitted sync once the chain is complete. If none of those 7274 * conditions are true (normal request), then just queue it. 7275 */ 7276 if (link->head) { 7277 struct io_kiocb *head = link->head; 7278 7279 if (!(req->flags & REQ_F_FAIL)) { 7280 ret = io_req_prep_async(req); 7281 if (unlikely(ret)) { 7282 req_fail_link_node(req, ret); 7283 if (!(head->flags & REQ_F_FAIL)) 7284 req_fail_link_node(head, -ECANCELED); 7285 } 7286 } 7287 trace_io_uring_link(ctx, req, head); 7288 link->last->link = req; 7289 link->last = req; 7290 7291 /* last request of a link, enqueue the link */ 7292 if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { 7293 link->head = NULL; 7294 io_queue_sqe(head); 7295 } 7296 } else { 7297 if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { 7298 link->head = req; 7299 link->last = req; 7300 } else { 7301 io_queue_sqe(req); 7302 } 7303 } 7304 7305 return 0; 7306} 7307 7308/* 7309 * Batched submission is done, ensure local IO is flushed out. 7310 */ 7311static void io_submit_state_end(struct io_submit_state *state, 7312 struct io_ring_ctx *ctx) 7313{ 7314 if (state->link.head) 7315 io_queue_sqe(state->link.head); 7316 if (state->compl_nr) 7317 io_submit_flush_completions(ctx); 7318 if (state->plug_started) 7319 blk_finish_plug(&state->plug); 7320} 7321 7322/* 7323 * Start submission side cache.
7324 */ 7325static void io_submit_state_start(struct io_submit_state *state, 7326 unsigned int max_ios) 7327{ 7328 state->plug_started = false; 7329 state->ios_left = max_ios; 7330 /* set only head, no need to init link_last in advance */ 7331 state->link.head = NULL; 7332} 7333 7334static void io_commit_sqring(struct io_ring_ctx *ctx) 7335{ 7336 struct io_rings *rings = ctx->rings; 7337 7338 /* 7339 * Ensure any loads from the SQEs are done at this point, 7340 * since once we write the new head, the application could 7341 * write new data to them. 7342 */ 7343 smp_store_release(&rings->sq.head, ctx->cached_sq_head); 7344} 7345 7346/* 7347 * Fetch an sqe, if one is available. Note this returns a pointer to memory 7348 * that is mapped by userspace. This means that care needs to be taken to 7349 * ensure that reads are stable, as we cannot rely on userspace always 7350 * being a good citizen. If members of the sqe are validated and then later 7351 * used, it's important that those reads are done through READ_ONCE() to 7352 * prevent a re-load down the line. 7353 */ 7354static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) 7355{ 7356 unsigned head, mask = ctx->sq_entries - 1; 7357 unsigned sq_idx = ctx->cached_sq_head++ & mask; 7358 7359 /* 7360 * The cached sq head (or cq tail) serves two purposes: 7361 * 7362 * 1) allows us to batch the cost of updating the user visible 7363 * head updates. 7364 * 2) allows the kernel side to track the head on its own, even 7365 * though the application is the one updating it. 7366 */ 7367 head = READ_ONCE(ctx->sq_array[sq_idx]); 7368 if (likely(head < ctx->sq_entries)) 7369 return &ctx->sq_sqes[head]; 7370 7371 /* drop invalid entries */ 7372 ctx->cq_extra--; 7373 WRITE_ONCE(ctx->rings->sq_dropped, 7374 READ_ONCE(ctx->rings->sq_dropped) + 1); 7375 return NULL; 7376} 7377 7378static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 7379 __must_hold(&ctx->uring_lock) 7380{ 7381 int submitted = 0; 7382 7383 /* make sure SQ entry isn't read before tail */ 7384 nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); 7385 if (!percpu_ref_tryget_many(&ctx->refs, nr)) 7386 return -EAGAIN; 7387 io_get_task_refs(nr); 7388 7389 io_submit_state_start(&ctx->submit_state, nr); 7390 while (submitted < nr) { 7391 const struct io_uring_sqe *sqe; 7392 struct io_kiocb *req; 7393 7394 req = io_alloc_req(ctx); 7395 if (unlikely(!req)) { 7396 if (!submitted) 7397 submitted = -EAGAIN; 7398 break; 7399 } 7400 sqe = io_get_sqe(ctx); 7401 if (unlikely(!sqe)) { 7402 list_add(&req->inflight_entry, &ctx->submit_state.free_list); 7403 break; 7404 } 7405 /* will complete beyond this point, count as submitted */ 7406 submitted++; 7407 if (io_submit_sqe(ctx, req, sqe)) 7408 break; 7409 } 7410 7411 if (unlikely(submitted != nr)) { 7412 int ref_used = (submitted == -EAGAIN) ? 
0 : submitted; 7413 int unused = nr - ref_used; 7414 7415 current->io_uring->cached_refs += unused; 7416 percpu_ref_put_many(&ctx->refs, unused); 7417 } 7418 7419 io_submit_state_end(&ctx->submit_state, ctx); 7420 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 7421 io_commit_sqring(ctx); 7422 7423 return submitted; 7424} 7425 7426static inline bool io_sqd_events_pending(struct io_sq_data *sqd) 7427{ 7428 return READ_ONCE(sqd->state); 7429} 7430 7431static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) 7432{ 7433 /* Tell userspace we may need a wakeup call */ 7434 spin_lock(&ctx->completion_lock); 7435 WRITE_ONCE(ctx->rings->sq_flags, 7436 ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); 7437 spin_unlock(&ctx->completion_lock); 7438} 7439 7440static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) 7441{ 7442 spin_lock(&ctx->completion_lock); 7443 WRITE_ONCE(ctx->rings->sq_flags, 7444 ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); 7445 spin_unlock(&ctx->completion_lock); 7446} 7447 7448static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) 7449{ 7450 unsigned int to_submit; 7451 int ret = 0; 7452 7453 to_submit = io_sqring_entries(ctx); 7454 /* if we're handling multiple rings, cap submit size for fairness */ 7455 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 7456 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 7457 7458 if (!list_empty(&ctx->iopoll_list) || to_submit) { 7459 unsigned nr_events = 0; 7460 const struct cred *creds = NULL; 7461 7462 if (ctx->sq_creds != current_cred()) 7463 creds = override_creds(ctx->sq_creds); 7464 7465 mutex_lock(&ctx->uring_lock); 7466 if (!list_empty(&ctx->iopoll_list)) 7467 io_do_iopoll(ctx, &nr_events, 0); 7468 7469 /* 7470 * Don't submit if refs are dying, good for io_uring_register(), 7471 * but also it is relied upon by io_ring_exit_work() 7472 */ 7473 if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && 7474 !(ctx->flags & IORING_SETUP_R_DISABLED)) 7475 ret = io_submit_sqes(ctx, to_submit); 7476 mutex_unlock(&ctx->uring_lock); 7477 7478 if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) 7479 wake_up(&ctx->sqo_sq_wait); 7480 if (creds) 7481 revert_creds(creds); 7482 } 7483 7484 return ret; 7485} 7486 7487static void io_sqd_update_thread_idle(struct io_sq_data *sqd) 7488{ 7489 struct io_ring_ctx *ctx; 7490 unsigned sq_thread_idle = 0; 7491 7492 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7493 sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); 7494 sqd->sq_thread_idle = sq_thread_idle; 7495} 7496 7497static bool io_sqd_handle_event(struct io_sq_data *sqd) 7498{ 7499 bool did_sig = false; 7500 struct ksignal ksig; 7501 7502 if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || 7503 signal_pending(current)) { 7504 mutex_unlock(&sqd->lock); 7505 if (signal_pending(current)) 7506 did_sig = get_signal(&ksig); 7507 cond_resched(); 7508 mutex_lock(&sqd->lock); 7509 } 7510 return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7511} 7512 7513static int io_sq_thread(void *data) 7514{ 7515 struct io_sq_data *sqd = data; 7516 struct io_ring_ctx *ctx; 7517 unsigned long timeout = 0; 7518 char buf[TASK_COMM_LEN]; 7519 DEFINE_WAIT(wait); 7520 7521 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 7522 set_task_comm(current, buf); 7523 7524 if (sqd->sq_cpu != -1) 7525 set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); 7526 else 7527 set_cpus_allowed_ptr(current, cpu_online_mask); 7528 current->flags |= PF_NO_SETAFFINITY; 7529 7530 
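	/*
	 * Main SQPOLL loop: under sqd->lock, submit pending SQEs and reap
	 * iopoll completions for every ctx attached to this sqd. Keep
	 * spinning while there is work; once sq_thread_idle expires with
	 * nothing to do, set IORING_SQ_NEED_WAKEUP and sleep until the
	 * application (or a park/stop event) wakes the thread.
	 */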
mutex_lock(&sqd->lock); 7531 while (1) { 7532 bool cap_entries, sqt_spin = false; 7533 7534 if (io_sqd_events_pending(sqd) || signal_pending(current)) { 7535 if (io_sqd_handle_event(sqd)) 7536 break; 7537 timeout = jiffies + sqd->sq_thread_idle; 7538 } 7539 7540 cap_entries = !list_is_singular(&sqd->ctx_list); 7541 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7542 int ret = __io_sq_thread(ctx, cap_entries); 7543 7544 if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 7545 sqt_spin = true; 7546 } 7547 if (io_run_task_work()) 7548 sqt_spin = true; 7549 7550 if (sqt_spin || !time_after(jiffies, timeout)) { 7551 cond_resched(); 7552 if (sqt_spin) 7553 timeout = jiffies + sqd->sq_thread_idle; 7554 continue; 7555 } 7556 7557 prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); 7558 if (!io_sqd_events_pending(sqd) && !current->task_works) { 7559 bool needs_sched = true; 7560 7561 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 7562 io_ring_set_wakeup_flag(ctx); 7563 7564 if ((ctx->flags & IORING_SETUP_IOPOLL) && 7565 !list_empty_careful(&ctx->iopoll_list)) { 7566 needs_sched = false; 7567 break; 7568 } 7569 if (io_sqring_entries(ctx)) { 7570 needs_sched = false; 7571 break; 7572 } 7573 } 7574 7575 if (needs_sched) { 7576 mutex_unlock(&sqd->lock); 7577 schedule(); 7578 mutex_lock(&sqd->lock); 7579 } 7580 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7581 io_ring_clear_wakeup_flag(ctx); 7582 } 7583 7584 finish_wait(&sqd->wait, &wait); 7585 timeout = jiffies + sqd->sq_thread_idle; 7586 } 7587 7588 io_uring_cancel_generic(true, sqd); 7589 sqd->thread = NULL; 7590 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 7591 io_ring_set_wakeup_flag(ctx); 7592 io_run_task_work(); 7593 mutex_unlock(&sqd->lock); 7594 7595 complete(&sqd->exited); 7596 do_exit(0); 7597} 7598 7599struct io_wait_queue { 7600 struct wait_queue_entry wq; 7601 struct io_ring_ctx *ctx; 7602 unsigned cq_tail; 7603 unsigned nr_timeouts; 7604}; 7605 7606static inline bool io_should_wake(struct io_wait_queue *iowq) 7607{ 7608 struct io_ring_ctx *ctx = iowq->ctx; 7609 int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; 7610 7611 /* 7612 * Wake up if we have enough events, or if a timeout occurred since we 7613 * started waiting. For timeouts, we always want to return to userspace, 7614 * regardless of event count. 7615 */ 7616 return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; 7617} 7618 7619static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 7620 int wake_flags, void *key) 7621{ 7622 struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, 7623 wq); 7624 7625 /* 7626 * Cannot safely flush overflowed CQEs from here, ensure we wake up 7627 * the task, and the next invocation will do it. 
7628 */ 7629 if (io_should_wake(iowq) || test_bit(0, &iowq->ctx->check_cq_overflow)) 7630 return autoremove_wake_function(curr, mode, wake_flags, key); 7631 return -1; 7632} 7633 7634static int io_run_task_work_sig(void) 7635{ 7636 if (io_run_task_work()) 7637 return 1; 7638 if (!signal_pending(current)) 7639 return 0; 7640 if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 7641 return -ERESTARTSYS; 7642 return -EINTR; 7643} 7644 7645static bool current_pending_io(void) 7646{ 7647 struct io_uring_task *tctx = current->io_uring; 7648 7649 if (!tctx) 7650 return false; 7651 return percpu_counter_read_positive(&tctx->inflight); 7652} 7653 7654/* when returns >0, the caller should retry */ 7655static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 7656 struct io_wait_queue *iowq, 7657 ktime_t *timeout) 7658{ 7659 int io_wait, ret; 7660 7661 /* make sure we run task_work before checking for signals */ 7662 ret = io_run_task_work_sig(); 7663 if (ret || io_should_wake(iowq)) 7664 return ret; 7665 /* let the caller flush overflows, retry */ 7666 if (test_bit(0, &ctx->check_cq_overflow)) 7667 return 1; 7668 7669 /* 7670 * Mark us as being in io_wait if we have pending requests, so cpufreq 7671 * can take into account that the task is waiting for IO - turns out 7672 * to be important for low QD IO. 7673 */ 7674 io_wait = current->in_iowait; 7675 if (current_pending_io()) 7676 current->in_iowait = 1; 7677 ret = 1; 7678 if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) 7679 ret = -ETIME; 7680 current->in_iowait = io_wait; 7681 return ret; 7682} 7683 7684/* 7685 * Wait until events become available, if we don't already have some. The 7686 * application must reap them itself, as they reside on the shared cq ring. 7687 */ 7688static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, 7689 const sigset_t __user *sig, size_t sigsz, 7690 struct __kernel_timespec __user *uts) 7691{ 7692 struct io_wait_queue iowq; 7693 struct io_rings *rings = ctx->rings; 7694 ktime_t timeout = KTIME_MAX; 7695 int ret; 7696 7697 do { 7698 io_cqring_overflow_flush(ctx); 7699 if (io_cqring_events(ctx) >= min_events) 7700 return 0; 7701 if (!io_run_task_work()) 7702 break; 7703 } while (1); 7704 7705 if (uts) { 7706 struct timespec64 ts; 7707 7708 if (get_timespec64(&ts, uts)) 7709 return -EFAULT; 7710 timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); 7711 } 7712 7713 if (sig) { 7714#ifdef CONFIG_COMPAT 7715 if (in_compat_syscall()) 7716 ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, 7717 sigsz); 7718 else 7719#endif 7720 ret = set_user_sigmask(sig, sigsz); 7721 7722 if (ret) 7723 return ret; 7724 } 7725 7726 init_waitqueue_func_entry(&iowq.wq, io_wake_function); 7727 iowq.wq.private = current; 7728 INIT_LIST_HEAD(&iowq.wq.entry); 7729 iowq.ctx = ctx; 7730 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 7731 iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 7732 7733 trace_io_uring_cqring_wait(ctx, min_events); 7734 do { 7735 /* if we can't even flush overflow, don't wait for more */ 7736 if (!io_cqring_overflow_flush(ctx)) { 7737 ret = -EBUSY; 7738 break; 7739 } 7740 prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 7741 TASK_INTERRUPTIBLE); 7742 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); 7743 finish_wait(&ctx->cq_wait, &iowq.wq); 7744 cond_resched(); 7745 } while (ret > 0); 7746 7747 restore_saved_sigmask_unless(ret == -EINTR); 7748 7749 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; 7750} 7751 7752static void io_free_page_table(void **table, size_t size) 7753{ 7754 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7755 7756 for (i = 0; i < nr_tables; i++) 7757 kfree(table[i]); 7758 kfree(table); 7759} 7760 7761static void **io_alloc_page_table(size_t size) 7762{ 7763 unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); 7764 size_t init_size = size; 7765 void **table; 7766 7767 table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); 7768 if (!table) 7769 return NULL; 7770 7771 for (i = 0; i < nr_tables; i++) { 7772 unsigned int this_size = min_t(size_t, size, PAGE_SIZE); 7773 7774 table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); 7775 if (!table[i]) { 7776 io_free_page_table(table, init_size); 7777 return NULL; 7778 } 7779 size -= this_size; 7780 } 7781 return table; 7782} 7783 7784static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) 7785{ 7786 percpu_ref_exit(&ref_node->refs); 7787 kfree(ref_node); 7788} 7789 7790static void io_rsrc_node_ref_zero(struct percpu_ref *ref) 7791{ 7792 struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); 7793 struct io_ring_ctx *ctx = node->rsrc_data->ctx; 7794 unsigned long flags; 7795 bool first_add = false; 7796 unsigned long delay = HZ; 7797 7798 spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); 7799 node->done = true; 7800 7801 /* if we are mid-quiesce then do not delay */ 7802 if (node->rsrc_data->quiesce) 7803 delay = 0; 7804 7805 while (!list_empty(&ctx->rsrc_ref_list)) { 7806 node = list_first_entry(&ctx->rsrc_ref_list, 7807 struct io_rsrc_node, node); 7808 /* recycle ref nodes in order */ 7809 if (!node->done) 7810 break; 7811 list_del(&node->node); 7812 first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); 7813 } 7814 spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); 7815 7816 if (first_add) 7817 mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); 7818} 7819 7820static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) 7821{ 7822 struct io_rsrc_node *ref_node; 7823 7824 ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); 7825 if (!ref_node) 7826 return NULL; 7827 7828 if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, 7829 0, GFP_KERNEL)) { 7830 kfree(ref_node); 7831 return NULL; 7832 } 7833 INIT_LIST_HEAD(&ref_node->node); 7834 INIT_LIST_HEAD(&ref_node->rsrc_list); 7835 ref_node->done = false; 7836 return ref_node; 7837} 7838 7839static void io_rsrc_node_switch(struct io_ring_ctx *ctx, 7840 struct io_rsrc_data *data_to_kill) 7841{ 7842 WARN_ON_ONCE(!ctx->rsrc_backup_node); 7843 WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); 7844 7845 if (data_to_kill) { 7846 struct io_rsrc_node *rsrc_node = ctx->rsrc_node; 7847 7848 rsrc_node->rsrc_data = data_to_kill; 7849 spin_lock_irq(&ctx->rsrc_ref_lock); 7850 list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); 7851 spin_unlock_irq(&ctx->rsrc_ref_lock); 7852 7853 atomic_inc(&data_to_kill->refs); 7854 percpu_ref_kill(&rsrc_node->refs); 7855 ctx->rsrc_node = NULL; 7856 } 7857 7858 if (!ctx->rsrc_node) { 7859 ctx->rsrc_node = ctx->rsrc_backup_node; 7860 ctx->rsrc_backup_node = NULL; 7861 } 7862} 7863 7864static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) 7865{ 7866 if (ctx->rsrc_backup_node) 7867 return 0; 7868 ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); 7869 return ctx->rsrc_backup_node ? 
0 : -ENOMEM; 7870} 7871 7872static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) 7873{ 7874 int ret; 7875 7876 /* As we may drop ->uring_lock, other task may have started quiesce */ 7877 if (data->quiesce) 7878 return -ENXIO; 7879 7880 data->quiesce = true; 7881 do { 7882 ret = io_rsrc_node_switch_start(ctx); 7883 if (ret) 7884 break; 7885 io_rsrc_node_switch(ctx, data); 7886 7887 /* kill initial ref, already quiesced if zero */ 7888 if (atomic_dec_and_test(&data->refs)) 7889 break; 7890 mutex_unlock(&ctx->uring_lock); 7891 flush_delayed_work(&ctx->rsrc_put_work); 7892 ret = wait_for_completion_interruptible(&data->done); 7893 if (!ret) { 7894 mutex_lock(&ctx->uring_lock); 7895 if (atomic_read(&data->refs) > 0) { 7896 /* 7897 * it has been revived by another thread while 7898 * we were unlocked 7899 */ 7900 mutex_unlock(&ctx->uring_lock); 7901 } else { 7902 break; 7903 } 7904 } 7905 7906 atomic_inc(&data->refs); 7907 /* wait for all works potentially completing data->done */ 7908 flush_delayed_work(&ctx->rsrc_put_work); 7909 reinit_completion(&data->done); 7910 7911 ret = io_run_task_work_sig(); 7912 mutex_lock(&ctx->uring_lock); 7913 } while (ret >= 0); 7914 data->quiesce = false; 7915 7916 return ret; 7917} 7918 7919static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) 7920{ 7921 unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; 7922 unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; 7923 7924 return &data->tags[table_idx][off]; 7925} 7926 7927static void io_rsrc_data_free(struct io_rsrc_data *data) 7928{ 7929 size_t size = data->nr * sizeof(data->tags[0][0]); 7930 7931 if (data->tags) 7932 io_free_page_table((void **)data->tags, size); 7933 kfree(data); 7934} 7935 7936static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, 7937 u64 __user *utags, unsigned nr, 7938 struct io_rsrc_data **pdata) 7939{ 7940 struct io_rsrc_data *data; 7941 int ret = -ENOMEM; 7942 unsigned i; 7943 7944 data = kzalloc(sizeof(*data), GFP_KERNEL); 7945 if (!data) 7946 return -ENOMEM; 7947 data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); 7948 if (!data->tags) { 7949 kfree(data); 7950 return -ENOMEM; 7951 } 7952 7953 data->nr = nr; 7954 data->ctx = ctx; 7955 data->do_put = do_put; 7956 if (utags) { 7957 ret = -EFAULT; 7958 for (i = 0; i < nr; i++) { 7959 u64 *tag_slot = io_get_tag_slot(data, i); 7960 7961 if (copy_from_user(tag_slot, &utags[i], 7962 sizeof(*tag_slot))) 7963 goto fail; 7964 } 7965 } 7966 7967 atomic_set(&data->refs, 1); 7968 init_completion(&data->done); 7969 *pdata = data; 7970 return 0; 7971fail: 7972 io_rsrc_data_free(data); 7973 return ret; 7974} 7975 7976static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) 7977{ 7978 table->files = kvcalloc(nr_files, sizeof(table->files[0]), 7979 GFP_KERNEL_ACCOUNT); 7980 return !!table->files; 7981} 7982 7983static void io_free_file_tables(struct io_file_table *table) 7984{ 7985 kvfree(table->files); 7986 table->files = NULL; 7987} 7988 7989static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) 7990{ 7991#if defined(CONFIG_UNIX) 7992 if (ctx->ring_sock) { 7993 struct sock *sock = ctx->ring_sock->sk; 7994 struct sk_buff *skb; 7995 7996 while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) 7997 kfree_skb(skb); 7998 } 7999#else 8000 int i; 8001 8002 for (i = 0; i < ctx->nr_user_files; i++) { 8003 struct file *file; 8004 8005 file = io_file_from_index(ctx, i); 8006 if (file) 8007 fput(file); 8008 } 8009#endif 8010 
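	/*
	 * With CONFIG_UNIX the file references are dropped when the
	 * SCM_RIGHTS skbs queued on the ring socket above are freed;
	 * without it they were fput() directly. Either way the lookup
	 * table and rsrc data are no longer needed.
	 */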
io_free_file_tables(&ctx->file_table); 8011 io_rsrc_data_free(ctx->file_data); 8012 ctx->file_data = NULL; 8013 ctx->nr_user_files = 0; 8014} 8015 8016static int io_sqe_files_unregister(struct io_ring_ctx *ctx) 8017{ 8018 unsigned nr = ctx->nr_user_files; 8019 int ret; 8020 8021 if (!ctx->file_data) 8022 return -ENXIO; 8023 8024 /* 8025 * Quiesce may unlock ->uring_lock, and while it's not held 8026 * prevent new requests using the table. 8027 */ 8028 ctx->nr_user_files = 0; 8029 ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); 8030 ctx->nr_user_files = nr; 8031 if (!ret) 8032 __io_sqe_files_unregister(ctx); 8033 return ret; 8034} 8035 8036static void io_sq_thread_unpark(struct io_sq_data *sqd) 8037 __releases(&sqd->lock) 8038{ 8039 WARN_ON_ONCE(sqd->thread == current); 8040 8041 /* 8042 * Do the dance but not conditional clear_bit() because it'd race with 8043 * other threads incrementing park_pending and setting the bit. 8044 */ 8045 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 8046 if (atomic_dec_return(&sqd->park_pending)) 8047 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 8048 mutex_unlock(&sqd->lock); 8049} 8050 8051static void io_sq_thread_park(struct io_sq_data *sqd) 8052 __acquires(&sqd->lock) 8053{ 8054 WARN_ON_ONCE(sqd->thread == current); 8055 8056 atomic_inc(&sqd->park_pending); 8057 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 8058 mutex_lock(&sqd->lock); 8059 if (sqd->thread) 8060 wake_up_process(sqd->thread); 8061} 8062 8063static void io_sq_thread_stop(struct io_sq_data *sqd) 8064{ 8065 WARN_ON_ONCE(sqd->thread == current); 8066 WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); 8067 8068 set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 8069 mutex_lock(&sqd->lock); 8070 if (sqd->thread) 8071 wake_up_process(sqd->thread); 8072 mutex_unlock(&sqd->lock); 8073 wait_for_completion(&sqd->exited); 8074} 8075 8076static void io_put_sq_data(struct io_sq_data *sqd) 8077{ 8078 if (refcount_dec_and_test(&sqd->refs)) { 8079 WARN_ON_ONCE(atomic_read(&sqd->park_pending)); 8080 8081 io_sq_thread_stop(sqd); 8082 kfree(sqd); 8083 } 8084} 8085 8086static void io_sq_thread_finish(struct io_ring_ctx *ctx) 8087{ 8088 struct io_sq_data *sqd = ctx->sq_data; 8089 8090 if (sqd) { 8091 io_sq_thread_park(sqd); 8092 list_del_init(&ctx->sqd_list); 8093 io_sqd_update_thread_idle(sqd); 8094 io_sq_thread_unpark(sqd); 8095 8096 io_put_sq_data(sqd); 8097 ctx->sq_data = NULL; 8098 } 8099} 8100 8101static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) 8102{ 8103 struct io_ring_ctx *ctx_attach; 8104 struct io_sq_data *sqd; 8105 struct fd f; 8106 8107 f = fdget(p->wq_fd); 8108 if (!f.file) 8109 return ERR_PTR(-ENXIO); 8110 if (f.file->f_op != &io_uring_fops) { 8111 fdput(f); 8112 return ERR_PTR(-EINVAL); 8113 } 8114 8115 ctx_attach = f.file->private_data; 8116 sqd = ctx_attach->sq_data; 8117 if (!sqd) { 8118 fdput(f); 8119 return ERR_PTR(-EINVAL); 8120 } 8121 if (sqd->task_tgid != current->tgid) { 8122 fdput(f); 8123 return ERR_PTR(-EPERM); 8124 } 8125 8126 refcount_inc(&sqd->refs); 8127 fdput(f); 8128 return sqd; 8129} 8130 8131static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, 8132 bool *attached) 8133{ 8134 struct io_sq_data *sqd; 8135 8136 *attached = false; 8137 if (p->flags & IORING_SETUP_ATTACH_WQ) { 8138 sqd = io_attach_sq_data(p); 8139 if (!IS_ERR(sqd)) { 8140 *attached = true; 8141 return sqd; 8142 } 8143 /* fall through for EPERM case, setup new sqd/task */ 8144 if (PTR_ERR(sqd) != -EPERM) 8145 return sqd; 8146 } 8147 8148 sqd = 
kzalloc(sizeof(*sqd), GFP_KERNEL); 8149 if (!sqd) 8150 return ERR_PTR(-ENOMEM); 8151 8152 atomic_set(&sqd->park_pending, 0); 8153 refcount_set(&sqd->refs, 1); 8154 INIT_LIST_HEAD(&sqd->ctx_list); 8155 mutex_init(&sqd->lock); 8156 init_waitqueue_head(&sqd->wait); 8157 init_completion(&sqd->exited); 8158 return sqd; 8159} 8160 8161#if defined(CONFIG_UNIX) 8162/* 8163 * Ensure the UNIX gc is aware of our file set, so we are certain that 8164 * the io_uring can be safely unregistered on process exit, even if we have 8165 * loops in the file referencing. 8166 */ 8167static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) 8168{ 8169 struct sock *sk = ctx->ring_sock->sk; 8170 struct scm_fp_list *fpl; 8171 struct sk_buff *skb; 8172 int i, nr_files; 8173 8174 fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); 8175 if (!fpl) 8176 return -ENOMEM; 8177 8178 skb = alloc_skb(0, GFP_KERNEL); 8179 if (!skb) { 8180 kfree(fpl); 8181 return -ENOMEM; 8182 } 8183 8184 skb->sk = sk; 8185 skb->scm_io_uring = 1; 8186 8187 nr_files = 0; 8188 fpl->user = get_uid(current_user()); 8189 for (i = 0; i < nr; i++) { 8190 struct file *file = io_file_from_index(ctx, i + offset); 8191 8192 if (!file) 8193 continue; 8194 fpl->fp[nr_files] = get_file(file); 8195 unix_inflight(fpl->user, fpl->fp[nr_files]); 8196 nr_files++; 8197 } 8198 8199 if (nr_files) { 8200 fpl->max = SCM_MAX_FD; 8201 fpl->count = nr_files; 8202 UNIXCB(skb).fp = fpl; 8203 skb->destructor = unix_destruct_scm; 8204 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 8205 skb_queue_head(&sk->sk_receive_queue, skb); 8206 8207 for (i = 0; i < nr; i++) { 8208 struct file *file = io_file_from_index(ctx, i + offset); 8209 8210 if (file) 8211 fput(file); 8212 } 8213 } else { 8214 kfree_skb(skb); 8215 free_uid(fpl->user); 8216 kfree(fpl); 8217 } 8218 8219 return 0; 8220} 8221 8222/* 8223 * If UNIX sockets are enabled, fd passing can cause a reference cycle which 8224 * causes regular reference counting to break down. We rely on the UNIX 8225 * garbage collection to take care of this problem for us. 8226 */ 8227static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8228{ 8229 unsigned left, total; 8230 int ret = 0; 8231 8232 total = 0; 8233 left = ctx->nr_user_files; 8234 while (left) { 8235 unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); 8236 8237 ret = __io_sqe_files_scm(ctx, this_files, total); 8238 if (ret) 8239 break; 8240 left -= this_files; 8241 total += this_files; 8242 } 8243 8244 if (!ret) 8245 return 0; 8246 8247 while (total < ctx->nr_user_files) { 8248 struct file *file = io_file_from_index(ctx, total); 8249 8250 if (file) 8251 fput(file); 8252 total++; 8253 } 8254 8255 return ret; 8256} 8257#else 8258static int io_sqe_files_scm(struct io_ring_ctx *ctx) 8259{ 8260 return 0; 8261} 8262#endif 8263 8264static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8265{ 8266 struct file *file = prsrc->file; 8267#if defined(CONFIG_UNIX) 8268 struct sock *sock = ctx->ring_sock->sk; 8269 struct sk_buff_head list, *head = &sock->sk_receive_queue; 8270 struct sk_buff *skb; 8271 int i; 8272 8273 __skb_queue_head_init(&list); 8274 8275 /* 8276 * Find the skb that holds this file in its SCM_RIGHTS. When found, 8277 * remove this entry and rearrange the file array. 
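 *
 * For example, if a queued skb carries fp->fp[] == {A, B, C} and B is
 * the file being put, B is marked not inflight, C is shifted down with
 * memmove() and fp->count drops to 2. Only when fp->count reaches zero
 * is the skb itself freed; otherwise it is put back on the ring
 * socket's receive queue afterwards.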
8278 */ 8279 skb = skb_dequeue(head); 8280 while (skb) { 8281 struct scm_fp_list *fp; 8282 8283 fp = UNIXCB(skb).fp; 8284 for (i = 0; i < fp->count; i++) { 8285 int left; 8286 8287 if (fp->fp[i] != file) 8288 continue; 8289 8290 unix_notinflight(fp->user, fp->fp[i]); 8291 left = fp->count - 1 - i; 8292 if (left) { 8293 memmove(&fp->fp[i], &fp->fp[i + 1], 8294 left * sizeof(struct file *)); 8295 } 8296 fp->count--; 8297 if (!fp->count) { 8298 kfree_skb(skb); 8299 skb = NULL; 8300 } else { 8301 __skb_queue_tail(&list, skb); 8302 } 8303 fput(file); 8304 file = NULL; 8305 break; 8306 } 8307 8308 if (!file) 8309 break; 8310 8311 __skb_queue_tail(&list, skb); 8312 8313 skb = skb_dequeue(head); 8314 } 8315 8316 if (skb_peek(&list)) { 8317 spin_lock_irq(&head->lock); 8318 while ((skb = __skb_dequeue(&list)) != NULL) 8319 __skb_queue_tail(head, skb); 8320 spin_unlock_irq(&head->lock); 8321 } 8322#else 8323 fput(file); 8324#endif 8325} 8326 8327static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) 8328{ 8329 struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; 8330 struct io_ring_ctx *ctx = rsrc_data->ctx; 8331 struct io_rsrc_put *prsrc, *tmp; 8332 8333 list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { 8334 list_del(&prsrc->list); 8335 8336 if (prsrc->tag) { 8337 bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; 8338 8339 io_ring_submit_lock(ctx, lock_ring); 8340 spin_lock(&ctx->completion_lock); 8341 io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); 8342 io_commit_cqring(ctx); 8343 spin_unlock(&ctx->completion_lock); 8344 io_cqring_ev_posted(ctx); 8345 io_ring_submit_unlock(ctx, lock_ring); 8346 } 8347 8348 rsrc_data->do_put(ctx, prsrc); 8349 kfree(prsrc); 8350 } 8351 8352 io_rsrc_node_destroy(ref_node); 8353 if (atomic_dec_and_test(&rsrc_data->refs)) 8354 complete(&rsrc_data->done); 8355} 8356 8357static void io_rsrc_put_work(struct work_struct *work) 8358{ 8359 struct io_ring_ctx *ctx; 8360 struct llist_node *node; 8361 8362 ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); 8363 node = llist_del_all(&ctx->rsrc_put_llist); 8364 8365 while (node) { 8366 struct io_rsrc_node *ref_node; 8367 struct llist_node *next = node->next; 8368 8369 ref_node = llist_entry(node, struct io_rsrc_node, llist); 8370 __io_rsrc_put_work(ref_node); 8371 node = next; 8372 } 8373} 8374 8375static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, 8376 unsigned nr_args, u64 __user *tags) 8377{ 8378 __s32 __user *fds = (__s32 __user *) arg; 8379 struct file *file; 8380 int fd, ret; 8381 unsigned i; 8382 8383 if (ctx->file_data) 8384 return -EBUSY; 8385 if (!nr_args) 8386 return -EINVAL; 8387 if (nr_args > IORING_MAX_FIXED_FILES) 8388 return -EMFILE; 8389 if (nr_args > rlimit(RLIMIT_NOFILE)) 8390 return -EMFILE; 8391 ret = io_rsrc_node_switch_start(ctx); 8392 if (ret) 8393 return ret; 8394 ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, 8395 &ctx->file_data); 8396 if (ret) 8397 return ret; 8398 8399 ret = -ENOMEM; 8400 if (!io_alloc_file_tables(&ctx->file_table, nr_args)) 8401 goto out_free; 8402 8403 for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { 8404 if (copy_from_user(&fd, &fds[i], sizeof(fd))) { 8405 ret = -EFAULT; 8406 goto out_fput; 8407 } 8408 /* allow sparse sets */ 8409 if (fd == -1) { 8410 ret = -EINVAL; 8411 if (unlikely(*io_get_tag_slot(ctx->file_data, i))) 8412 goto out_fput; 8413 continue; 8414 } 8415 8416 file = fget(fd); 8417 ret = -EBADF; 8418 if (unlikely(!file)) 8419 goto out_fput; 8420 8421 /* 8422 * Don't allow io_uring instances to 
be registered. If UNIX 8423 * isn't enabled, then this causes a reference cycle and this 8424 * instance can never get freed. If UNIX is enabled we'll 8425 * handle it just fine, but there's still no point in allowing 8426 * a ring fd as it doesn't support regular read/write anyway. 8427 */ 8428 if (file->f_op == &io_uring_fops) { 8429 fput(file); 8430 goto out_fput; 8431 } 8432 io_fixed_file_set(io_fixed_file_slot(&ctx->file_table, i), file); 8433 } 8434 8435 ret = io_sqe_files_scm(ctx); 8436 if (ret) { 8437 __io_sqe_files_unregister(ctx); 8438 return ret; 8439 } 8440 8441 io_rsrc_node_switch(ctx, NULL); 8442 return ret; 8443out_fput: 8444 for (i = 0; i < ctx->nr_user_files; i++) { 8445 file = io_file_from_index(ctx, i); 8446 if (file) 8447 fput(file); 8448 } 8449 io_free_file_tables(&ctx->file_table); 8450 ctx->nr_user_files = 0; 8451out_free: 8452 io_rsrc_data_free(ctx->file_data); 8453 ctx->file_data = NULL; 8454 return ret; 8455} 8456 8457static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, 8458 struct io_rsrc_node *node, void *rsrc) 8459{ 8460 u64 *tag_slot = io_get_tag_slot(data, idx); 8461 struct io_rsrc_put *prsrc; 8462 8463 prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); 8464 if (!prsrc) 8465 return -ENOMEM; 8466 8467 prsrc->tag = *tag_slot; 8468 *tag_slot = 0; 8469 prsrc->rsrc = rsrc; 8470 list_add(&prsrc->list, &node->rsrc_list); 8471 return 0; 8472} 8473 8474static int io_install_fixed_file(struct io_kiocb *req, struct file *file, 8475 unsigned int issue_flags, u32 slot_index) 8476{ 8477 struct io_ring_ctx *ctx = req->ctx; 8478 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 8479 bool needs_switch = false; 8480 struct io_fixed_file *file_slot; 8481 int ret = -EBADF; 8482 8483 io_ring_submit_lock(ctx, !force_nonblock); 8484 if (file->f_op == &io_uring_fops) 8485 goto err; 8486 ret = -ENXIO; 8487 if (!ctx->file_data) 8488 goto err; 8489 ret = -EINVAL; 8490 if (slot_index >= ctx->nr_user_files) 8491 goto err; 8492 8493 slot_index = array_index_nospec(slot_index, ctx->nr_user_files); 8494 file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); 8495 8496 if (file_slot->file_ptr) { 8497 struct file *old_file; 8498 8499 ret = io_rsrc_node_switch_start(ctx); 8500 if (ret) 8501 goto err; 8502 8503 old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8504 ret = io_queue_rsrc_removal(ctx->file_data, slot_index, 8505 ctx->rsrc_node, old_file); 8506 if (ret) 8507 goto err; 8508 file_slot->file_ptr = 0; 8509 needs_switch = true; 8510 } 8511 8512 *io_get_tag_slot(ctx->file_data, slot_index) = 0; 8513 io_fixed_file_set(file_slot, file); 8514 ret = 0; 8515err: 8516 if (needs_switch) 8517 io_rsrc_node_switch(ctx, ctx->file_data); 8518 io_ring_submit_unlock(ctx, !force_nonblock); 8519 if (ret) 8520 fput(file); 8521 return ret; 8522} 8523 8524static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) 8525{ 8526 unsigned int offset = req->close.file_slot - 1; 8527 struct io_ring_ctx *ctx = req->ctx; 8528 struct io_fixed_file *file_slot; 8529 struct file *file; 8530 int ret; 8531 8532 io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 8533 ret = -ENXIO; 8534 if (unlikely(!ctx->file_data)) 8535 goto out; 8536 ret = -EINVAL; 8537 if (offset >= ctx->nr_user_files) 8538 goto out; 8539 ret = io_rsrc_node_switch_start(ctx); 8540 if (ret) 8541 goto out; 8542 8543 offset = array_index_nospec(offset, ctx->nr_user_files); 8544 file_slot = io_fixed_file_slot(&ctx->file_table, offset); 8545 ret = -EBADF; 8546 if (!file_slot->file_ptr) 8547 
goto out; 8548 8549 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8550 ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); 8551 if (ret) 8552 goto out; 8553 8554 file_slot->file_ptr = 0; 8555 io_rsrc_node_switch(ctx, ctx->file_data); 8556 ret = 0; 8557out: 8558 io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); 8559 return ret; 8560} 8561 8562static int __io_sqe_files_update(struct io_ring_ctx *ctx, 8563 struct io_uring_rsrc_update2 *up, 8564 unsigned nr_args) 8565{ 8566 u64 __user *tags = u64_to_user_ptr(up->tags); 8567 __s32 __user *fds = u64_to_user_ptr(up->data); 8568 struct io_rsrc_data *data = ctx->file_data; 8569 struct io_fixed_file *file_slot; 8570 struct file *file; 8571 int fd, i, err = 0; 8572 unsigned int done; 8573 bool needs_switch = false; 8574 8575 if (!ctx->file_data) 8576 return -ENXIO; 8577 if (up->offset + nr_args > ctx->nr_user_files) 8578 return -EINVAL; 8579 8580 for (done = 0; done < nr_args; done++) { 8581 u64 tag = 0; 8582 8583 if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || 8584 copy_from_user(&fd, &fds[done], sizeof(fd))) { 8585 err = -EFAULT; 8586 break; 8587 } 8588 if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { 8589 err = -EINVAL; 8590 break; 8591 } 8592 if (fd == IORING_REGISTER_FILES_SKIP) 8593 continue; 8594 8595 i = array_index_nospec(up->offset + done, ctx->nr_user_files); 8596 file_slot = io_fixed_file_slot(&ctx->file_table, i); 8597 8598 if (file_slot->file_ptr) { 8599 file = (struct file *)(file_slot->file_ptr & FFS_MASK); 8600 err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); 8601 if (err) 8602 break; 8603 file_slot->file_ptr = 0; 8604 needs_switch = true; 8605 } 8606 if (fd != -1) { 8607 file = fget(fd); 8608 if (!file) { 8609 err = -EBADF; 8610 break; 8611 } 8612 /* 8613 * Don't allow io_uring instances to be registered. If 8614 * UNIX isn't enabled, then this causes a reference 8615 * cycle and this instance can never get freed. If UNIX 8616 * is enabled we'll handle it just fine, but there's 8617 * still no point in allowing a ring fd as it doesn't 8618 * support regular read/write anyway. 8619 */ 8620 if (file->f_op == &io_uring_fops) { 8621 fput(file); 8622 err = -EBADF; 8623 break; 8624 } 8625 *io_get_tag_slot(data, i) = tag; 8626 io_fixed_file_set(file_slot, file); 8627 } 8628 } 8629 8630 if (needs_switch) 8631 io_rsrc_node_switch(ctx, data); 8632 return done ? 
done : err; 8633} 8634 8635static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, 8636 struct task_struct *task) 8637{ 8638 struct io_wq_hash *hash; 8639 struct io_wq_data data; 8640 unsigned int concurrency; 8641 8642 mutex_lock(&ctx->uring_lock); 8643 hash = ctx->hash_map; 8644 if (!hash) { 8645 hash = kzalloc(sizeof(*hash), GFP_KERNEL); 8646 if (!hash) { 8647 mutex_unlock(&ctx->uring_lock); 8648 return ERR_PTR(-ENOMEM); 8649 } 8650 refcount_set(&hash->refs, 1); 8651 init_waitqueue_head(&hash->wait); 8652 ctx->hash_map = hash; 8653 } 8654 mutex_unlock(&ctx->uring_lock); 8655 8656 data.hash = hash; 8657 data.task = task; 8658 data.free_work = io_wq_free_work; 8659 data.do_work = io_wq_submit_work; 8660 8661 /* Do QD, or 4 * CPUS, whatever is smallest */ 8662 concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); 8663 8664 return io_wq_create(concurrency, &data); 8665} 8666 8667static int io_uring_alloc_task_context(struct task_struct *task, 8668 struct io_ring_ctx *ctx) 8669{ 8670 struct io_uring_task *tctx; 8671 int ret; 8672 8673 tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); 8674 if (unlikely(!tctx)) 8675 return -ENOMEM; 8676 8677 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 8678 if (unlikely(ret)) { 8679 kfree(tctx); 8680 return ret; 8681 } 8682 8683 tctx->io_wq = io_init_wq_offload(ctx, task); 8684 if (IS_ERR(tctx->io_wq)) { 8685 ret = PTR_ERR(tctx->io_wq); 8686 percpu_counter_destroy(&tctx->inflight); 8687 kfree(tctx); 8688 return ret; 8689 } 8690 8691 xa_init(&tctx->xa); 8692 init_waitqueue_head(&tctx->wait); 8693 atomic_set(&tctx->in_idle, 0); 8694 atomic_set(&tctx->inflight_tracked, 0); 8695 task->io_uring = tctx; 8696 spin_lock_init(&tctx->task_lock); 8697 INIT_WQ_LIST(&tctx->task_list); 8698 init_task_work(&tctx->task_work, tctx_task_work); 8699 return 0; 8700} 8701 8702void __io_uring_free(struct task_struct *tsk) 8703{ 8704 struct io_uring_task *tctx = tsk->io_uring; 8705 8706 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 8707 WARN_ON_ONCE(tctx->io_wq); 8708 WARN_ON_ONCE(tctx->cached_refs); 8709 8710 percpu_counter_destroy(&tctx->inflight); 8711 kfree(tctx); 8712 tsk->io_uring = NULL; 8713} 8714 8715static int io_sq_offload_create(struct io_ring_ctx *ctx, 8716 struct io_uring_params *p) 8717{ 8718 int ret; 8719 8720 /* Retain compatibility with failing for an invalid attach attempt */ 8721 if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == 8722 IORING_SETUP_ATTACH_WQ) { 8723 struct fd f; 8724 8725 f = fdget(p->wq_fd); 8726 if (!f.file) 8727 return -ENXIO; 8728 if (f.file->f_op != &io_uring_fops) { 8729 fdput(f); 8730 return -EINVAL; 8731 } 8732 fdput(f); 8733 } 8734 if (ctx->flags & IORING_SETUP_SQPOLL) { 8735 struct task_struct *tsk; 8736 struct io_sq_data *sqd; 8737 bool attached; 8738 8739 sqd = io_get_sq_data(p, &attached); 8740 if (IS_ERR(sqd)) { 8741 ret = PTR_ERR(sqd); 8742 goto err; 8743 } 8744 8745 ctx->sq_creds = get_current_cred(); 8746 ctx->sq_data = sqd; 8747 ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); 8748 if (!ctx->sq_thread_idle) 8749 ctx->sq_thread_idle = HZ; 8750 8751 io_sq_thread_park(sqd); 8752 list_add(&ctx->sqd_list, &sqd->ctx_list); 8753 io_sqd_update_thread_idle(sqd); 8754 /* don't attach to a dying SQPOLL thread, would be racy */ 8755 ret = (attached && !sqd->thread) ? 
-ENXIO : 0; 8756 io_sq_thread_unpark(sqd); 8757 8758 if (ret < 0) 8759 goto err; 8760 if (attached) 8761 return 0; 8762 8763 if (p->flags & IORING_SETUP_SQ_AFF) { 8764 int cpu = p->sq_thread_cpu; 8765 8766 ret = -EINVAL; 8767 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) 8768 goto err_sqpoll; 8769 sqd->sq_cpu = cpu; 8770 } else { 8771 sqd->sq_cpu = -1; 8772 } 8773 8774 sqd->task_pid = current->pid; 8775 sqd->task_tgid = current->tgid; 8776 tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 8777 if (IS_ERR(tsk)) { 8778 ret = PTR_ERR(tsk); 8779 goto err_sqpoll; 8780 } 8781 8782 sqd->thread = tsk; 8783 ret = io_uring_alloc_task_context(tsk, ctx); 8784 wake_up_new_task(tsk); 8785 if (ret) 8786 goto err; 8787 } else if (p->flags & IORING_SETUP_SQ_AFF) { 8788 /* Can't have SQ_AFF without SQPOLL */ 8789 ret = -EINVAL; 8790 goto err; 8791 } 8792 8793 return 0; 8794err_sqpoll: 8795 complete(&ctx->sq_data->exited); 8796err: 8797 io_sq_thread_finish(ctx); 8798 return ret; 8799} 8800 8801static inline void __io_unaccount_mem(struct user_struct *user, 8802 unsigned long nr_pages) 8803{ 8804 atomic_long_sub(nr_pages, &user->locked_vm); 8805} 8806 8807static inline int __io_account_mem(struct user_struct *user, 8808 unsigned long nr_pages) 8809{ 8810 unsigned long page_limit, cur_pages, new_pages; 8811 8812 /* Don't allow more pages than we can safely lock */ 8813 page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 8814 8815 do { 8816 cur_pages = atomic_long_read(&user->locked_vm); 8817 new_pages = cur_pages + nr_pages; 8818 if (new_pages > page_limit) 8819 return -ENOMEM; 8820 } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, 8821 new_pages) != cur_pages); 8822 8823 return 0; 8824} 8825 8826static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8827{ 8828 if (ctx->user) 8829 __io_unaccount_mem(ctx->user, nr_pages); 8830 8831 if (ctx->mm_account) 8832 atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 8833} 8834 8835static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 8836{ 8837 int ret; 8838 8839 if (ctx->user) { 8840 ret = __io_account_mem(ctx->user, nr_pages); 8841 if (ret) 8842 return ret; 8843 } 8844 8845 if (ctx->mm_account) 8846 atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 8847 8848 return 0; 8849} 8850 8851static void io_mem_free(void *ptr) 8852{ 8853 struct page *page; 8854 8855 if (!ptr) 8856 return; 8857 8858 page = virt_to_head_page(ptr); 8859 if (put_page_testzero(page)) 8860 free_compound_page(page); 8861} 8862 8863static void *io_mem_alloc(size_t size) 8864{ 8865 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; 8866 8867 return (void *) __get_free_pages(gfp, get_order(size)); 8868} 8869 8870static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, 8871 size_t *sq_offset) 8872{ 8873 struct io_rings *rings; 8874 size_t off, sq_array_size; 8875 8876 off = struct_size(rings, cqes, cq_entries); 8877 if (off == SIZE_MAX) 8878 return SIZE_MAX; 8879 8880#ifdef CONFIG_SMP 8881 off = ALIGN(off, SMP_CACHE_BYTES); 8882 if (off == 0) 8883 return SIZE_MAX; 8884#endif 8885 8886 if (sq_offset) 8887 *sq_offset = off; 8888 8889 sq_array_size = array_size(sizeof(u32), sq_entries); 8890 if (sq_array_size == SIZE_MAX) 8891 return SIZE_MAX; 8892 8893 if (check_add_overflow(off, sq_array_size, &off)) 8894 return SIZE_MAX; 8895 8896 return off; 8897} 8898 8899static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) 8900{ 8901 struct io_mapped_ubuf *imu = *slot; 8902 unsigned int i; 
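	/*
	 * ctx->dummy_ubuf is the shared sentinel installed for sparse (NULL
	 * iovec) registrations, see io_sqe_buffer_register(). It has no
	 * pinned pages and is freed once in io_ring_ctx_free(), so it must
	 * not be unpinned or freed here.
	 */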
8903 8904 if (imu != ctx->dummy_ubuf) { 8905 for (i = 0; i < imu->nr_bvecs; i++) 8906 unpin_user_page(imu->bvec[i].bv_page); 8907 if (imu->acct_pages) 8908 io_unaccount_mem(ctx, imu->acct_pages); 8909 kvfree(imu); 8910 } 8911 *slot = NULL; 8912} 8913 8914static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) 8915{ 8916 io_buffer_unmap(ctx, &prsrc->buf); 8917 prsrc->buf = NULL; 8918} 8919 8920static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8921{ 8922 unsigned int i; 8923 8924 for (i = 0; i < ctx->nr_user_bufs; i++) 8925 io_buffer_unmap(ctx, &ctx->user_bufs[i]); 8926 kfree(ctx->user_bufs); 8927 io_rsrc_data_free(ctx->buf_data); 8928 ctx->user_bufs = NULL; 8929 ctx->buf_data = NULL; 8930 ctx->nr_user_bufs = 0; 8931} 8932 8933static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) 8934{ 8935 unsigned nr = ctx->nr_user_bufs; 8936 int ret; 8937 8938 if (!ctx->buf_data) 8939 return -ENXIO; 8940 8941 /* 8942 * Quiesce may unlock ->uring_lock, and while it's not held 8943 * prevent new requests using the table. 8944 */ 8945 ctx->nr_user_bufs = 0; 8946 ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); 8947 ctx->nr_user_bufs = nr; 8948 if (!ret) 8949 __io_sqe_buffers_unregister(ctx); 8950 return ret; 8951} 8952 8953static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, 8954 void __user *arg, unsigned index) 8955{ 8956 struct iovec __user *src; 8957 8958#ifdef CONFIG_COMPAT 8959 if (ctx->compat) { 8960 struct compat_iovec __user *ciovs; 8961 struct compat_iovec ciov; 8962 8963 ciovs = (struct compat_iovec __user *) arg; 8964 if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) 8965 return -EFAULT; 8966 8967 dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); 8968 dst->iov_len = ciov.iov_len; 8969 return 0; 8970 } 8971#endif 8972 src = (struct iovec __user *) arg; 8973 if (copy_from_user(dst, &src[index], sizeof(*dst))) 8974 return -EFAULT; 8975 return 0; 8976} 8977 8978/* 8979 * Not super efficient, but this is just a registration time. And we do cache 8980 * the last compound head, so generally we'll only do a full search if we don't 8981 * match that one. 8982 * 8983 * We check if the given compound head page has already been accounted, to 8984 * avoid double accounting it. This allows us to account the full size of the 8985 * page, not just the constituent pages of a huge page. 
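 *
 * For instance, a buffer backed by a single 2MB huge page adds
 * page_size(hpage) >> PAGE_SHIFT pages (512 with 4K base pages) to
 * imu->acct_pages exactly once, no matter how many of its tail pages
 * show up in this bvec array or in previously registered buffers.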
8986 */ 8987static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, 8988 int nr_pages, struct page *hpage) 8989{ 8990 int i, j; 8991 8992 /* check current page array */ 8993 for (i = 0; i < nr_pages; i++) { 8994 if (!PageCompound(pages[i])) 8995 continue; 8996 if (compound_head(pages[i]) == hpage) 8997 return true; 8998 } 8999 9000 /* check previously registered pages */ 9001 for (i = 0; i < ctx->nr_user_bufs; i++) { 9002 struct io_mapped_ubuf *imu = ctx->user_bufs[i]; 9003 9004 for (j = 0; j < imu->nr_bvecs; j++) { 9005 if (!PageCompound(imu->bvec[j].bv_page)) 9006 continue; 9007 if (compound_head(imu->bvec[j].bv_page) == hpage) 9008 return true; 9009 } 9010 } 9011 9012 return false; 9013} 9014 9015static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, 9016 int nr_pages, struct io_mapped_ubuf *imu, 9017 struct page **last_hpage) 9018{ 9019 int i, ret; 9020 9021 imu->acct_pages = 0; 9022 for (i = 0; i < nr_pages; i++) { 9023 if (!PageCompound(pages[i])) { 9024 imu->acct_pages++; 9025 } else { 9026 struct page *hpage; 9027 9028 hpage = compound_head(pages[i]); 9029 if (hpage == *last_hpage) 9030 continue; 9031 *last_hpage = hpage; 9032 if (headpage_already_acct(ctx, pages, i, hpage)) 9033 continue; 9034 imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; 9035 } 9036 } 9037 9038 if (!imu->acct_pages) 9039 return 0; 9040 9041 ret = io_account_mem(ctx, imu->acct_pages); 9042 if (ret) 9043 imu->acct_pages = 0; 9044 return ret; 9045} 9046 9047static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, 9048 struct io_mapped_ubuf **pimu, 9049 struct page **last_hpage) 9050{ 9051 struct io_mapped_ubuf *imu = NULL; 9052 struct vm_area_struct **vmas = NULL; 9053 struct page **pages = NULL; 9054 unsigned long off, start, end, ubuf; 9055 size_t size; 9056 int ret, pret, nr_pages, i; 9057 9058 if (!iov->iov_base) { 9059 *pimu = ctx->dummy_ubuf; 9060 return 0; 9061 } 9062 9063 ubuf = (unsigned long) iov->iov_base; 9064 end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; 9065 start = ubuf >> PAGE_SHIFT; 9066 nr_pages = end - start; 9067 9068 *pimu = NULL; 9069 ret = -ENOMEM; 9070 9071 pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 9072 if (!pages) 9073 goto done; 9074 9075 vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), 9076 GFP_KERNEL); 9077 if (!vmas) 9078 goto done; 9079 9080 imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); 9081 if (!imu) 9082 goto done; 9083 9084 ret = 0; 9085 mmap_read_lock(current->mm); 9086 pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 9087 pages, vmas); 9088 if (pret == nr_pages) { 9089 struct file *file = vmas[0]->vm_file; 9090 9091 /* don't support file backed memory */ 9092 for (i = 0; i < nr_pages; i++) { 9093 if (vmas[i]->vm_file != file) { 9094 ret = -EINVAL; 9095 break; 9096 } 9097 if (!file) 9098 continue; 9099 if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) { 9100 ret = -EOPNOTSUPP; 9101 break; 9102 } 9103 } 9104 } else { 9105 ret = pret < 0 ? 
pret : -EFAULT; 9106 } 9107 mmap_read_unlock(current->mm); 9108 if (ret) { 9109 /* 9110 * if we did partial map, or found file backed vmas, 9111 * release any pages we did get 9112 */ 9113 if (pret > 0) 9114 unpin_user_pages(pages, pret); 9115 goto done; 9116 } 9117 9118 ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage); 9119 if (ret) { 9120 unpin_user_pages(pages, pret); 9121 goto done; 9122 } 9123 9124 off = ubuf & ~PAGE_MASK; 9125 size = iov->iov_len; 9126 for (i = 0; i < nr_pages; i++) { 9127 size_t vec_len; 9128 9129 vec_len = min_t(size_t, size, PAGE_SIZE - off); 9130 imu->bvec[i].bv_page = pages[i]; 9131 imu->bvec[i].bv_len = vec_len; 9132 imu->bvec[i].bv_offset = off; 9133 off = 0; 9134 size -= vec_len; 9135 } 9136 /* store original address for later verification */ 9137 imu->ubuf = ubuf; 9138 imu->ubuf_end = ubuf + iov->iov_len; 9139 imu->nr_bvecs = nr_pages; 9140 *pimu = imu; 9141 ret = 0; 9142done: 9143 if (ret) 9144 kvfree(imu); 9145 kvfree(pages); 9146 kvfree(vmas); 9147 return ret; 9148} 9149 9150static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) 9151{ 9152 ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); 9153 return ctx->user_bufs ? 0 : -ENOMEM; 9154} 9155 9156static int io_buffer_validate(struct iovec *iov) 9157{ 9158 unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); 9159 9160 /* 9161 * Don't impose further limits on the size and buffer 9162 * constraints here, we'll -EINVAL later when IO is 9163 * submitted if they are wrong. 9164 */ 9165 if (!iov->iov_base) 9166 return iov->iov_len ? -EFAULT : 0; 9167 if (!iov->iov_len) 9168 return -EFAULT; 9169 9170 /* arbitrary limit, but we need something */ 9171 if (iov->iov_len > SZ_1G) 9172 return -EFAULT; 9173 9174 if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) 9175 return -EOVERFLOW; 9176 9177 return 0; 9178} 9179 9180static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, 9181 unsigned int nr_args, u64 __user *tags) 9182{ 9183 struct page *last_hpage = NULL; 9184 struct io_rsrc_data *data; 9185 int i, ret; 9186 struct iovec iov; 9187 9188 if (ctx->user_bufs) 9189 return -EBUSY; 9190 if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) 9191 return -EINVAL; 9192 ret = io_rsrc_node_switch_start(ctx); 9193 if (ret) 9194 return ret; 9195 ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); 9196 if (ret) 9197 return ret; 9198 ret = io_buffers_map_alloc(ctx, nr_args); 9199 if (ret) { 9200 io_rsrc_data_free(data); 9201 return ret; 9202 } 9203 9204 for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { 9205 ret = io_copy_iov(ctx, &iov, arg, i); 9206 if (ret) 9207 break; 9208 ret = io_buffer_validate(&iov); 9209 if (ret) 9210 break; 9211 if (!iov.iov_base && *io_get_tag_slot(data, i)) { 9212 ret = -EINVAL; 9213 break; 9214 } 9215 9216 ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], 9217 &last_hpage); 9218 if (ret) 9219 break; 9220 } 9221 9222 WARN_ON_ONCE(ctx->buf_data); 9223 9224 ctx->buf_data = data; 9225 if (ret) 9226 __io_sqe_buffers_unregister(ctx); 9227 else 9228 io_rsrc_node_switch(ctx, NULL); 9229 return ret; 9230} 9231 9232static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, 9233 struct io_uring_rsrc_update2 *up, 9234 unsigned int nr_args) 9235{ 9236 u64 __user *tags = u64_to_user_ptr(up->tags); 9237 struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); 9238 struct page *last_hpage = NULL; 9239 bool needs_switch = false; 9240 __u32 done; 9241 int i, err; 9242 9243 if 
(!ctx->buf_data) 9244 return -ENXIO; 9245 if (up->offset + nr_args > ctx->nr_user_bufs) 9246 return -EINVAL; 9247 9248 for (done = 0; done < nr_args; done++) { 9249 struct io_mapped_ubuf *imu; 9250 int offset = up->offset + done; 9251 u64 tag = 0; 9252 9253 err = io_copy_iov(ctx, &iov, iovs, done); 9254 if (err) 9255 break; 9256 if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { 9257 err = -EFAULT; 9258 break; 9259 } 9260 err = io_buffer_validate(&iov); 9261 if (err) 9262 break; 9263 if (!iov.iov_base && tag) { 9264 err = -EINVAL; 9265 break; 9266 } 9267 err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); 9268 if (err) 9269 break; 9270 9271 i = array_index_nospec(offset, ctx->nr_user_bufs); 9272 if (ctx->user_bufs[i] != ctx->dummy_ubuf) { 9273 err = io_queue_rsrc_removal(ctx->buf_data, i, 9274 ctx->rsrc_node, ctx->user_bufs[i]); 9275 if (unlikely(err)) { 9276 io_buffer_unmap(ctx, &imu); 9277 break; 9278 } 9279 ctx->user_bufs[i] = NULL; 9280 needs_switch = true; 9281 } 9282 9283 ctx->user_bufs[i] = imu; 9284 *io_get_tag_slot(ctx->buf_data, offset) = tag; 9285 } 9286 9287 if (needs_switch) 9288 io_rsrc_node_switch(ctx, ctx->buf_data); 9289 return done ? done : err; 9290} 9291 9292static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg) 9293{ 9294 __s32 __user *fds = arg; 9295 int fd; 9296 9297 if (ctx->cq_ev_fd) 9298 return -EBUSY; 9299 9300 if (copy_from_user(&fd, fds, sizeof(*fds))) 9301 return -EFAULT; 9302 9303 ctx->cq_ev_fd = eventfd_ctx_fdget(fd); 9304 if (IS_ERR(ctx->cq_ev_fd)) { 9305 int ret = PTR_ERR(ctx->cq_ev_fd); 9306 9307 ctx->cq_ev_fd = NULL; 9308 return ret; 9309 } 9310 9311 return 0; 9312} 9313 9314static int io_eventfd_unregister(struct io_ring_ctx *ctx) 9315{ 9316 if (ctx->cq_ev_fd) { 9317 eventfd_ctx_put(ctx->cq_ev_fd); 9318 ctx->cq_ev_fd = NULL; 9319 return 0; 9320 } 9321 9322 return -ENXIO; 9323} 9324 9325static void io_destroy_buffers(struct io_ring_ctx *ctx) 9326{ 9327 struct io_buffer *buf; 9328 unsigned long index; 9329 9330 xa_for_each(&ctx->io_buffers, index, buf) 9331 __io_remove_buffers(ctx, buf, index, -1U); 9332} 9333 9334static void io_req_cache_free(struct list_head *list) 9335{ 9336 struct io_kiocb *req, *nxt; 9337 9338 list_for_each_entry_safe(req, nxt, list, inflight_entry) { 9339 list_del(&req->inflight_entry); 9340 kmem_cache_free(req_cachep, req); 9341 } 9342} 9343 9344static void io_req_caches_free(struct io_ring_ctx *ctx) 9345{ 9346 struct io_submit_state *state = &ctx->submit_state; 9347 9348 mutex_lock(&ctx->uring_lock); 9349 9350 if (state->free_reqs) { 9351 kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); 9352 state->free_reqs = 0; 9353 } 9354 9355 io_flush_cached_locked_reqs(ctx, state); 9356 io_req_cache_free(&state->free_list); 9357 mutex_unlock(&ctx->uring_lock); 9358} 9359 9360static void io_wait_rsrc_data(struct io_rsrc_data *data) 9361{ 9362 if (data && !atomic_dec_and_test(&data->refs)) 9363 wait_for_completion(&data->done); 9364} 9365 9366static void io_ring_ctx_free(struct io_ring_ctx *ctx) 9367{ 9368 io_sq_thread_finish(ctx); 9369 9370 /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ 9371 io_wait_rsrc_data(ctx->buf_data); 9372 io_wait_rsrc_data(ctx->file_data); 9373 9374 mutex_lock(&ctx->uring_lock); 9375 if (ctx->buf_data) 9376 __io_sqe_buffers_unregister(ctx); 9377 if (ctx->file_data) 9378 __io_sqe_files_unregister(ctx); 9379 if (ctx->rings) 9380 __io_cqring_overflow_flush(ctx, true); 9381 mutex_unlock(&ctx->uring_lock); 9382 io_eventfd_unregister(ctx); 9383 
io_destroy_buffers(ctx); 9384 if (ctx->sq_creds) 9385 put_cred(ctx->sq_creds); 9386 9387 /* there are no registered resources left, nobody uses it */ 9388 if (ctx->rsrc_node) 9389 io_rsrc_node_destroy(ctx->rsrc_node); 9390 if (ctx->rsrc_backup_node) 9391 io_rsrc_node_destroy(ctx->rsrc_backup_node); 9392 flush_delayed_work(&ctx->rsrc_put_work); 9393 9394 WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); 9395 WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); 9396 9397#if defined(CONFIG_UNIX) 9398 if (ctx->ring_sock) { 9399 ctx->ring_sock->file = NULL; /* so that iput() is called */ 9400 sock_release(ctx->ring_sock); 9401 } 9402#endif 9403 WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); 9404 9405 if (ctx->mm_account) { 9406 mmdrop(ctx->mm_account); 9407 ctx->mm_account = NULL; 9408 } 9409 9410 io_mem_free(ctx->rings); 9411 io_mem_free(ctx->sq_sqes); 9412 9413 percpu_ref_exit(&ctx->refs); 9414 free_uid(ctx->user); 9415 io_req_caches_free(ctx); 9416 if (ctx->hash_map) 9417 io_wq_put_hash(ctx->hash_map); 9418 kfree(ctx->cancel_hash); 9419 kfree(ctx->dummy_ubuf); 9420 kfree(ctx); 9421} 9422 9423static __poll_t io_uring_poll(struct file *file, poll_table *wait) 9424{ 9425 struct io_ring_ctx *ctx = file->private_data; 9426 __poll_t mask = 0; 9427 9428 poll_wait(file, &ctx->poll_wait, wait); 9429 /* 9430 * synchronizes with barrier from wq_has_sleeper call in 9431 * io_commit_cqring 9432 */ 9433 smp_rmb(); 9434 if (!io_sqring_full(ctx)) 9435 mask |= EPOLLOUT | EPOLLWRNORM; 9436 9437 /* 9438 * Don't flush cqring overflow list here, just do a simple check. 9439 * Otherwise there could possible be ABBA deadlock: 9440 * CPU0 CPU1 9441 * ---- ---- 9442 * lock(&ctx->uring_lock); 9443 * lock(&ep->mtx); 9444 * lock(&ctx->uring_lock); 9445 * lock(&ep->mtx); 9446 * 9447 * Users may get EPOLLIN meanwhile seeing nothing in cqring, this 9448 * pushs them to do the flush. 9449 */ 9450 if (io_cqring_events(ctx) || test_bit(0, &ctx->check_cq_overflow)) 9451 mask |= EPOLLIN | EPOLLRDNORM; 9452 9453 return mask; 9454} 9455 9456static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) 9457{ 9458 const struct cred *creds; 9459 9460 creds = xa_erase(&ctx->personalities, id); 9461 if (creds) { 9462 put_cred(creds); 9463 return 0; 9464 } 9465 9466 return -EINVAL; 9467} 9468 9469struct io_tctx_exit { 9470 struct callback_head task_work; 9471 struct completion completion; 9472 struct io_ring_ctx *ctx; 9473}; 9474 9475static void io_tctx_exit_cb(struct callback_head *cb) 9476{ 9477 struct io_uring_task *tctx = current->io_uring; 9478 struct io_tctx_exit *work; 9479 9480 work = container_of(cb, struct io_tctx_exit, task_work); 9481 /* 9482 * When @in_idle, we're in cancellation and it's racy to remove the 9483 * node. It'll be removed by the end of cancellation, just ignore it. 9484 * tctx can be NULL if the queueing of this task_work raced with 9485 * work cancelation off the exec path. 
9486 */ 9487 if (tctx && !atomic_read(&tctx->in_idle)) 9488 io_uring_del_tctx_node((unsigned long)work->ctx); 9489 complete(&work->completion); 9490} 9491 9492static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 9493{ 9494 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9495 9496 return req->ctx == data; 9497} 9498 9499static void io_ring_exit_work(struct work_struct *work) 9500{ 9501 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); 9502 unsigned long timeout = jiffies + HZ * 60 * 5; 9503 unsigned long interval = HZ / 20; 9504 struct io_tctx_exit exit; 9505 struct io_tctx_node *node; 9506 int ret; 9507 9508 /* 9509 * If we're doing polled IO and end up having requests being 9510 * submitted async (out-of-line), then completions can come in while 9511 * we're waiting for refs to drop. We need to reap these manually, 9512 * as nobody else will be looking for them. 9513 */ 9514 do { 9515 io_uring_try_cancel_requests(ctx, NULL, true); 9516 if (ctx->sq_data) { 9517 struct io_sq_data *sqd = ctx->sq_data; 9518 struct task_struct *tsk; 9519 9520 io_sq_thread_park(sqd); 9521 tsk = sqd->thread; 9522 if (tsk && tsk->io_uring && tsk->io_uring->io_wq) 9523 io_wq_cancel_cb(tsk->io_uring->io_wq, 9524 io_cancel_ctx_cb, ctx, true); 9525 io_sq_thread_unpark(sqd); 9526 } 9527 9528 if (WARN_ON_ONCE(time_after(jiffies, timeout))) { 9529 /* there is little hope left, don't run it too often */ 9530 interval = HZ * 60; 9531 } 9532 /* 9533 * This is really an uninterruptible wait, as it has to be 9534 * complete. But it's also run from a kworker, which doesn't 9535 * take signals, so it's fine to make it interruptible. This 9536 * avoids scenarios where we knowingly can wait much longer 9537 * on completions, for example if someone does a SIGSTOP on 9538 * a task that needs to finish task_work to make this loop 9539 * complete. That's a synthetic situation that should not 9540 * cause a stuck task backtrace, and hence a potential panic 9541 * on stuck tasks if that is enabled. 9542 */ 9543 } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); 9544 9545 init_completion(&exit.completion); 9546 init_task_work(&exit.task_work, io_tctx_exit_cb); 9547 exit.ctx = ctx; 9548 /* 9549 * Some may use context even when all refs and requests have been put, 9550 * and they are free to do so while still holding uring_lock or 9551 * completion_lock, see io_req_task_submit(). Apart from other work, 9552 * this lock/unlock section also waits them to finish. 9553 */ 9554 mutex_lock(&ctx->uring_lock); 9555 while (!list_empty(&ctx->tctx_list)) { 9556 WARN_ON_ONCE(time_after(jiffies, timeout)); 9557 9558 node = list_first_entry(&ctx->tctx_list, struct io_tctx_node, 9559 ctx_node); 9560 /* don't spin on a single task if cancellation failed */ 9561 list_rotate_left(&ctx->tctx_list); 9562 ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); 9563 if (WARN_ON_ONCE(ret)) 9564 continue; 9565 wake_up_process(node->task); 9566 9567 mutex_unlock(&ctx->uring_lock); 9568 /* 9569 * See comment above for 9570 * wait_for_completion_interruptible_timeout() on why this 9571 * wait is marked as interruptible. 
9572 */ 9573 wait_for_completion_interruptible(&exit.completion); 9574 mutex_lock(&ctx->uring_lock); 9575 } 9576 mutex_unlock(&ctx->uring_lock); 9577 spin_lock(&ctx->completion_lock); 9578 spin_unlock(&ctx->completion_lock); 9579 9580 io_ring_ctx_free(ctx); 9581} 9582 9583/* Returns true if we found and killed one or more timeouts */ 9584static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, 9585 bool cancel_all) 9586{ 9587 struct io_kiocb *req, *tmp; 9588 int canceled = 0; 9589 9590 spin_lock(&ctx->completion_lock); 9591 spin_lock_irq(&ctx->timeout_lock); 9592 list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { 9593 if (io_match_task(req, tsk, cancel_all)) { 9594 io_kill_timeout(req, -ECANCELED); 9595 canceled++; 9596 } 9597 } 9598 spin_unlock_irq(&ctx->timeout_lock); 9599 if (canceled != 0) 9600 io_commit_cqring(ctx); 9601 spin_unlock(&ctx->completion_lock); 9602 if (canceled != 0) 9603 io_cqring_ev_posted(ctx); 9604 return canceled != 0; 9605} 9606 9607static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) 9608{ 9609 unsigned long index; 9610 struct creds *creds; 9611 9612 mutex_lock(&ctx->uring_lock); 9613 percpu_ref_kill(&ctx->refs); 9614 if (ctx->rings) 9615 __io_cqring_overflow_flush(ctx, true); 9616 xa_for_each(&ctx->personalities, index, creds) 9617 io_unregister_personality(ctx, index); 9618 mutex_unlock(&ctx->uring_lock); 9619 9620 io_kill_timeouts(ctx, NULL, true); 9621 io_poll_remove_all(ctx, NULL, true); 9622 9623 /* if we failed setting up the ctx, we might not have any rings */ 9624 io_iopoll_try_reap_events(ctx); 9625 9626 /* drop cached put refs after potentially doing completions */ 9627 if (current->io_uring) 9628 io_uring_drop_tctx_refs(current); 9629 9630 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 9631 /* 9632 * Use system_unbound_wq to avoid spawning tons of event kworkers 9633 * if we're exiting a ton of rings at the same time. It just adds 9634 * noise and overhead, there's no discernable change in runtime 9635 * over using system_wq. 
9636 */ 9637 queue_work(system_unbound_wq, &ctx->exit_work); 9638} 9639 9640static int io_uring_release(struct inode *inode, struct file *file) 9641{ 9642 struct io_ring_ctx *ctx = file->private_data; 9643 9644 file->private_data = NULL; 9645 io_ring_ctx_wait_and_kill(ctx); 9646 return 0; 9647} 9648 9649struct io_task_cancel { 9650 struct task_struct *task; 9651 bool all; 9652}; 9653 9654static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 9655{ 9656 struct io_kiocb *req = container_of(work, struct io_kiocb, work); 9657 struct io_task_cancel *cancel = data; 9658 9659 return io_match_task_safe(req, cancel->task, cancel->all); 9660} 9661 9662static bool io_cancel_defer_files(struct io_ring_ctx *ctx, 9663 struct task_struct *task, bool cancel_all) 9664{ 9665 struct io_defer_entry *de; 9666 LIST_HEAD(list); 9667 9668 spin_lock(&ctx->completion_lock); 9669 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 9670 if (io_match_task_safe(de->req, task, cancel_all)) { 9671 list_cut_position(&list, &ctx->defer_list, &de->list); 9672 break; 9673 } 9674 } 9675 spin_unlock(&ctx->completion_lock); 9676 if (list_empty(&list)) 9677 return false; 9678 9679 while (!list_empty(&list)) { 9680 de = list_first_entry(&list, struct io_defer_entry, list); 9681 list_del_init(&de->list); 9682 io_req_complete_failed(de->req, -ECANCELED); 9683 kfree(de); 9684 } 9685 return true; 9686} 9687 9688static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 9689{ 9690 struct io_tctx_node *node; 9691 enum io_wq_cancel cret; 9692 bool ret = false; 9693 9694 mutex_lock(&ctx->uring_lock); 9695 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 9696 struct io_uring_task *tctx = node->task->io_uring; 9697 9698 /* 9699 * io_wq will stay alive while we hold uring_lock, because it's 9700 * killed after ctx nodes, which requires to take the lock. 9701 */ 9702 if (!tctx || !tctx->io_wq) 9703 continue; 9704 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 9705 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9706 } 9707 mutex_unlock(&ctx->uring_lock); 9708 9709 return ret; 9710} 9711 9712static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 9713 struct task_struct *task, 9714 bool cancel_all) 9715{ 9716 struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; 9717 struct io_uring_task *tctx = task ? task->io_uring : NULL; 9718 9719 while (1) { 9720 enum io_wq_cancel cret; 9721 bool ret = false; 9722 9723 if (!task) { 9724 ret |= io_uring_try_cancel_iowq(ctx); 9725 } else if (tctx && tctx->io_wq) { 9726 /* 9727 * Cancels requests of all rings, not only @ctx, but 9728 * it's fine as the task is in exit/exec. 
9729 */ 9730 cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 9731 &cancel, true); 9732 ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 9733 } 9734 9735 /* SQPOLL thread does its own polling */ 9736 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 9737 (ctx->sq_data && ctx->sq_data->thread == current)) { 9738 while (!list_empty_careful(&ctx->iopoll_list)) { 9739 io_iopoll_try_reap_events(ctx); 9740 ret = true; 9741 cond_resched(); 9742 } 9743 } 9744 9745 ret |= io_cancel_defer_files(ctx, task, cancel_all); 9746 ret |= io_poll_remove_all(ctx, task, cancel_all); 9747 ret |= io_kill_timeouts(ctx, task, cancel_all); 9748 if (task) 9749 ret |= io_run_task_work(); 9750 if (!ret) 9751 break; 9752 cond_resched(); 9753 } 9754} 9755 9756static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9757{ 9758 struct io_uring_task *tctx = current->io_uring; 9759 struct io_tctx_node *node; 9760 int ret; 9761 9762 if (unlikely(!tctx)) { 9763 ret = io_uring_alloc_task_context(current, ctx); 9764 if (unlikely(ret)) 9765 return ret; 9766 9767 tctx = current->io_uring; 9768 if (ctx->iowq_limits_set) { 9769 unsigned int limits[2] = { ctx->iowq_limits[0], 9770 ctx->iowq_limits[1], }; 9771 9772 ret = io_wq_max_workers(tctx->io_wq, limits); 9773 if (ret) 9774 return ret; 9775 } 9776 } 9777 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 9778 node = kmalloc(sizeof(*node), GFP_KERNEL); 9779 if (!node) 9780 return -ENOMEM; 9781 node->ctx = ctx; 9782 node->task = current; 9783 9784 ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 9785 node, GFP_KERNEL)); 9786 if (ret) { 9787 kfree(node); 9788 return ret; 9789 } 9790 9791 mutex_lock(&ctx->uring_lock); 9792 list_add(&node->ctx_node, &ctx->tctx_list); 9793 mutex_unlock(&ctx->uring_lock); 9794 } 9795 tctx->last = ctx; 9796 return 0; 9797} 9798 9799/* 9800 * Note that this task has used io_uring. We use it for cancelation purposes. 9801 */ 9802static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) 9803{ 9804 struct io_uring_task *tctx = current->io_uring; 9805 9806 if (likely(tctx && tctx->last == ctx)) 9807 return 0; 9808 return __io_uring_add_tctx_node(ctx); 9809} 9810 9811/* 9812 * Remove this io_uring_file -> task mapping. 9813 */ 9814static void io_uring_del_tctx_node(unsigned long index) 9815{ 9816 struct io_uring_task *tctx = current->io_uring; 9817 struct io_tctx_node *node; 9818 9819 if (!tctx) 9820 return; 9821 node = xa_erase(&tctx->xa, index); 9822 if (!node) 9823 return; 9824 9825 WARN_ON_ONCE(current != node->task); 9826 WARN_ON_ONCE(list_empty(&node->ctx_node)); 9827 9828 mutex_lock(&node->ctx->uring_lock); 9829 list_del(&node->ctx_node); 9830 mutex_unlock(&node->ctx->uring_lock); 9831 9832 if (tctx->last == node->ctx) 9833 tctx->last = NULL; 9834 kfree(node); 9835} 9836 9837static void io_uring_clean_tctx(struct io_uring_task *tctx) 9838{ 9839 struct io_wq *wq = tctx->io_wq; 9840 struct io_tctx_node *node; 9841 unsigned long index; 9842 9843 xa_for_each(&tctx->xa, index, node) { 9844 io_uring_del_tctx_node(index); 9845 cond_resched(); 9846 } 9847 if (wq) { 9848 /* 9849 * Must be after io_uring_del_task_file() (removes nodes under 9850 * uring_lock) to avoid race with io_uring_try_cancel_iowq(). 
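 * io_uring_try_cancel_iowq() walks ctx->tctx_list under uring_lock and
 * dereferences each node's tctx->io_wq, so the wq must only be torn
 * down once all of this task's nodes have been unlinked above.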
9851 */ 9852 io_wq_put_and_exit(wq); 9853 tctx->io_wq = NULL; 9854 } 9855} 9856 9857static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 9858{ 9859 if (tracked) 9860 return atomic_read(&tctx->inflight_tracked); 9861 return percpu_counter_sum(&tctx->inflight); 9862} 9863 9864/* 9865 * Find any io_uring ctx that this task has registered or done IO on, and cancel 9866 * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 9867 */ 9868static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 9869{ 9870 struct io_uring_task *tctx = current->io_uring; 9871 struct io_ring_ctx *ctx; 9872 s64 inflight; 9873 DEFINE_WAIT(wait); 9874 9875 WARN_ON_ONCE(sqd && sqd->thread != current); 9876 9877 if (!current->io_uring) 9878 return; 9879 if (tctx->io_wq) 9880 io_wq_exit_start(tctx->io_wq); 9881 9882 atomic_inc(&tctx->in_idle); 9883 do { 9884 io_uring_drop_tctx_refs(current); 9885 /* read completions before cancelations */ 9886 inflight = tctx_inflight(tctx, !cancel_all); 9887 if (!inflight) 9888 break; 9889 9890 if (!sqd) { 9891 struct io_tctx_node *node; 9892 unsigned long index; 9893 9894 xa_for_each(&tctx->xa, index, node) { 9895 /* sqpoll task will cancel all its requests */ 9896 if (node->ctx->sq_data) 9897 continue; 9898 io_uring_try_cancel_requests(node->ctx, current, 9899 cancel_all); 9900 } 9901 } else { 9902 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 9903 io_uring_try_cancel_requests(ctx, current, 9904 cancel_all); 9905 } 9906 9907 prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 9908 io_run_task_work(); 9909 io_uring_drop_tctx_refs(current); 9910 9911 /* 9912 * If we've seen completions, retry without waiting. This 9913 * avoids a race where a completion comes in before we did 9914 * prepare_to_wait(). 9915 */ 9916 if (inflight == tctx_inflight(tctx, !cancel_all)) 9917 schedule(); 9918 finish_wait(&tctx->wait, &wait); 9919 } while (1); 9920 9921 io_uring_clean_tctx(tctx); 9922 if (cancel_all) { 9923 /* 9924 * We shouldn't run task_works after cancel, so just leave 9925 * ->in_idle set for normal exit. 
9926 */ 9927 atomic_dec(&tctx->in_idle); 9928 /* for exec all current's requests should be gone, kill tctx */ 9929 __io_uring_free(current); 9930 } 9931} 9932 9933void __io_uring_cancel(bool cancel_all) 9934{ 9935 io_uring_cancel_generic(cancel_all, NULL); 9936} 9937 9938static void *io_uring_validate_mmap_request(struct file *file, 9939 loff_t pgoff, size_t sz) 9940{ 9941 struct io_ring_ctx *ctx = file->private_data; 9942 loff_t offset = pgoff << PAGE_SHIFT; 9943 struct page *page; 9944 void *ptr; 9945 9946 switch (offset) { 9947 case IORING_OFF_SQ_RING: 9948 case IORING_OFF_CQ_RING: 9949 ptr = ctx->rings; 9950 break; 9951 case IORING_OFF_SQES: 9952 ptr = ctx->sq_sqes; 9953 break; 9954 default: 9955 return ERR_PTR(-EINVAL); 9956 } 9957 9958 page = virt_to_head_page(ptr); 9959 if (sz > page_size(page)) 9960 return ERR_PTR(-EINVAL); 9961 9962 return ptr; 9963} 9964 9965#ifdef CONFIG_MMU 9966 9967static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9968{ 9969 size_t sz = vma->vm_end - vma->vm_start; 9970 unsigned long pfn; 9971 void *ptr; 9972 9973 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 9974 if (IS_ERR(ptr)) 9975 return PTR_ERR(ptr); 9976 9977 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 9978 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 9979} 9980 9981#else /* !CONFIG_MMU */ 9982 9983static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 9984{ 9985 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL; 9986} 9987 9988static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) 9989{ 9990 return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; 9991} 9992 9993static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, 9994 unsigned long addr, unsigned long len, 9995 unsigned long pgoff, unsigned long flags) 9996{ 9997 void *ptr; 9998 9999 ptr = io_uring_validate_mmap_request(file, pgoff, len); 10000 if (IS_ERR(ptr)) 10001 return PTR_ERR(ptr); 10002 10003 return (unsigned long) ptr; 10004} 10005 10006#endif /* !CONFIG_MMU */ 10007 10008static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) 10009{ 10010 DEFINE_WAIT(wait); 10011 10012 do { 10013 if (!io_sqring_full(ctx)) 10014 break; 10015 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 10016 10017 if (!io_sqring_full(ctx)) 10018 break; 10019 schedule(); 10020 } while (!signal_pending(current)); 10021 10022 finish_wait(&ctx->sqo_sq_wait, &wait); 10023 return 0; 10024} 10025 10026static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, 10027 struct __kernel_timespec __user **ts, 10028 const sigset_t __user **sig) 10029{ 10030 struct io_uring_getevents_arg arg; 10031 10032 /* 10033 * If EXT_ARG isn't set, then we have no timespec and the argp pointer 10034 * is just a pointer to the sigset_t. 10035 */ 10036 if (!(flags & IORING_ENTER_EXT_ARG)) { 10037 *sig = (const sigset_t __user *) argp; 10038 *ts = NULL; 10039 return 0; 10040 } 10041 10042 /* 10043 * EXT_ARG is set - ensure we agree on the size of it and copy in our 10044 * timespec and sigset_t pointers if good. 
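 *
 * A rough sketch of the expected userspace side (illustrative only; the
 * exact wrapper is up to the application or liburing):
 *
 *	struct io_uring_getevents_arg arg = {
 *		.sigmask	= (__u64)(unsigned long)&sigmask,
 *		.sigmask_sz	= _NSIG / 8,
 *		.ts		= (__u64)(unsigned long)&ts,
 *	};
 *
 *	io_uring_enter(ring_fd, 0, 1,
 *		       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG,
 *		       &arg, sizeof(arg));
 *
 * where the argsz passed to the syscall must equal sizeof(arg) and
 * arg.pad must be zero, as checked below.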
10045 */ 10046 if (*argsz != sizeof(arg)) 10047 return -EINVAL; 10048 if (copy_from_user(&arg, argp, sizeof(arg))) 10049 return -EFAULT; 10050 if (arg.pad) 10051 return -EINVAL; 10052 *sig = u64_to_user_ptr(arg.sigmask); 10053 *argsz = arg.sigmask_sz; 10054 *ts = u64_to_user_ptr(arg.ts); 10055 return 0; 10056} 10057 10058SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 10059 u32, min_complete, u32, flags, const void __user *, argp, 10060 size_t, argsz) 10061{ 10062 struct io_ring_ctx *ctx; 10063 int submitted = 0; 10064 struct fd f; 10065 long ret; 10066 10067 io_run_task_work(); 10068 10069 if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 10070 IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG))) 10071 return -EINVAL; 10072 10073 f = fdget(fd); 10074 if (unlikely(!f.file)) 10075 return -EBADF; 10076 10077 ret = -EOPNOTSUPP; 10078 if (unlikely(f.file->f_op != &io_uring_fops)) 10079 goto out_fput; 10080 10081 ret = -ENXIO; 10082 ctx = f.file->private_data; 10083 if (unlikely(!percpu_ref_tryget(&ctx->refs))) 10084 goto out_fput; 10085 10086 ret = -EBADFD; 10087 if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 10088 goto out; 10089 10090 /* 10091 * For SQ polling, the thread will do all submissions and completions. 10092 * Just return the requested submit count, and wake the thread if 10093 * we were asked to. 10094 */ 10095 ret = 0; 10096 if (ctx->flags & IORING_SETUP_SQPOLL) { 10097 io_cqring_overflow_flush(ctx); 10098 10099 if (unlikely(ctx->sq_data->thread == NULL)) { 10100 ret = -EOWNERDEAD; 10101 goto out; 10102 } 10103 if (flags & IORING_ENTER_SQ_WAKEUP) 10104 wake_up(&ctx->sq_data->wait); 10105 if (flags & IORING_ENTER_SQ_WAIT) { 10106 ret = io_sqpoll_wait_sq(ctx); 10107 if (ret) 10108 goto out; 10109 } 10110 submitted = to_submit; 10111 } else if (to_submit) { 10112 ret = io_uring_add_tctx_node(ctx); 10113 if (unlikely(ret)) 10114 goto out; 10115 mutex_lock(&ctx->uring_lock); 10116 submitted = io_submit_sqes(ctx, to_submit); 10117 mutex_unlock(&ctx->uring_lock); 10118 10119 if (submitted != to_submit) 10120 goto out; 10121 } 10122 if (flags & IORING_ENTER_GETEVENTS) { 10123 const sigset_t __user *sig; 10124 struct __kernel_timespec __user *ts; 10125 10126 ret = io_get_ext_arg(flags, argp, &argsz, &ts, &sig); 10127 if (unlikely(ret)) 10128 goto out; 10129 10130 min_complete = min(min_complete, ctx->cq_entries); 10131 10132 /* 10133 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user 10134 * space applications don't need to do io completion events 10135 * polling again, they can rely on io_sq_thread to do polling 10136 * work, which can reduce cpu usage and uring_lock contention. 10137 */ 10138 if (ctx->flags & IORING_SETUP_IOPOLL && 10139 !(ctx->flags & IORING_SETUP_SQPOLL)) { 10140 ret = io_iopoll_check(ctx, min_complete); 10141 } else { 10142 ret = io_cqring_wait(ctx, min_complete, sig, argsz, ts); 10143 } 10144 } 10145 10146out: 10147 percpu_ref_put(&ctx->refs); 10148out_fput: 10149 fdput(f); 10150 return submitted ? 
submitted : ret; 10151} 10152 10153#ifdef CONFIG_PROC_FS 10154static int io_uring_show_cred(struct seq_file *m, unsigned int id, 10155 const struct cred *cred) 10156{ 10157 struct user_namespace *uns = seq_user_ns(m); 10158 struct group_info *gi; 10159 kernel_cap_t cap; 10160 unsigned __capi; 10161 int g; 10162 10163 seq_printf(m, "%5d\n", id); 10164 seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); 10165 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); 10166 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); 10167 seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); 10168 seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); 10169 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); 10170 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); 10171 seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); 10172 seq_puts(m, "\n\tGroups:\t"); 10173 gi = cred->group_info; 10174 for (g = 0; g < gi->ngroups; g++) { 10175 seq_put_decimal_ull(m, g ? " " : "", 10176 from_kgid_munged(uns, gi->gid[g])); 10177 } 10178 seq_puts(m, "\n\tCapEff:\t"); 10179 cap = cred->cap_effective; 10180 CAP_FOR_EACH_U32(__capi) 10181 seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); 10182 seq_putc(m, '\n'); 10183 return 0; 10184} 10185 10186static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) 10187{ 10188 int sq_pid = -1, sq_cpu = -1; 10189 bool has_lock; 10190 int i; 10191 10192 /* 10193 * Avoid ABBA deadlock between the seq lock and the io_uring mutex, 10194 * since fdinfo case grabs it in the opposite direction of normal use 10195 * cases. If we fail to get the lock, we just don't iterate any 10196 * structures that could be going away outside the io_uring mutex. 
10197 */ 10198 has_lock = mutex_trylock(&ctx->uring_lock); 10199 10200 if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { 10201 struct io_sq_data *sq = ctx->sq_data; 10202 10203 if (mutex_trylock(&sq->lock)) { 10204 if (sq->thread) { 10205 sq_pid = task_pid_nr(sq->thread); 10206 sq_cpu = task_cpu(sq->thread); 10207 } 10208 mutex_unlock(&sq->lock); 10209 } 10210 } 10211 10212 seq_printf(m, "SqThread:\t%d\n", sq_pid); 10213 seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); 10214 seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); 10215 for (i = 0; has_lock && i < ctx->nr_user_files; i++) { 10216 struct file *f = io_file_from_index(ctx, i); 10217 10218 if (f) 10219 seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); 10220 else 10221 seq_printf(m, "%5u: <none>\n", i); 10222 } 10223 seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); 10224 for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { 10225 struct io_mapped_ubuf *buf = ctx->user_bufs[i]; 10226 unsigned int len = buf->ubuf_end - buf->ubuf; 10227 10228 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); 10229 } 10230 if (has_lock && !xa_empty(&ctx->personalities)) { 10231 unsigned long index; 10232 const struct cred *cred; 10233 10234 seq_printf(m, "Personalities:\n"); 10235 xa_for_each(&ctx->personalities, index, cred) 10236 io_uring_show_cred(m, index, cred); 10237 } 10238 seq_printf(m, "PollList:\n"); 10239 spin_lock(&ctx->completion_lock); 10240 for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { 10241 struct hlist_head *list = &ctx->cancel_hash[i]; 10242 struct io_kiocb *req; 10243 10244 hlist_for_each_entry(req, list, hash_node) 10245 seq_printf(m, " op=%d, task_works=%d\n", req->opcode, 10246 req->task->task_works != NULL); 10247 } 10248 spin_unlock(&ctx->completion_lock); 10249 if (has_lock) 10250 mutex_unlock(&ctx->uring_lock); 10251} 10252 10253static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) 10254{ 10255 struct io_ring_ctx *ctx = f->private_data; 10256 10257 if (percpu_ref_tryget(&ctx->refs)) { 10258 __io_uring_show_fdinfo(ctx, m); 10259 percpu_ref_put(&ctx->refs); 10260 } 10261} 10262#endif 10263 10264static const struct file_operations io_uring_fops = { 10265 .release = io_uring_release, 10266 .mmap = io_uring_mmap, 10267#ifndef CONFIG_MMU 10268 .get_unmapped_area = io_uring_nommu_get_unmapped_area, 10269 .mmap_capabilities = io_uring_nommu_mmap_capabilities, 10270#endif 10271 .poll = io_uring_poll, 10272#ifdef CONFIG_PROC_FS 10273 .show_fdinfo = io_uring_show_fdinfo, 10274#endif 10275}; 10276 10277static int io_allocate_scq_urings(struct io_ring_ctx *ctx, 10278 struct io_uring_params *p) 10279{ 10280 struct io_rings *rings; 10281 size_t size, sq_array_offset; 10282 10283 /* make sure these are sane, as we already accounted them */ 10284 ctx->sq_entries = p->sq_entries; 10285 ctx->cq_entries = p->cq_entries; 10286 10287 size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); 10288 if (size == SIZE_MAX) 10289 return -EOVERFLOW; 10290 10291 rings = io_mem_alloc(size); 10292 if (!rings) 10293 return -ENOMEM; 10294 10295 ctx->rings = rings; 10296 ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 10297 rings->sq_ring_mask = p->sq_entries - 1; 10298 rings->cq_ring_mask = p->cq_entries - 1; 10299 rings->sq_ring_entries = p->sq_entries; 10300 rings->cq_ring_entries = p->cq_entries; 10301 10302 size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 10303 if (size == SIZE_MAX) { 10304 io_mem_free(ctx->rings); 10305 ctx->rings = NULL; 10306 return -EOVERFLOW; 10307 } 10308 10309 
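	/*
	 * The SQ/CQ ring headers, the CQE array and the SQ index array all
	 * live in the single "rings" allocation above (hence
	 * IORING_FEAT_SINGLE_MMAP); the SQE array is a separate allocation,
	 * mapped by the application at IORING_OFF_SQES, see
	 * io_uring_validate_mmap_request().
	 */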
ctx->sq_sqes = io_mem_alloc(size); 10310 if (!ctx->sq_sqes) { 10311 io_mem_free(ctx->rings); 10312 ctx->rings = NULL; 10313 return -ENOMEM; 10314 } 10315 10316 return 0; 10317} 10318 10319static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) 10320{ 10321 int ret, fd; 10322 10323 fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); 10324 if (fd < 0) 10325 return fd; 10326 10327 ret = io_uring_add_tctx_node(ctx); 10328 if (ret) { 10329 put_unused_fd(fd); 10330 return ret; 10331 } 10332 fd_install(fd, file); 10333 return fd; 10334} 10335 10336/* 10337 * Allocate an anonymous fd, this is what constitutes the application 10338 * visible backing of an io_uring instance. The application mmaps this 10339 * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, 10340 * we have to tie this fd to a socket for file garbage collection purposes. 10341 */ 10342static struct file *io_uring_get_file(struct io_ring_ctx *ctx) 10343{ 10344 struct file *file; 10345#if defined(CONFIG_UNIX) 10346 int ret; 10347 10348 ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, 10349 &ctx->ring_sock); 10350 if (ret) 10351 return ERR_PTR(ret); 10352#endif 10353 10354 file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, 10355 O_RDWR | O_CLOEXEC); 10356#if defined(CONFIG_UNIX) 10357 if (IS_ERR(file)) { 10358 sock_release(ctx->ring_sock); 10359 ctx->ring_sock = NULL; 10360 } else { 10361 ctx->ring_sock->file = file; 10362 } 10363#endif 10364 return file; 10365} 10366 10367static int io_uring_create(unsigned entries, struct io_uring_params *p, 10368 struct io_uring_params __user *params) 10369{ 10370 struct io_ring_ctx *ctx; 10371 struct file *file; 10372 int ret; 10373 10374 if (!entries) 10375 return -EINVAL; 10376 if (entries > IORING_MAX_ENTRIES) { 10377 if (!(p->flags & IORING_SETUP_CLAMP)) 10378 return -EINVAL; 10379 entries = IORING_MAX_ENTRIES; 10380 } 10381 10382 /* 10383 * Use twice as many entries for the CQ ring. It's possible for the 10384 * application to drive a higher depth than the size of the SQ ring, 10385 * since the sqes are only used at submission time. This allows for 10386 * some flexibility in overcommitting a bit. If the application has 10387 * set IORING_SETUP_CQSIZE, it will have passed in the desired number 10388 * of CQ ring entries manually. 10389 */ 10390 p->sq_entries = roundup_pow_of_two(entries); 10391 if (p->flags & IORING_SETUP_CQSIZE) { 10392 /* 10393 * If IORING_SETUP_CQSIZE is set, we do the same roundup 10394 * to a power-of-two, if it isn't already. We do NOT impose 10395 * any cq vs sq ring sizing. 10396 */ 10397 if (!p->cq_entries) 10398 return -EINVAL; 10399 if (p->cq_entries > IORING_MAX_CQ_ENTRIES) { 10400 if (!(p->flags & IORING_SETUP_CLAMP)) 10401 return -EINVAL; 10402 p->cq_entries = IORING_MAX_CQ_ENTRIES; 10403 } 10404 p->cq_entries = roundup_pow_of_two(p->cq_entries); 10405 if (p->cq_entries < p->sq_entries) 10406 return -EINVAL; 10407 } else { 10408 p->cq_entries = 2 * p->sq_entries; 10409 } 10410 10411 ctx = io_ring_ctx_alloc(p); 10412 if (!ctx) 10413 return -ENOMEM; 10414 ctx->compat = in_compat_syscall(); 10415 if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) 10416 ctx->user = get_uid(current_user()); 10417 10418 /* 10419 * This is just grabbed for accounting purposes. When a process exits, 10420 * the mm is exited and dropped before the files, hence we need to hang 10421 * on to this mm purely for the purposes of being able to unaccount 10422 * memory (locked/pinned vm). It's not used for anything else. 
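 * The reference taken here is dropped again (mmdrop) when the ctx is freed.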
10423 */ 10424 mmgrab(current->mm); 10425 ctx->mm_account = current->mm; 10426 10427 ret = io_allocate_scq_urings(ctx, p); 10428 if (ret) 10429 goto err; 10430 10431 ret = io_sq_offload_create(ctx, p); 10432 if (ret) 10433 goto err; 10434 /* always set a rsrc node */ 10435 ret = io_rsrc_node_switch_start(ctx); 10436 if (ret) 10437 goto err; 10438 io_rsrc_node_switch(ctx, NULL); 10439 10440 memset(&p->sq_off, 0, sizeof(p->sq_off)); 10441 p->sq_off.head = offsetof(struct io_rings, sq.head); 10442 p->sq_off.tail = offsetof(struct io_rings, sq.tail); 10443 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); 10444 p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); 10445 p->sq_off.flags = offsetof(struct io_rings, sq_flags); 10446 p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); 10447 p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 10448 10449 memset(&p->cq_off, 0, sizeof(p->cq_off)); 10450 p->cq_off.head = offsetof(struct io_rings, cq.head); 10451 p->cq_off.tail = offsetof(struct io_rings, cq.tail); 10452 p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); 10453 p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); 10454 p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); 10455 p->cq_off.cqes = offsetof(struct io_rings, cqes); 10456 p->cq_off.flags = offsetof(struct io_rings, cq_flags); 10457 10458 p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 10459 IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 10460 IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 10461 IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 10462 IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 10463 IORING_FEAT_RSRC_TAGS; 10464 10465 if (copy_to_user(params, p, sizeof(*p))) { 10466 ret = -EFAULT; 10467 goto err; 10468 } 10469 10470 file = io_uring_get_file(ctx); 10471 if (IS_ERR(file)) { 10472 ret = PTR_ERR(file); 10473 goto err; 10474 } 10475 10476 /* 10477 * Install ring fd as the very last thing, so we don't risk someone 10478 * having closed it before we finish setup 10479 */ 10480 ret = io_uring_install_fd(ctx, file); 10481 if (ret < 0) { 10482 /* fput will clean it up */ 10483 fput(file); 10484 return ret; 10485 } 10486 10487 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 10488 return ret; 10489err: 10490 io_ring_ctx_wait_and_kill(ctx); 10491 return ret; 10492} 10493 10494/* 10495 * Sets up an io_uring context and returns the fd. The application asks for a 10496 * ring size; we return the actual sq/cq ring sizes (among other things) in the 10497 * params structure passed in.
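 *
 * A minimal userspace sketch of the intended call sequence (raw syscalls
 * rather than liburing, 128 entries as an arbitrary example, error handling
 * omitted; needs <sys/syscall.h>, <sys/mman.h> and <linux/io_uring.h>):
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 128, &p);
 *	size_t sq_sz = p.sq_off.array + p.sq_entries * sizeof(__u32);
 *	void *sq_ring = mmap(NULL, sq_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd,
 *			     IORING_OFF_SQ_RING);
 *	struct io_uring_sqe *sqes = mmap(NULL,
 *			     p.sq_entries * sizeof(struct io_uring_sqe),
 *			     PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			     ring_fd, IORING_OFF_SQES);
 *
 * Since IORING_FEAT_SINGLE_MMAP is always reported here, the CQ ring can
 * share the IORING_OFF_SQ_RING mapping when it is sized as the maximum of
 * sq_sz and p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe).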
10498 */ 10499static long io_uring_setup(u32 entries, struct io_uring_params __user *params) 10500{ 10501 struct io_uring_params p; 10502 int i; 10503 10504 if (copy_from_user(&p, params, sizeof(p))) 10505 return -EFAULT; 10506 for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 10507 if (p.resv[i]) 10508 return -EINVAL; 10509 } 10510 10511 if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 10512 IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 10513 IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 10514 IORING_SETUP_R_DISABLED)) 10515 return -EINVAL; 10516 10517 return io_uring_create(entries, &p, params); 10518} 10519 10520SYSCALL_DEFINE2(io_uring_setup, u32, entries, 10521 struct io_uring_params __user *, params) 10522{ 10523 return io_uring_setup(entries, params); 10524} 10525 10526static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 10527{ 10528 struct io_uring_probe *p; 10529 size_t size; 10530 int i, ret; 10531 10532 size = struct_size(p, ops, nr_args); 10533 if (size == SIZE_MAX) 10534 return -EOVERFLOW; 10535 p = kzalloc(size, GFP_KERNEL); 10536 if (!p) 10537 return -ENOMEM; 10538 10539 ret = -EFAULT; 10540 if (copy_from_user(p, arg, size)) 10541 goto out; 10542 ret = -EINVAL; 10543 if (memchr_inv(p, 0, size)) 10544 goto out; 10545 10546 p->last_op = IORING_OP_LAST - 1; 10547 if (nr_args > IORING_OP_LAST) 10548 nr_args = IORING_OP_LAST; 10549 10550 for (i = 0; i < nr_args; i++) { 10551 p->ops[i].op = i; 10552 if (!io_op_defs[i].not_supported) 10553 p->ops[i].flags = IO_URING_OP_SUPPORTED; 10554 } 10555 p->ops_len = i; 10556 10557 ret = 0; 10558 if (copy_to_user(arg, p, size)) 10559 ret = -EFAULT; 10560out: 10561 kfree(p); 10562 return ret; 10563} 10564 10565static int io_register_personality(struct io_ring_ctx *ctx) 10566{ 10567 const struct cred *creds; 10568 u32 id; 10569 int ret; 10570 10571 creds = get_current_cred(); 10572 10573 ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, 10574 XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); 10575 if (ret < 0) { 10576 put_cred(creds); 10577 return ret; 10578 } 10579 return id; 10580} 10581 10582static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, 10583 unsigned int nr_args) 10584{ 10585 struct io_uring_restriction *res; 10586 size_t size; 10587 int i, ret; 10588 10589 /* Restrictions allowed only if rings started disabled */ 10590 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 10591 return -EBADFD; 10592 10593 /* We allow only a single restrictions registration */ 10594 if (ctx->restrictions.registered) 10595 return -EBUSY; 10596 10597 if (!arg || nr_args > IORING_MAX_RESTRICTIONS) 10598 return -EINVAL; 10599 10600 size = array_size(nr_args, sizeof(*res)); 10601 if (size == SIZE_MAX) 10602 return -EOVERFLOW; 10603 10604 res = memdup_user(arg, size); 10605 if (IS_ERR(res)) 10606 return PTR_ERR(res); 10607 10608 ret = 0; 10609 10610 for (i = 0; i < nr_args; i++) { 10611 switch (res[i].opcode) { 10612 case IORING_RESTRICTION_REGISTER_OP: 10613 if (res[i].register_op >= IORING_REGISTER_LAST) { 10614 ret = -EINVAL; 10615 goto out; 10616 } 10617 10618 __set_bit(res[i].register_op, 10619 ctx->restrictions.register_op); 10620 break; 10621 case IORING_RESTRICTION_SQE_OP: 10622 if (res[i].sqe_op >= IORING_OP_LAST) { 10623 ret = -EINVAL; 10624 goto out; 10625 } 10626 10627 __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); 10628 break; 10629 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 10630 ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; 10631 break; 10632 case 
IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 10633 ctx->restrictions.sqe_flags_required = res[i].sqe_flags; 10634 break; 10635 default: 10636 ret = -EINVAL; 10637 goto out; 10638 } 10639 } 10640 10641out: 10642 /* Reset all restrictions if an error happened */ 10643 if (ret != 0) 10644 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 10645 else 10646 ctx->restrictions.registered = true; 10647 10648 kfree(res); 10649 return ret; 10650} 10651 10652static int io_register_enable_rings(struct io_ring_ctx *ctx) 10653{ 10654 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 10655 return -EBADFD; 10656 10657 if (ctx->restrictions.registered) 10658 ctx->restricted = 1; 10659 10660 ctx->flags &= ~IORING_SETUP_R_DISABLED; 10661 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 10662 wake_up(&ctx->sq_data->wait); 10663 return 0; 10664} 10665 10666static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, 10667 struct io_uring_rsrc_update2 *up, 10668 unsigned nr_args) 10669{ 10670 __u32 tmp; 10671 int err; 10672 10673 if (check_add_overflow(up->offset, nr_args, &tmp)) 10674 return -EOVERFLOW; 10675 err = io_rsrc_node_switch_start(ctx); 10676 if (err) 10677 return err; 10678 10679 switch (type) { 10680 case IORING_RSRC_FILE: 10681 return __io_sqe_files_update(ctx, up, nr_args); 10682 case IORING_RSRC_BUFFER: 10683 return __io_sqe_buffers_update(ctx, up, nr_args); 10684 } 10685 return -EINVAL; 10686} 10687 10688static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, 10689 unsigned nr_args) 10690{ 10691 struct io_uring_rsrc_update2 up; 10692 10693 if (!nr_args) 10694 return -EINVAL; 10695 memset(&up, 0, sizeof(up)); 10696 if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) 10697 return -EFAULT; 10698 if (up.resv || up.resv2) 10699 return -EINVAL; 10700 return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); 10701} 10702 10703static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, 10704 unsigned size, unsigned type) 10705{ 10706 struct io_uring_rsrc_update2 up; 10707 10708 if (size != sizeof(up)) 10709 return -EINVAL; 10710 if (copy_from_user(&up, arg, sizeof(up))) 10711 return -EFAULT; 10712 if (!up.nr || up.resv || up.resv2) 10713 return -EINVAL; 10714 return __io_register_rsrc_update(ctx, type, &up, up.nr); 10715} 10716 10717static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, 10718 unsigned int size, unsigned int type) 10719{ 10720 struct io_uring_rsrc_register rr; 10721 10722 /* keep it extendible */ 10723 if (size != sizeof(rr)) 10724 return -EINVAL; 10725 10726 memset(&rr, 0, sizeof(rr)); 10727 if (copy_from_user(&rr, arg, size)) 10728 return -EFAULT; 10729 if (!rr.nr || rr.resv || rr.resv2) 10730 return -EINVAL; 10731 10732 switch (type) { 10733 case IORING_RSRC_FILE: 10734 return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), 10735 rr.nr, u64_to_user_ptr(rr.tags)); 10736 case IORING_RSRC_BUFFER: 10737 return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), 10738 rr.nr, u64_to_user_ptr(rr.tags)); 10739 } 10740 return -EINVAL; 10741} 10742 10743static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, 10744 unsigned len) 10745{ 10746 struct io_uring_task *tctx = current->io_uring; 10747 cpumask_var_t new_mask; 10748 int ret; 10749 10750 if (!tctx || !tctx->io_wq) 10751 return -EINVAL; 10752 10753 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 10754 return -ENOMEM; 10755 10756 cpumask_clear(new_mask); 10757 if (len > cpumask_size()) 10758 len = 
cpumask_size(); 10759 10760#ifdef CONFIG_COMPAT 10761 if (in_compat_syscall()) { 10762 ret = compat_get_bitmap(cpumask_bits(new_mask), 10763 (const compat_ulong_t __user *)arg, 10764 len * 8 /* CHAR_BIT */); 10765 } else { 10766 ret = copy_from_user(new_mask, arg, len); 10767 } 10768#else 10769 ret = copy_from_user(new_mask, arg, len); 10770#endif 10771 10772 if (ret) { 10773 free_cpumask_var(new_mask); 10774 return -EFAULT; 10775 } 10776 10777 ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); 10778 free_cpumask_var(new_mask); 10779 return ret; 10780} 10781 10782static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) 10783{ 10784 struct io_uring_task *tctx = current->io_uring; 10785 10786 if (!tctx || !tctx->io_wq) 10787 return -EINVAL; 10788 10789 return io_wq_cpu_affinity(tctx->io_wq, NULL); 10790} 10791 10792static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, 10793 void __user *arg) 10794 __must_hold(&ctx->uring_lock) 10795{ 10796 struct io_tctx_node *node; 10797 struct io_uring_task *tctx = NULL; 10798 struct io_sq_data *sqd = NULL; 10799 __u32 new_count[2]; 10800 int i, ret; 10801 10802 if (copy_from_user(new_count, arg, sizeof(new_count))) 10803 return -EFAULT; 10804 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10805 if (new_count[i] > INT_MAX) 10806 return -EINVAL; 10807 10808 if (ctx->flags & IORING_SETUP_SQPOLL) { 10809 sqd = ctx->sq_data; 10810 if (sqd) { 10811 /* 10812 * Observe the correct sqd->lock -> ctx->uring_lock 10813 * ordering. Fine to drop uring_lock here, we hold 10814 * a ref to the ctx. 10815 */ 10816 refcount_inc(&sqd->refs); 10817 mutex_unlock(&ctx->uring_lock); 10818 mutex_lock(&sqd->lock); 10819 mutex_lock(&ctx->uring_lock); 10820 if (sqd->thread) 10821 tctx = sqd->thread->io_uring; 10822 } 10823 } else { 10824 tctx = current->io_uring; 10825 } 10826 10827 BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); 10828 10829 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10830 if (new_count[i]) 10831 ctx->iowq_limits[i] = new_count[i]; 10832 ctx->iowq_limits_set = true; 10833 10834 ret = -EINVAL; 10835 if (tctx && tctx->io_wq) { 10836 ret = io_wq_max_workers(tctx->io_wq, new_count); 10837 if (ret) 10838 goto err; 10839 } else { 10840 memset(new_count, 0, sizeof(new_count)); 10841 } 10842 10843 if (sqd) { 10844 mutex_unlock(&sqd->lock); 10845 io_put_sq_data(sqd); 10846 } 10847 10848 if (copy_to_user(arg, new_count, sizeof(new_count))) 10849 return -EFAULT; 10850 10851 /* that's it for SQPOLL, only the SQPOLL task creates requests */ 10852 if (sqd) 10853 return 0; 10854 10855 /* now propagate the restriction to all registered users */ 10856 list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 10857 struct io_uring_task *tctx = node->task->io_uring; 10858 10859 if (WARN_ON_ONCE(!tctx->io_wq)) 10860 continue; 10861 10862 for (i = 0; i < ARRAY_SIZE(new_count); i++) 10863 new_count[i] = ctx->iowq_limits[i]; 10864 /* ignore errors, it always returns zero anyway */ 10865 (void)io_wq_max_workers(tctx->io_wq, new_count); 10866 } 10867 return 0; 10868err: 10869 if (sqd) { 10870 mutex_unlock(&sqd->lock); 10871 io_put_sq_data(sqd); 10872 } 10873 return ret; 10874} 10875 10876static bool io_register_op_must_quiesce(int op) 10877{ 10878 switch (op) { 10879 case IORING_REGISTER_BUFFERS: 10880 case IORING_UNREGISTER_BUFFERS: 10881 case IORING_REGISTER_FILES: 10882 case IORING_UNREGISTER_FILES: 10883 case IORING_REGISTER_FILES_UPDATE: 10884 case IORING_REGISTER_PROBE: 10885 case IORING_REGISTER_PERSONALITY: 10886 case IORING_UNREGISTER_PERSONALITY: 10887 case 
IORING_REGISTER_FILES2: 10888 case IORING_REGISTER_FILES_UPDATE2: 10889 case IORING_REGISTER_BUFFERS2: 10890 case IORING_REGISTER_BUFFERS_UPDATE: 10891 case IORING_REGISTER_IOWQ_AFF: 10892 case IORING_UNREGISTER_IOWQ_AFF: 10893 case IORING_REGISTER_IOWQ_MAX_WORKERS: 10894 return false; 10895 default: 10896 return true; 10897 } 10898} 10899 10900static int io_ctx_quiesce(struct io_ring_ctx *ctx) 10901{ 10902 long ret; 10903 10904 percpu_ref_kill(&ctx->refs); 10905 10906 /* 10907 * Drop uring mutex before waiting for references to exit. If another 10908 * thread is currently inside io_uring_enter() it might need to grab the 10909 * uring_lock to make progress. If we hold it here across the drain 10910 * wait, then we can deadlock. It's safe to drop the mutex here, since 10911 * no new references will come in after we've killed the percpu ref. 10912 */ 10913 mutex_unlock(&ctx->uring_lock); 10914 do { 10915 ret = wait_for_completion_interruptible(&ctx->ref_comp); 10916 if (!ret) 10917 break; 10918 ret = io_run_task_work_sig(); 10919 } while (ret >= 0); 10920 mutex_lock(&ctx->uring_lock); 10921 10922 if (ret) 10923 io_refs_resurrect(&ctx->refs, &ctx->ref_comp); 10924 return ret; 10925} 10926 10927static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, 10928 void __user *arg, unsigned nr_args) 10929 __releases(ctx->uring_lock) 10930 __acquires(ctx->uring_lock) 10931{ 10932 int ret; 10933 10934 /* 10935 * We're inside the ring mutex, if the ref is already dying, then 10936 * someone else killed the ctx or is already going through 10937 * io_uring_register(). 10938 */ 10939 if (percpu_ref_is_dying(&ctx->refs)) 10940 return -ENXIO; 10941 10942 if (ctx->restricted) { 10943 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 10944 if (!test_bit(opcode, ctx->restrictions.register_op)) 10945 return -EACCES; 10946 } 10947 10948 if (io_register_op_must_quiesce(opcode)) { 10949 ret = io_ctx_quiesce(ctx); 10950 if (ret) 10951 return ret; 10952 } 10953 10954 switch (opcode) { 10955 case IORING_REGISTER_BUFFERS: 10956 ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL); 10957 break; 10958 case IORING_UNREGISTER_BUFFERS: 10959 ret = -EINVAL; 10960 if (arg || nr_args) 10961 break; 10962 ret = io_sqe_buffers_unregister(ctx); 10963 break; 10964 case IORING_REGISTER_FILES: 10965 ret = io_sqe_files_register(ctx, arg, nr_args, NULL); 10966 break; 10967 case IORING_UNREGISTER_FILES: 10968 ret = -EINVAL; 10969 if (arg || nr_args) 10970 break; 10971 ret = io_sqe_files_unregister(ctx); 10972 break; 10973 case IORING_REGISTER_FILES_UPDATE: 10974 ret = io_register_files_update(ctx, arg, nr_args); 10975 break; 10976 case IORING_REGISTER_EVENTFD: 10977 case IORING_REGISTER_EVENTFD_ASYNC: 10978 ret = -EINVAL; 10979 if (nr_args != 1) 10980 break; 10981 ret = io_eventfd_register(ctx, arg); 10982 if (ret) 10983 break; 10984 if (opcode == IORING_REGISTER_EVENTFD_ASYNC) 10985 ctx->eventfd_async = 1; 10986 else 10987 ctx->eventfd_async = 0; 10988 break; 10989 case IORING_UNREGISTER_EVENTFD: 10990 ret = -EINVAL; 10991 if (arg || nr_args) 10992 break; 10993 ret = io_eventfd_unregister(ctx); 10994 break; 10995 case IORING_REGISTER_PROBE: 10996 ret = -EINVAL; 10997 if (!arg || nr_args > 256) 10998 break; 10999 ret = io_probe(ctx, arg, nr_args); 11000 break; 11001 case IORING_REGISTER_PERSONALITY: 11002 ret = -EINVAL; 11003 if (arg || nr_args) 11004 break; 11005 ret = io_register_personality(ctx); 11006 break; 11007 case IORING_UNREGISTER_PERSONALITY: 11008 ret = -EINVAL; 11009 if (arg) 11010 break; 11011 
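		/*
		 * nr_args carries the personality id previously returned by
		 * IORING_REGISTER_PERSONALITY.
		 */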
ret = io_unregister_personality(ctx, nr_args); 11012 break; 11013 case IORING_REGISTER_ENABLE_RINGS: 11014 ret = -EINVAL; 11015 if (arg || nr_args) 11016 break; 11017 ret = io_register_enable_rings(ctx); 11018 break; 11019 case IORING_REGISTER_RESTRICTIONS: 11020 ret = io_register_restrictions(ctx, arg, nr_args); 11021 break; 11022 case IORING_REGISTER_FILES2: 11023 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE); 11024 break; 11025 case IORING_REGISTER_FILES_UPDATE2: 11026 ret = io_register_rsrc_update(ctx, arg, nr_args, 11027 IORING_RSRC_FILE); 11028 break; 11029 case IORING_REGISTER_BUFFERS2: 11030 ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER); 11031 break; 11032 case IORING_REGISTER_BUFFERS_UPDATE: 11033 ret = io_register_rsrc_update(ctx, arg, nr_args, 11034 IORING_RSRC_BUFFER); 11035 break; 11036 case IORING_REGISTER_IOWQ_AFF: 11037 ret = -EINVAL; 11038 if (!arg || !nr_args) 11039 break; 11040 ret = io_register_iowq_aff(ctx, arg, nr_args); 11041 break; 11042 case IORING_UNREGISTER_IOWQ_AFF: 11043 ret = -EINVAL; 11044 if (arg || nr_args) 11045 break; 11046 ret = io_unregister_iowq_aff(ctx); 11047 break; 11048 case IORING_REGISTER_IOWQ_MAX_WORKERS: 11049 ret = -EINVAL; 11050 if (!arg || nr_args != 2) 11051 break; 11052 ret = io_register_iowq_max_workers(ctx, arg); 11053 break; 11054 default: 11055 ret = -EINVAL; 11056 break; 11057 } 11058 11059 if (io_register_op_must_quiesce(opcode)) { 11060 /* bring the ctx back to life */ 11061 percpu_ref_reinit(&ctx->refs); 11062 reinit_completion(&ctx->ref_comp); 11063 } 11064 return ret; 11065} 11066 11067SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, 11068 void __user *, arg, unsigned int, nr_args) 11069{ 11070 struct io_ring_ctx *ctx; 11071 long ret = -EBADF; 11072 struct fd f; 11073 11074 if (opcode >= IORING_REGISTER_LAST) 11075 return -EINVAL; 11076 11077 f = fdget(fd); 11078 if (!f.file) 11079 return -EBADF; 11080 11081 ret = -EOPNOTSUPP; 11082 if (f.file->f_op != &io_uring_fops) 11083 goto out_fput; 11084 11085 ctx = f.file->private_data; 11086 11087 io_run_task_work(); 11088 11089 mutex_lock(&ctx->uring_lock); 11090 ret = __io_uring_register(ctx, opcode, arg, nr_args); 11091 mutex_unlock(&ctx->uring_lock); 11092 trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, 11093 ctx->cq_ev_fd != NULL, ret); 11094out_fput: 11095 fdput(f); 11096 return ret; 11097} 11098 11099static int __init io_uring_init(void) 11100{ 11101#define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ 11102 BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ 11103 BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ 11104} while (0) 11105 11106#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \ 11107 __BUILD_BUG_VERIFY_ELEMENT(struct io_uring_sqe, eoffset, etype, ename) 11108 BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64); 11109 BUILD_BUG_SQE_ELEM(0, __u8, opcode); 11110 BUILD_BUG_SQE_ELEM(1, __u8, flags); 11111 BUILD_BUG_SQE_ELEM(2, __u16, ioprio); 11112 BUILD_BUG_SQE_ELEM(4, __s32, fd); 11113 BUILD_BUG_SQE_ELEM(8, __u64, off); 11114 BUILD_BUG_SQE_ELEM(8, __u64, addr2); 11115 BUILD_BUG_SQE_ELEM(16, __u64, addr); 11116 BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in); 11117 BUILD_BUG_SQE_ELEM(24, __u32, len); 11118 BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags); 11119 BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags); 11120 BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags); 11121 BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags); 11122 BUILD_BUG_SQE_ELEM(28, /* compat */ 
__u16, poll_events); 11123 BUILD_BUG_SQE_ELEM(28, __u32, poll32_events); 11124 BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags); 11125 BUILD_BUG_SQE_ELEM(28, __u32, msg_flags); 11126 BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags); 11127 BUILD_BUG_SQE_ELEM(28, __u32, accept_flags); 11128 BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags); 11129 BUILD_BUG_SQE_ELEM(28, __u32, open_flags); 11130 BUILD_BUG_SQE_ELEM(28, __u32, statx_flags); 11131 BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice); 11132 BUILD_BUG_SQE_ELEM(28, __u32, splice_flags); 11133 BUILD_BUG_SQE_ELEM(32, __u64, user_data); 11134 BUILD_BUG_SQE_ELEM(40, __u16, buf_index); 11135 BUILD_BUG_SQE_ELEM(40, __u16, buf_group); 11136 BUILD_BUG_SQE_ELEM(42, __u16, personality); 11137 BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); 11138 BUILD_BUG_SQE_ELEM(44, __u32, file_index); 11139 11140 BUILD_BUG_ON(sizeof(struct io_uring_files_update) != 11141 sizeof(struct io_uring_rsrc_update)); 11142 BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > 11143 sizeof(struct io_uring_rsrc_update2)); 11144 11145 /* ->buf_index is u16 */ 11146 BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); 11147 11148 /* should fit into one byte */ 11149 BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); 11150 11151 BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); 11152 BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); 11153 11154 req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | 11155 SLAB_ACCOUNT); 11156 return 0; 11157}; 11158__initcall(io_uring_init); 11159
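
/*
 * Userspace sketch of the registration path above (raw syscall, no liburing;
 * ring_fd comes from io_uring_setup(), buf/buf_len are placeholders and
 * error handling is omitted):
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_BUFFERS, &iov, 1);
 *	...
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_UNREGISTER_BUFFERS, NULL, 0);
 */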