162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * fs/direct-io.c 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright (C) 2002, Linus Torvalds. 662306a36Sopenharmony_ci * 762306a36Sopenharmony_ci * O_DIRECT 862306a36Sopenharmony_ci * 962306a36Sopenharmony_ci * 04Jul2002 Andrew Morton 1062306a36Sopenharmony_ci * Initial version 1162306a36Sopenharmony_ci * 11Sep2002 janetinc@us.ibm.com 1262306a36Sopenharmony_ci * added readv/writev support. 1362306a36Sopenharmony_ci * 29Oct2002 Andrew Morton 1462306a36Sopenharmony_ci * rewrote bio_add_page() support. 1562306a36Sopenharmony_ci * 30Oct2002 pbadari@us.ibm.com 1662306a36Sopenharmony_ci * added support for non-aligned IO. 1762306a36Sopenharmony_ci * 06Nov2002 pbadari@us.ibm.com 1862306a36Sopenharmony_ci * added asynchronous IO support. 1962306a36Sopenharmony_ci * 21Jul2003 nathans@sgi.com 2062306a36Sopenharmony_ci * added IO completion notifier. 2162306a36Sopenharmony_ci */ 2262306a36Sopenharmony_ci 2362306a36Sopenharmony_ci#include <linux/kernel.h> 2462306a36Sopenharmony_ci#include <linux/module.h> 2562306a36Sopenharmony_ci#include <linux/types.h> 2662306a36Sopenharmony_ci#include <linux/fs.h> 2762306a36Sopenharmony_ci#include <linux/mm.h> 2862306a36Sopenharmony_ci#include <linux/slab.h> 2962306a36Sopenharmony_ci#include <linux/highmem.h> 3062306a36Sopenharmony_ci#include <linux/pagemap.h> 3162306a36Sopenharmony_ci#include <linux/task_io_accounting_ops.h> 3262306a36Sopenharmony_ci#include <linux/bio.h> 3362306a36Sopenharmony_ci#include <linux/wait.h> 3462306a36Sopenharmony_ci#include <linux/err.h> 3562306a36Sopenharmony_ci#include <linux/blkdev.h> 3662306a36Sopenharmony_ci#include <linux/buffer_head.h> 3762306a36Sopenharmony_ci#include <linux/rwsem.h> 3862306a36Sopenharmony_ci#include <linux/uio.h> 3962306a36Sopenharmony_ci#include <linux/atomic.h> 4062306a36Sopenharmony_ci#include <linux/prefetch.h> 4162306a36Sopenharmony_ci 
4262306a36Sopenharmony_ci#include "internal.h" 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ci/* 4562306a36Sopenharmony_ci * How many user pages to map in one call to iov_iter_extract_pages(). This 4662306a36Sopenharmony_ci * determines the size of a structure in the slab cache 4762306a36Sopenharmony_ci */ 4862306a36Sopenharmony_ci#define DIO_PAGES 64 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_ci/* 5162306a36Sopenharmony_ci * Flags for dio_complete() 5262306a36Sopenharmony_ci */ 5362306a36Sopenharmony_ci#define DIO_COMPLETE_ASYNC 0x01 /* This is async IO */ 5462306a36Sopenharmony_ci#define DIO_COMPLETE_INVALIDATE 0x02 /* Can invalidate pages */ 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci/* 5762306a36Sopenharmony_ci * This code generally works in units of "dio_blocks". A dio_block is 5862306a36Sopenharmony_ci * somewhere between the hard sector size and the filesystem block size. it 5962306a36Sopenharmony_ci * is determined on a per-invocation basis. When talking to the filesystem 6062306a36Sopenharmony_ci * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity 6162306a36Sopenharmony_ci * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted 6262306a36Sopenharmony_ci * to bio_block quantities by shifting left by blkfactor. 6362306a36Sopenharmony_ci * 6462306a36Sopenharmony_ci * If blkfactor is zero then the user's request was aligned to the filesystem's 6562306a36Sopenharmony_ci * blocksize. 
6662306a36Sopenharmony_ci */ 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci/* dio_state only used in the submission path */ 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_cistruct dio_submit { 7162306a36Sopenharmony_ci struct bio *bio; /* bio under assembly */ 7262306a36Sopenharmony_ci unsigned blkbits; /* doesn't change */ 7362306a36Sopenharmony_ci unsigned blkfactor; /* When we're using an alignment which 7462306a36Sopenharmony_ci is finer than the filesystem's soft 7562306a36Sopenharmony_ci blocksize, this specifies how much 7662306a36Sopenharmony_ci finer. blkfactor=2 means 1/4-block 7762306a36Sopenharmony_ci alignment. Does not change */ 7862306a36Sopenharmony_ci unsigned start_zero_done; /* flag: sub-blocksize zeroing has 7962306a36Sopenharmony_ci been performed at the start of a 8062306a36Sopenharmony_ci write */ 8162306a36Sopenharmony_ci int pages_in_io; /* approximate total IO pages */ 8262306a36Sopenharmony_ci sector_t block_in_file; /* Current offset into the underlying 8362306a36Sopenharmony_ci file in dio_block units. */ 8462306a36Sopenharmony_ci unsigned blocks_available; /* At block_in_file. changes */ 8562306a36Sopenharmony_ci int reap_counter; /* rate limit reaping */ 8662306a36Sopenharmony_ci sector_t final_block_in_request;/* doesn't change */ 8762306a36Sopenharmony_ci int boundary; /* prev block is at a boundary */ 8862306a36Sopenharmony_ci get_block_t *get_block; /* block mapping function */ 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_ci loff_t logical_offset_in_bio; /* current first logical block in bio */ 9162306a36Sopenharmony_ci sector_t final_block_in_bio; /* current final block in bio + 1 */ 9262306a36Sopenharmony_ci sector_t next_block_for_io; /* next block to be put under IO, 9362306a36Sopenharmony_ci in dio_blocks units */ 9462306a36Sopenharmony_ci 9562306a36Sopenharmony_ci /* 9662306a36Sopenharmony_ci * Deferred addition of a page to the dio. 
These variables are 9762306a36Sopenharmony_ci * private to dio_send_cur_page(), submit_page_section() and 9862306a36Sopenharmony_ci * dio_bio_add_page(). 9962306a36Sopenharmony_ci */ 10062306a36Sopenharmony_ci struct page *cur_page; /* The page */ 10162306a36Sopenharmony_ci unsigned cur_page_offset; /* Offset into it, in bytes */ 10262306a36Sopenharmony_ci unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 10362306a36Sopenharmony_ci sector_t cur_page_block; /* Where it starts */ 10462306a36Sopenharmony_ci loff_t cur_page_fs_offset; /* Offset in file */ 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci struct iov_iter *iter; 10762306a36Sopenharmony_ci /* 10862306a36Sopenharmony_ci * Page queue. These variables belong to dio_refill_pages() and 10962306a36Sopenharmony_ci * dio_get_page(). 11062306a36Sopenharmony_ci */ 11162306a36Sopenharmony_ci unsigned head; /* next page to process */ 11262306a36Sopenharmony_ci unsigned tail; /* last valid page + 1 */ 11362306a36Sopenharmony_ci size_t from, to; 11462306a36Sopenharmony_ci}; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci/* dio_state communicated between submission path and end_io */ 11762306a36Sopenharmony_cistruct dio { 11862306a36Sopenharmony_ci int flags; /* doesn't change */ 11962306a36Sopenharmony_ci blk_opf_t opf; /* request operation type and flags */ 12062306a36Sopenharmony_ci struct gendisk *bio_disk; 12162306a36Sopenharmony_ci struct inode *inode; 12262306a36Sopenharmony_ci loff_t i_size; /* i_size when submitted */ 12362306a36Sopenharmony_ci dio_iodone_t *end_io; /* IO completion function */ 12462306a36Sopenharmony_ci bool is_pinned; /* T if we have pins on the pages */ 12562306a36Sopenharmony_ci 12662306a36Sopenharmony_ci void *private; /* copy from map_bh.b_private */ 12762306a36Sopenharmony_ci 12862306a36Sopenharmony_ci /* BIO completion state */ 12962306a36Sopenharmony_ci spinlock_t bio_lock; /* protects BIO fields below */ 13062306a36Sopenharmony_ci int page_errors; /* err from 
iov_iter_extract_pages() */ 13162306a36Sopenharmony_ci int is_async; /* is IO async ? */ 13262306a36Sopenharmony_ci bool defer_completion; /* defer AIO completion to workqueue? */ 13362306a36Sopenharmony_ci bool should_dirty; /* if pages should be dirtied */ 13462306a36Sopenharmony_ci int io_error; /* IO error in completion path */ 13562306a36Sopenharmony_ci unsigned long refcount; /* direct_io_worker() and bios */ 13662306a36Sopenharmony_ci struct bio *bio_list; /* singly linked via bi_private */ 13762306a36Sopenharmony_ci struct task_struct *waiter; /* waiting task (NULL if none) */ 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci /* AIO related stuff */ 14062306a36Sopenharmony_ci struct kiocb *iocb; /* kiocb */ 14162306a36Sopenharmony_ci ssize_t result; /* IO result */ 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci /* 14462306a36Sopenharmony_ci * pages[] (and any fields placed after it) are not zeroed out at 14562306a36Sopenharmony_ci * allocation time. Don't add new fields after pages[] unless you 14662306a36Sopenharmony_ci * wish that they not be zeroed. 14762306a36Sopenharmony_ci */ 14862306a36Sopenharmony_ci union { 14962306a36Sopenharmony_ci struct page *pages[DIO_PAGES]; /* page buffer */ 15062306a36Sopenharmony_ci struct work_struct complete_work;/* deferred AIO completion */ 15162306a36Sopenharmony_ci }; 15262306a36Sopenharmony_ci} ____cacheline_aligned_in_smp; 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_cistatic struct kmem_cache *dio_cache __read_mostly; 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_ci/* 15762306a36Sopenharmony_ci * How many pages are in the queue? 15862306a36Sopenharmony_ci */ 15962306a36Sopenharmony_cistatic inline unsigned dio_pages_present(struct dio_submit *sdio) 16062306a36Sopenharmony_ci{ 16162306a36Sopenharmony_ci return sdio->tail - sdio->head; 16262306a36Sopenharmony_ci} 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci/* 16562306a36Sopenharmony_ci * Go grab and pin some userspace pages. 
Typically we'll get 64 at a time. 16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_cistatic inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) 16862306a36Sopenharmony_ci{ 16962306a36Sopenharmony_ci struct page **pages = dio->pages; 17062306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 17162306a36Sopenharmony_ci ssize_t ret; 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, 17462306a36Sopenharmony_ci DIO_PAGES, 0, &sdio->from); 17562306a36Sopenharmony_ci 17662306a36Sopenharmony_ci if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { 17762306a36Sopenharmony_ci /* 17862306a36Sopenharmony_ci * A memory fault, but the filesystem has some outstanding 17962306a36Sopenharmony_ci * mapped blocks. We need to use those blocks up to avoid 18062306a36Sopenharmony_ci * leaking stale data in the file. 18162306a36Sopenharmony_ci */ 18262306a36Sopenharmony_ci if (dio->page_errors == 0) 18362306a36Sopenharmony_ci dio->page_errors = ret; 18462306a36Sopenharmony_ci dio->pages[0] = ZERO_PAGE(0); 18562306a36Sopenharmony_ci sdio->head = 0; 18662306a36Sopenharmony_ci sdio->tail = 1; 18762306a36Sopenharmony_ci sdio->from = 0; 18862306a36Sopenharmony_ci sdio->to = PAGE_SIZE; 18962306a36Sopenharmony_ci return 0; 19062306a36Sopenharmony_ci } 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci if (ret >= 0) { 19362306a36Sopenharmony_ci ret += sdio->from; 19462306a36Sopenharmony_ci sdio->head = 0; 19562306a36Sopenharmony_ci sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE; 19662306a36Sopenharmony_ci sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1; 19762306a36Sopenharmony_ci return 0; 19862306a36Sopenharmony_ci } 19962306a36Sopenharmony_ci return ret; 20062306a36Sopenharmony_ci} 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_ci/* 20362306a36Sopenharmony_ci * Get another userspace page. Returns an ERR_PTR on error. 
Pages are 20462306a36Sopenharmony_ci * buffered inside the dio so that we can call iov_iter_extract_pages() 20562306a36Sopenharmony_ci * against a decent number of pages, less frequently. To provide nicer use of 20662306a36Sopenharmony_ci * the L1 cache. 20762306a36Sopenharmony_ci */ 20862306a36Sopenharmony_cistatic inline struct page *dio_get_page(struct dio *dio, 20962306a36Sopenharmony_ci struct dio_submit *sdio) 21062306a36Sopenharmony_ci{ 21162306a36Sopenharmony_ci if (dio_pages_present(sdio) == 0) { 21262306a36Sopenharmony_ci int ret; 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci ret = dio_refill_pages(dio, sdio); 21562306a36Sopenharmony_ci if (ret) 21662306a36Sopenharmony_ci return ERR_PTR(ret); 21762306a36Sopenharmony_ci BUG_ON(dio_pages_present(sdio) == 0); 21862306a36Sopenharmony_ci } 21962306a36Sopenharmony_ci return dio->pages[sdio->head]; 22062306a36Sopenharmony_ci} 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_cistatic void dio_pin_page(struct dio *dio, struct page *page) 22362306a36Sopenharmony_ci{ 22462306a36Sopenharmony_ci if (dio->is_pinned) 22562306a36Sopenharmony_ci folio_add_pin(page_folio(page)); 22662306a36Sopenharmony_ci} 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_cistatic void dio_unpin_page(struct dio *dio, struct page *page) 22962306a36Sopenharmony_ci{ 23062306a36Sopenharmony_ci if (dio->is_pinned) 23162306a36Sopenharmony_ci unpin_user_page(page); 23262306a36Sopenharmony_ci} 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_ci/* 23562306a36Sopenharmony_ci * dio_complete() - called when all DIO BIO I/O has been completed 23662306a36Sopenharmony_ci * 23762306a36Sopenharmony_ci * This drops i_dio_count, lets interested parties know that a DIO operation 23862306a36Sopenharmony_ci * has completed, and calculates the resulting return code for the operation. 23962306a36Sopenharmony_ci * 24062306a36Sopenharmony_ci * It lets the filesystem know if it registered an interest earlier via 24162306a36Sopenharmony_ci * get_block. 
Pass the private field of the map buffer_head so that 24262306a36Sopenharmony_ci * filesystems can use it to hold additional state between get_block calls and 24362306a36Sopenharmony_ci * dio_complete. 24462306a36Sopenharmony_ci */ 24562306a36Sopenharmony_cistatic ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) 24662306a36Sopenharmony_ci{ 24762306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 24862306a36Sopenharmony_ci loff_t offset = dio->iocb->ki_pos; 24962306a36Sopenharmony_ci ssize_t transferred = 0; 25062306a36Sopenharmony_ci int err; 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_ci /* 25362306a36Sopenharmony_ci * AIO submission can race with bio completion to get here while 25462306a36Sopenharmony_ci * expecting to have the last io completed by bio completion. 25562306a36Sopenharmony_ci * In that case -EIOCBQUEUED is in fact not an error we want 25662306a36Sopenharmony_ci * to preserve through this call. 25762306a36Sopenharmony_ci */ 25862306a36Sopenharmony_ci if (ret == -EIOCBQUEUED) 25962306a36Sopenharmony_ci ret = 0; 26062306a36Sopenharmony_ci 26162306a36Sopenharmony_ci if (dio->result) { 26262306a36Sopenharmony_ci transferred = dio->result; 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci /* Check for short read case */ 26562306a36Sopenharmony_ci if (dio_op == REQ_OP_READ && 26662306a36Sopenharmony_ci ((offset + transferred) > dio->i_size)) 26762306a36Sopenharmony_ci transferred = dio->i_size - offset; 26862306a36Sopenharmony_ci /* ignore EFAULT if some IO has been done */ 26962306a36Sopenharmony_ci if (unlikely(ret == -EFAULT) && transferred) 27062306a36Sopenharmony_ci ret = 0; 27162306a36Sopenharmony_ci } 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci if (ret == 0) 27462306a36Sopenharmony_ci ret = dio->page_errors; 27562306a36Sopenharmony_ci if (ret == 0) 27662306a36Sopenharmony_ci ret = dio->io_error; 27762306a36Sopenharmony_ci if (ret == 0) 27862306a36Sopenharmony_ci ret = transferred; 
27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci if (dio->end_io) { 28162306a36Sopenharmony_ci // XXX: ki_pos?? 28262306a36Sopenharmony_ci err = dio->end_io(dio->iocb, offset, ret, dio->private); 28362306a36Sopenharmony_ci if (err) 28462306a36Sopenharmony_ci ret = err; 28562306a36Sopenharmony_ci } 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci /* 28862306a36Sopenharmony_ci * Try again to invalidate clean pages which might have been cached by 28962306a36Sopenharmony_ci * non-direct readahead, or faulted in by get_user_pages() if the source 29062306a36Sopenharmony_ci * of the write was an mmap'ed region of the file we're writing. Either 29162306a36Sopenharmony_ci * one is a pretty crazy thing to do, so we don't support it 100%. If 29262306a36Sopenharmony_ci * this invalidation fails, tough, the write still worked... 29362306a36Sopenharmony_ci * 29462306a36Sopenharmony_ci * And this page cache invalidation has to be after dio->end_io(), as 29562306a36Sopenharmony_ci * some filesystems convert unwritten extents to real allocations in 29662306a36Sopenharmony_ci * end_io() when necessary, otherwise a racing buffer read would cache 29762306a36Sopenharmony_ci * zeros from unwritten extents. 29862306a36Sopenharmony_ci */ 29962306a36Sopenharmony_ci if (flags & DIO_COMPLETE_INVALIDATE && 30062306a36Sopenharmony_ci ret > 0 && dio_op == REQ_OP_WRITE) 30162306a36Sopenharmony_ci kiocb_invalidate_post_direct_write(dio->iocb, ret); 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci inode_dio_end(dio->inode); 30462306a36Sopenharmony_ci 30562306a36Sopenharmony_ci if (flags & DIO_COMPLETE_ASYNC) { 30662306a36Sopenharmony_ci /* 30762306a36Sopenharmony_ci * generic_write_sync expects ki_pos to have been updated 30862306a36Sopenharmony_ci * already, but the submission path only does this for 30962306a36Sopenharmony_ci * synchronous I/O. 
31062306a36Sopenharmony_ci */ 31162306a36Sopenharmony_ci dio->iocb->ki_pos += transferred; 31262306a36Sopenharmony_ci 31362306a36Sopenharmony_ci if (ret > 0 && dio_op == REQ_OP_WRITE) 31462306a36Sopenharmony_ci ret = generic_write_sync(dio->iocb, ret); 31562306a36Sopenharmony_ci dio->iocb->ki_complete(dio->iocb, ret); 31662306a36Sopenharmony_ci } 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci kmem_cache_free(dio_cache, dio); 31962306a36Sopenharmony_ci return ret; 32062306a36Sopenharmony_ci} 32162306a36Sopenharmony_ci 32262306a36Sopenharmony_cistatic void dio_aio_complete_work(struct work_struct *work) 32362306a36Sopenharmony_ci{ 32462306a36Sopenharmony_ci struct dio *dio = container_of(work, struct dio, complete_work); 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci dio_complete(dio, 0, DIO_COMPLETE_ASYNC | DIO_COMPLETE_INVALIDATE); 32762306a36Sopenharmony_ci} 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_cistatic blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio); 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci/* 33262306a36Sopenharmony_ci * Asynchronous IO callback. 
33362306a36Sopenharmony_ci */ 33462306a36Sopenharmony_cistatic void dio_bio_end_aio(struct bio *bio) 33562306a36Sopenharmony_ci{ 33662306a36Sopenharmony_ci struct dio *dio = bio->bi_private; 33762306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 33862306a36Sopenharmony_ci unsigned long remaining; 33962306a36Sopenharmony_ci unsigned long flags; 34062306a36Sopenharmony_ci bool defer_completion = false; 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_ci /* cleanup the bio */ 34362306a36Sopenharmony_ci dio_bio_complete(dio, bio); 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 34662306a36Sopenharmony_ci remaining = --dio->refcount; 34762306a36Sopenharmony_ci if (remaining == 1 && dio->waiter) 34862306a36Sopenharmony_ci wake_up_process(dio->waiter); 34962306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 35062306a36Sopenharmony_ci 35162306a36Sopenharmony_ci if (remaining == 0) { 35262306a36Sopenharmony_ci /* 35362306a36Sopenharmony_ci * Defer completion when defer_completion is set or 35462306a36Sopenharmony_ci * when the inode has pages mapped and this is AIO write. 35562306a36Sopenharmony_ci * We need to invalidate those pages because there is a 35662306a36Sopenharmony_ci * chance they contain stale data in the case buffered IO 35762306a36Sopenharmony_ci * went in between AIO submission and completion into the 35862306a36Sopenharmony_ci * same region. 
35962306a36Sopenharmony_ci */ 36062306a36Sopenharmony_ci if (dio->result) 36162306a36Sopenharmony_ci defer_completion = dio->defer_completion || 36262306a36Sopenharmony_ci (dio_op == REQ_OP_WRITE && 36362306a36Sopenharmony_ci dio->inode->i_mapping->nrpages); 36462306a36Sopenharmony_ci if (defer_completion) { 36562306a36Sopenharmony_ci INIT_WORK(&dio->complete_work, dio_aio_complete_work); 36662306a36Sopenharmony_ci queue_work(dio->inode->i_sb->s_dio_done_wq, 36762306a36Sopenharmony_ci &dio->complete_work); 36862306a36Sopenharmony_ci } else { 36962306a36Sopenharmony_ci dio_complete(dio, 0, DIO_COMPLETE_ASYNC); 37062306a36Sopenharmony_ci } 37162306a36Sopenharmony_ci } 37262306a36Sopenharmony_ci} 37362306a36Sopenharmony_ci 37462306a36Sopenharmony_ci/* 37562306a36Sopenharmony_ci * The BIO completion handler simply queues the BIO up for the process-context 37662306a36Sopenharmony_ci * handler. 37762306a36Sopenharmony_ci * 37862306a36Sopenharmony_ci * During I/O bi_private points at the dio. After I/O, bi_private is used to 37962306a36Sopenharmony_ci * implement a singly-linked list of completed BIOs, at dio->bio_list. 
38062306a36Sopenharmony_ci */ 38162306a36Sopenharmony_cistatic void dio_bio_end_io(struct bio *bio) 38262306a36Sopenharmony_ci{ 38362306a36Sopenharmony_ci struct dio *dio = bio->bi_private; 38462306a36Sopenharmony_ci unsigned long flags; 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 38762306a36Sopenharmony_ci bio->bi_private = dio->bio_list; 38862306a36Sopenharmony_ci dio->bio_list = bio; 38962306a36Sopenharmony_ci if (--dio->refcount == 1 && dio->waiter) 39062306a36Sopenharmony_ci wake_up_process(dio->waiter); 39162306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 39262306a36Sopenharmony_ci} 39362306a36Sopenharmony_ci 39462306a36Sopenharmony_cistatic inline void 39562306a36Sopenharmony_cidio_bio_alloc(struct dio *dio, struct dio_submit *sdio, 39662306a36Sopenharmony_ci struct block_device *bdev, 39762306a36Sopenharmony_ci sector_t first_sector, int nr_vecs) 39862306a36Sopenharmony_ci{ 39962306a36Sopenharmony_ci struct bio *bio; 40062306a36Sopenharmony_ci 40162306a36Sopenharmony_ci /* 40262306a36Sopenharmony_ci * bio_alloc() is guaranteed to return a bio when allowed to sleep and 40362306a36Sopenharmony_ci * we request a valid number of vectors. 
40462306a36Sopenharmony_ci */ 40562306a36Sopenharmony_ci bio = bio_alloc(bdev, nr_vecs, dio->opf, GFP_KERNEL); 40662306a36Sopenharmony_ci bio->bi_iter.bi_sector = first_sector; 40762306a36Sopenharmony_ci if (dio->is_async) 40862306a36Sopenharmony_ci bio->bi_end_io = dio_bio_end_aio; 40962306a36Sopenharmony_ci else 41062306a36Sopenharmony_ci bio->bi_end_io = dio_bio_end_io; 41162306a36Sopenharmony_ci if (dio->is_pinned) 41262306a36Sopenharmony_ci bio_set_flag(bio, BIO_PAGE_PINNED); 41362306a36Sopenharmony_ci sdio->bio = bio; 41462306a36Sopenharmony_ci sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; 41562306a36Sopenharmony_ci} 41662306a36Sopenharmony_ci 41762306a36Sopenharmony_ci/* 41862306a36Sopenharmony_ci * In the AIO read case we speculatively dirty the pages before starting IO. 41962306a36Sopenharmony_ci * During IO completion, any of these pages which happen to have been written 42062306a36Sopenharmony_ci * back will be redirtied by bio_check_pages_dirty(). 42162306a36Sopenharmony_ci * 42262306a36Sopenharmony_ci * bios hold a dio reference between submit_bio and ->end_io. 
42362306a36Sopenharmony_ci */ 42462306a36Sopenharmony_cistatic inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) 42562306a36Sopenharmony_ci{ 42662306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 42762306a36Sopenharmony_ci struct bio *bio = sdio->bio; 42862306a36Sopenharmony_ci unsigned long flags; 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_ci bio->bi_private = dio; 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 43362306a36Sopenharmony_ci dio->refcount++; 43462306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 43562306a36Sopenharmony_ci 43662306a36Sopenharmony_ci if (dio->is_async && dio_op == REQ_OP_READ && dio->should_dirty) 43762306a36Sopenharmony_ci bio_set_pages_dirty(bio); 43862306a36Sopenharmony_ci 43962306a36Sopenharmony_ci dio->bio_disk = bio->bi_bdev->bd_disk; 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci submit_bio(bio); 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_ci sdio->bio = NULL; 44462306a36Sopenharmony_ci sdio->boundary = 0; 44562306a36Sopenharmony_ci sdio->logical_offset_in_bio = 0; 44662306a36Sopenharmony_ci} 44762306a36Sopenharmony_ci 44862306a36Sopenharmony_ci/* 44962306a36Sopenharmony_ci * Release any resources in case of a failure 45062306a36Sopenharmony_ci */ 45162306a36Sopenharmony_cistatic inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) 45262306a36Sopenharmony_ci{ 45362306a36Sopenharmony_ci if (dio->is_pinned) 45462306a36Sopenharmony_ci unpin_user_pages(dio->pages + sdio->head, 45562306a36Sopenharmony_ci sdio->tail - sdio->head); 45662306a36Sopenharmony_ci sdio->head = sdio->tail; 45762306a36Sopenharmony_ci} 45862306a36Sopenharmony_ci 45962306a36Sopenharmony_ci/* 46062306a36Sopenharmony_ci * Wait for the next BIO to complete. Remove it and return it. NULL is 46162306a36Sopenharmony_ci * returned once all BIOs have been completed. 
This must only be called once 46262306a36Sopenharmony_ci * all bios have been issued so that dio->refcount can only decrease. This 46362306a36Sopenharmony_ci * requires that the caller hold a reference on the dio. 46462306a36Sopenharmony_ci */ 46562306a36Sopenharmony_cistatic struct bio *dio_await_one(struct dio *dio) 46662306a36Sopenharmony_ci{ 46762306a36Sopenharmony_ci unsigned long flags; 46862306a36Sopenharmony_ci struct bio *bio = NULL; 46962306a36Sopenharmony_ci 47062306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 47162306a36Sopenharmony_ci 47262306a36Sopenharmony_ci /* 47362306a36Sopenharmony_ci * Wait as long as the list is empty and there are bios in flight. bio 47462306a36Sopenharmony_ci * completion drops the count, maybe adds to the list, and wakes while 47562306a36Sopenharmony_ci * holding the bio_lock so we don't need set_current_state()'s barrier 47662306a36Sopenharmony_ci * and can call it after testing our condition. 47762306a36Sopenharmony_ci */ 47862306a36Sopenharmony_ci while (dio->refcount > 1 && dio->bio_list == NULL) { 47962306a36Sopenharmony_ci __set_current_state(TASK_UNINTERRUPTIBLE); 48062306a36Sopenharmony_ci dio->waiter = current; 48162306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 48262306a36Sopenharmony_ci blk_io_schedule(); 48362306a36Sopenharmony_ci /* wake up sets us TASK_RUNNING */ 48462306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 48562306a36Sopenharmony_ci dio->waiter = NULL; 48662306a36Sopenharmony_ci } 48762306a36Sopenharmony_ci if (dio->bio_list) { 48862306a36Sopenharmony_ci bio = dio->bio_list; 48962306a36Sopenharmony_ci dio->bio_list = bio->bi_private; 49062306a36Sopenharmony_ci } 49162306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 49262306a36Sopenharmony_ci return bio; 49362306a36Sopenharmony_ci} 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci/* 49662306a36Sopenharmony_ci * Process one completed BIO. No locks are held. 
49762306a36Sopenharmony_ci */ 49862306a36Sopenharmony_cistatic blk_status_t dio_bio_complete(struct dio *dio, struct bio *bio) 49962306a36Sopenharmony_ci{ 50062306a36Sopenharmony_ci blk_status_t err = bio->bi_status; 50162306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 50262306a36Sopenharmony_ci bool should_dirty = dio_op == REQ_OP_READ && dio->should_dirty; 50362306a36Sopenharmony_ci 50462306a36Sopenharmony_ci if (err) { 50562306a36Sopenharmony_ci if (err == BLK_STS_AGAIN && (bio->bi_opf & REQ_NOWAIT)) 50662306a36Sopenharmony_ci dio->io_error = -EAGAIN; 50762306a36Sopenharmony_ci else 50862306a36Sopenharmony_ci dio->io_error = -EIO; 50962306a36Sopenharmony_ci } 51062306a36Sopenharmony_ci 51162306a36Sopenharmony_ci if (dio->is_async && should_dirty) { 51262306a36Sopenharmony_ci bio_check_pages_dirty(bio); /* transfers ownership */ 51362306a36Sopenharmony_ci } else { 51462306a36Sopenharmony_ci bio_release_pages(bio, should_dirty); 51562306a36Sopenharmony_ci bio_put(bio); 51662306a36Sopenharmony_ci } 51762306a36Sopenharmony_ci return err; 51862306a36Sopenharmony_ci} 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci/* 52162306a36Sopenharmony_ci * Wait on and process all in-flight BIOs. This must only be called once 52262306a36Sopenharmony_ci * all bios have been issued so that the refcount can only decrease. 52362306a36Sopenharmony_ci * This just waits for all bios to make it through dio_bio_complete. IO 52462306a36Sopenharmony_ci * errors are propagated through dio->io_error and should be propagated via 52562306a36Sopenharmony_ci * dio_complete(). 
52662306a36Sopenharmony_ci */ 52762306a36Sopenharmony_cistatic void dio_await_completion(struct dio *dio) 52862306a36Sopenharmony_ci{ 52962306a36Sopenharmony_ci struct bio *bio; 53062306a36Sopenharmony_ci do { 53162306a36Sopenharmony_ci bio = dio_await_one(dio); 53262306a36Sopenharmony_ci if (bio) 53362306a36Sopenharmony_ci dio_bio_complete(dio, bio); 53462306a36Sopenharmony_ci } while (bio); 53562306a36Sopenharmony_ci} 53662306a36Sopenharmony_ci 53762306a36Sopenharmony_ci/* 53862306a36Sopenharmony_ci * A really large O_DIRECT read or write can generate a lot of BIOs. So 53962306a36Sopenharmony_ci * to keep the memory consumption sane we periodically reap any completed BIOs 54062306a36Sopenharmony_ci * during the BIO generation phase. 54162306a36Sopenharmony_ci * 54262306a36Sopenharmony_ci * This also helps to limit the peak amount of pinned userspace memory. 54362306a36Sopenharmony_ci */ 54462306a36Sopenharmony_cistatic inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) 54562306a36Sopenharmony_ci{ 54662306a36Sopenharmony_ci int ret = 0; 54762306a36Sopenharmony_ci 54862306a36Sopenharmony_ci if (sdio->reap_counter++ >= 64) { 54962306a36Sopenharmony_ci while (dio->bio_list) { 55062306a36Sopenharmony_ci unsigned long flags; 55162306a36Sopenharmony_ci struct bio *bio; 55262306a36Sopenharmony_ci int ret2; 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 55562306a36Sopenharmony_ci bio = dio->bio_list; 55662306a36Sopenharmony_ci dio->bio_list = bio->bi_private; 55762306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 55862306a36Sopenharmony_ci ret2 = blk_status_to_errno(dio_bio_complete(dio, bio)); 55962306a36Sopenharmony_ci if (ret == 0) 56062306a36Sopenharmony_ci ret = ret2; 56162306a36Sopenharmony_ci } 56262306a36Sopenharmony_ci sdio->reap_counter = 0; 56362306a36Sopenharmony_ci } 56462306a36Sopenharmony_ci return ret; 56562306a36Sopenharmony_ci} 56662306a36Sopenharmony_ci 
56762306a36Sopenharmony_cistatic int dio_set_defer_completion(struct dio *dio) 56862306a36Sopenharmony_ci{ 56962306a36Sopenharmony_ci struct super_block *sb = dio->inode->i_sb; 57062306a36Sopenharmony_ci 57162306a36Sopenharmony_ci if (dio->defer_completion) 57262306a36Sopenharmony_ci return 0; 57362306a36Sopenharmony_ci dio->defer_completion = true; 57462306a36Sopenharmony_ci if (!sb->s_dio_done_wq) 57562306a36Sopenharmony_ci return sb_init_dio_done_wq(sb); 57662306a36Sopenharmony_ci return 0; 57762306a36Sopenharmony_ci} 57862306a36Sopenharmony_ci 57962306a36Sopenharmony_ci/* 58062306a36Sopenharmony_ci * Call into the fs to map some more disk blocks. We record the current number 58162306a36Sopenharmony_ci * of available blocks at sdio->blocks_available. These are in units of the 58262306a36Sopenharmony_ci * fs blocksize, i_blocksize(inode). 58362306a36Sopenharmony_ci * 58462306a36Sopenharmony_ci * The fs is allowed to map lots of blocks at once. If it wants to do that, 58562306a36Sopenharmony_ci * it uses the passed inode-relative block number as the file offset, as usual. 58662306a36Sopenharmony_ci * 58762306a36Sopenharmony_ci * get_block() is passed the number of i_blkbits-sized blocks which direct_io 58862306a36Sopenharmony_ci * has remaining to do. The fs should not map more than this number of blocks. 58962306a36Sopenharmony_ci * 59062306a36Sopenharmony_ci * If the fs has mapped a lot of blocks, it should populate bh->b_size to 59162306a36Sopenharmony_ci * indicate how much contiguous disk space has been made available at 59262306a36Sopenharmony_ci * bh->b_blocknr. 59362306a36Sopenharmony_ci * 59462306a36Sopenharmony_ci * If *any* of the mapped blocks are new, then the fs must set buffer_new(). 59562306a36Sopenharmony_ci * This isn't very efficient... 
59662306a36Sopenharmony_ci * 59762306a36Sopenharmony_ci * In the case of filesystem holes: the fs may return an arbitrarily-large 59862306a36Sopenharmony_ci * hole by returning an appropriate value in b_size and by clearing 59962306a36Sopenharmony_ci * buffer_mapped(). However the direct-io code will only process holes one 60062306a36Sopenharmony_ci * block at a time - it will repeatedly call get_block() as it walks the hole. 60162306a36Sopenharmony_ci */ 60262306a36Sopenharmony_cistatic int get_more_blocks(struct dio *dio, struct dio_submit *sdio, 60362306a36Sopenharmony_ci struct buffer_head *map_bh) 60462306a36Sopenharmony_ci{ 60562306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 60662306a36Sopenharmony_ci int ret; 60762306a36Sopenharmony_ci sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 60862306a36Sopenharmony_ci sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ 60962306a36Sopenharmony_ci unsigned long fs_count; /* Number of filesystem-sized blocks */ 61062306a36Sopenharmony_ci int create; 61162306a36Sopenharmony_ci unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; 61262306a36Sopenharmony_ci loff_t i_size; 61362306a36Sopenharmony_ci 61462306a36Sopenharmony_ci /* 61562306a36Sopenharmony_ci * If there was a memory error and we've overwritten all the 61662306a36Sopenharmony_ci * mapped blocks then we can now return that memory error 61762306a36Sopenharmony_ci */ 61862306a36Sopenharmony_ci ret = dio->page_errors; 61962306a36Sopenharmony_ci if (ret == 0) { 62062306a36Sopenharmony_ci BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 62162306a36Sopenharmony_ci fs_startblk = sdio->block_in_file >> sdio->blkfactor; 62262306a36Sopenharmony_ci fs_endblk = (sdio->final_block_in_request - 1) >> 62362306a36Sopenharmony_ci sdio->blkfactor; 62462306a36Sopenharmony_ci fs_count = fs_endblk - fs_startblk + 1; 62562306a36Sopenharmony_ci 62662306a36Sopenharmony_ci map_bh->b_state = 0; 
62762306a36Sopenharmony_ci map_bh->b_size = fs_count << i_blkbits; 62862306a36Sopenharmony_ci 62962306a36Sopenharmony_ci /* 63062306a36Sopenharmony_ci * For writes that could fill holes inside i_size on a 63162306a36Sopenharmony_ci * DIO_SKIP_HOLES filesystem we forbid block creations: only 63262306a36Sopenharmony_ci * overwrites are permitted. We will return early to the caller 63362306a36Sopenharmony_ci * once we see an unmapped buffer head returned, and the caller 63462306a36Sopenharmony_ci * will fall back to buffered I/O. 63562306a36Sopenharmony_ci * 63662306a36Sopenharmony_ci * Otherwise the decision is left to the get_blocks method, 63762306a36Sopenharmony_ci * which may decide to handle it or also return an unmapped 63862306a36Sopenharmony_ci * buffer head. 63962306a36Sopenharmony_ci */ 64062306a36Sopenharmony_ci create = dio_op == REQ_OP_WRITE; 64162306a36Sopenharmony_ci if (dio->flags & DIO_SKIP_HOLES) { 64262306a36Sopenharmony_ci i_size = i_size_read(dio->inode); 64362306a36Sopenharmony_ci if (i_size && fs_startblk <= (i_size - 1) >> i_blkbits) 64462306a36Sopenharmony_ci create = 0; 64562306a36Sopenharmony_ci } 64662306a36Sopenharmony_ci 64762306a36Sopenharmony_ci ret = (*sdio->get_block)(dio->inode, fs_startblk, 64862306a36Sopenharmony_ci map_bh, create); 64962306a36Sopenharmony_ci 65062306a36Sopenharmony_ci /* Store for completion */ 65162306a36Sopenharmony_ci dio->private = map_bh->b_private; 65262306a36Sopenharmony_ci 65362306a36Sopenharmony_ci if (ret == 0 && buffer_defer_completion(map_bh)) 65462306a36Sopenharmony_ci ret = dio_set_defer_completion(dio); 65562306a36Sopenharmony_ci } 65662306a36Sopenharmony_ci return ret; 65762306a36Sopenharmony_ci} 65862306a36Sopenharmony_ci 65962306a36Sopenharmony_ci/* 66062306a36Sopenharmony_ci * There is no bio. Make one now. 
66162306a36Sopenharmony_ci */ 66262306a36Sopenharmony_cistatic inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 66362306a36Sopenharmony_ci sector_t start_sector, struct buffer_head *map_bh) 66462306a36Sopenharmony_ci{ 66562306a36Sopenharmony_ci sector_t sector; 66662306a36Sopenharmony_ci int ret, nr_pages; 66762306a36Sopenharmony_ci 66862306a36Sopenharmony_ci ret = dio_bio_reap(dio, sdio); 66962306a36Sopenharmony_ci if (ret) 67062306a36Sopenharmony_ci goto out; 67162306a36Sopenharmony_ci sector = start_sector << (sdio->blkbits - 9); 67262306a36Sopenharmony_ci nr_pages = bio_max_segs(sdio->pages_in_io); 67362306a36Sopenharmony_ci BUG_ON(nr_pages <= 0); 67462306a36Sopenharmony_ci dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); 67562306a36Sopenharmony_ci sdio->boundary = 0; 67662306a36Sopenharmony_ciout: 67762306a36Sopenharmony_ci return ret; 67862306a36Sopenharmony_ci} 67962306a36Sopenharmony_ci 68062306a36Sopenharmony_ci/* 68162306a36Sopenharmony_ci * Attempt to put the current chunk of 'cur_page' into the current BIO. If 68262306a36Sopenharmony_ci * that was successful then update final_block_in_bio and take a ref against 68362306a36Sopenharmony_ci * the just-added page. 68462306a36Sopenharmony_ci * 68562306a36Sopenharmony_ci * Return zero on success. Non-zero means the caller needs to start a new BIO. 
68662306a36Sopenharmony_ci */ 68762306a36Sopenharmony_cistatic inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio) 68862306a36Sopenharmony_ci{ 68962306a36Sopenharmony_ci int ret; 69062306a36Sopenharmony_ci 69162306a36Sopenharmony_ci ret = bio_add_page(sdio->bio, sdio->cur_page, 69262306a36Sopenharmony_ci sdio->cur_page_len, sdio->cur_page_offset); 69362306a36Sopenharmony_ci if (ret == sdio->cur_page_len) { 69462306a36Sopenharmony_ci /* 69562306a36Sopenharmony_ci * Decrement count only, if we are done with this page 69662306a36Sopenharmony_ci */ 69762306a36Sopenharmony_ci if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) 69862306a36Sopenharmony_ci sdio->pages_in_io--; 69962306a36Sopenharmony_ci dio_pin_page(dio, sdio->cur_page); 70062306a36Sopenharmony_ci sdio->final_block_in_bio = sdio->cur_page_block + 70162306a36Sopenharmony_ci (sdio->cur_page_len >> sdio->blkbits); 70262306a36Sopenharmony_ci ret = 0; 70362306a36Sopenharmony_ci } else { 70462306a36Sopenharmony_ci ret = 1; 70562306a36Sopenharmony_ci } 70662306a36Sopenharmony_ci return ret; 70762306a36Sopenharmony_ci} 70862306a36Sopenharmony_ci 70962306a36Sopenharmony_ci/* 71062306a36Sopenharmony_ci * Put cur_page under IO. The section of cur_page which is described by 71162306a36Sopenharmony_ci * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page 71262306a36Sopenharmony_ci * starts on-disk at cur_page_block. 71362306a36Sopenharmony_ci * 71462306a36Sopenharmony_ci * We take a ref against the page here (on behalf of its presence in the bio). 71562306a36Sopenharmony_ci * 71662306a36Sopenharmony_ci * The caller of this function is responsible for removing cur_page from the 71762306a36Sopenharmony_ci * dio, and for dropping the refcount which came from that presence. 
71862306a36Sopenharmony_ci */ 71962306a36Sopenharmony_cistatic inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, 72062306a36Sopenharmony_ci struct buffer_head *map_bh) 72162306a36Sopenharmony_ci{ 72262306a36Sopenharmony_ci int ret = 0; 72362306a36Sopenharmony_ci 72462306a36Sopenharmony_ci if (sdio->bio) { 72562306a36Sopenharmony_ci loff_t cur_offset = sdio->cur_page_fs_offset; 72662306a36Sopenharmony_ci loff_t bio_next_offset = sdio->logical_offset_in_bio + 72762306a36Sopenharmony_ci sdio->bio->bi_iter.bi_size; 72862306a36Sopenharmony_ci 72962306a36Sopenharmony_ci /* 73062306a36Sopenharmony_ci * See whether this new request is contiguous with the old. 73162306a36Sopenharmony_ci * 73262306a36Sopenharmony_ci * Btrfs cannot handle having logically non-contiguous requests 73362306a36Sopenharmony_ci * submitted. For example if you have 73462306a36Sopenharmony_ci * 73562306a36Sopenharmony_ci * Logical: [0-4095][HOLE][8192-12287] 73662306a36Sopenharmony_ci * Physical: [0-4095] [4096-8191] 73762306a36Sopenharmony_ci * 73862306a36Sopenharmony_ci * We cannot submit those pages together as one BIO. So if our 73962306a36Sopenharmony_ci * current logical offset in the file does not equal what would 74062306a36Sopenharmony_ci * be the next logical offset in the bio, submit the bio we 74162306a36Sopenharmony_ci * have. 
74262306a36Sopenharmony_ci */ 74362306a36Sopenharmony_ci if (sdio->final_block_in_bio != sdio->cur_page_block || 74462306a36Sopenharmony_ci cur_offset != bio_next_offset) 74562306a36Sopenharmony_ci dio_bio_submit(dio, sdio); 74662306a36Sopenharmony_ci } 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci if (sdio->bio == NULL) { 74962306a36Sopenharmony_ci ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); 75062306a36Sopenharmony_ci if (ret) 75162306a36Sopenharmony_ci goto out; 75262306a36Sopenharmony_ci } 75362306a36Sopenharmony_ci 75462306a36Sopenharmony_ci if (dio_bio_add_page(dio, sdio) != 0) { 75562306a36Sopenharmony_ci dio_bio_submit(dio, sdio); 75662306a36Sopenharmony_ci ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); 75762306a36Sopenharmony_ci if (ret == 0) { 75862306a36Sopenharmony_ci ret = dio_bio_add_page(dio, sdio); 75962306a36Sopenharmony_ci BUG_ON(ret != 0); 76062306a36Sopenharmony_ci } 76162306a36Sopenharmony_ci } 76262306a36Sopenharmony_ciout: 76362306a36Sopenharmony_ci return ret; 76462306a36Sopenharmony_ci} 76562306a36Sopenharmony_ci 76662306a36Sopenharmony_ci/* 76762306a36Sopenharmony_ci * An autonomous function to put a chunk of a page under deferred IO. 76862306a36Sopenharmony_ci * 76962306a36Sopenharmony_ci * The caller doesn't actually know (or care) whether this piece of page is in 77062306a36Sopenharmony_ci * a BIO, or is under IO or whatever. We just take care of all possible 77162306a36Sopenharmony_ci * situations here. The separation between the logic of do_direct_IO() and 77262306a36Sopenharmony_ci * that of submit_page_section() is important for clarity. Please don't break. 77362306a36Sopenharmony_ci * 77462306a36Sopenharmony_ci * The chunk of page starts on-disk at blocknr. 77562306a36Sopenharmony_ci * 77662306a36Sopenharmony_ci * We perform deferred IO, by recording the last-submitted page inside our 77762306a36Sopenharmony_ci * private part of the dio structure. 
If possible, we just expand the IO 77862306a36Sopenharmony_ci * across that page here. 77962306a36Sopenharmony_ci * 78062306a36Sopenharmony_ci * If that doesn't work out then we put the old page into the bio and add this 78162306a36Sopenharmony_ci * page to the dio instead. 78262306a36Sopenharmony_ci */ 78362306a36Sopenharmony_cistatic inline int 78462306a36Sopenharmony_cisubmit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, 78562306a36Sopenharmony_ci unsigned offset, unsigned len, sector_t blocknr, 78662306a36Sopenharmony_ci struct buffer_head *map_bh) 78762306a36Sopenharmony_ci{ 78862306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 78962306a36Sopenharmony_ci int ret = 0; 79062306a36Sopenharmony_ci int boundary = sdio->boundary; /* dio_send_cur_page may clear it */ 79162306a36Sopenharmony_ci 79262306a36Sopenharmony_ci if (dio_op == REQ_OP_WRITE) { 79362306a36Sopenharmony_ci /* 79462306a36Sopenharmony_ci * Read accounting is performed in submit_bio() 79562306a36Sopenharmony_ci */ 79662306a36Sopenharmony_ci task_io_account_write(len); 79762306a36Sopenharmony_ci } 79862306a36Sopenharmony_ci 79962306a36Sopenharmony_ci /* 80062306a36Sopenharmony_ci * Can we just grow the current page's presence in the dio? 80162306a36Sopenharmony_ci */ 80262306a36Sopenharmony_ci if (sdio->cur_page == page && 80362306a36Sopenharmony_ci sdio->cur_page_offset + sdio->cur_page_len == offset && 80462306a36Sopenharmony_ci sdio->cur_page_block + 80562306a36Sopenharmony_ci (sdio->cur_page_len >> sdio->blkbits) == blocknr) { 80662306a36Sopenharmony_ci sdio->cur_page_len += len; 80762306a36Sopenharmony_ci goto out; 80862306a36Sopenharmony_ci } 80962306a36Sopenharmony_ci 81062306a36Sopenharmony_ci /* 81162306a36Sopenharmony_ci * If there's a deferred page already there then send it. 
81262306a36Sopenharmony_ci */ 81362306a36Sopenharmony_ci if (sdio->cur_page) { 81462306a36Sopenharmony_ci ret = dio_send_cur_page(dio, sdio, map_bh); 81562306a36Sopenharmony_ci dio_unpin_page(dio, sdio->cur_page); 81662306a36Sopenharmony_ci sdio->cur_page = NULL; 81762306a36Sopenharmony_ci if (ret) 81862306a36Sopenharmony_ci return ret; 81962306a36Sopenharmony_ci } 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_ci dio_pin_page(dio, page); /* It is in dio */ 82262306a36Sopenharmony_ci sdio->cur_page = page; 82362306a36Sopenharmony_ci sdio->cur_page_offset = offset; 82462306a36Sopenharmony_ci sdio->cur_page_len = len; 82562306a36Sopenharmony_ci sdio->cur_page_block = blocknr; 82662306a36Sopenharmony_ci sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; 82762306a36Sopenharmony_ciout: 82862306a36Sopenharmony_ci /* 82962306a36Sopenharmony_ci * If boundary then we want to schedule the IO now to 83062306a36Sopenharmony_ci * avoid metadata seeks. 83162306a36Sopenharmony_ci */ 83262306a36Sopenharmony_ci if (boundary) { 83362306a36Sopenharmony_ci ret = dio_send_cur_page(dio, sdio, map_bh); 83462306a36Sopenharmony_ci if (sdio->bio) 83562306a36Sopenharmony_ci dio_bio_submit(dio, sdio); 83662306a36Sopenharmony_ci dio_unpin_page(dio, sdio->cur_page); 83762306a36Sopenharmony_ci sdio->cur_page = NULL; 83862306a36Sopenharmony_ci } 83962306a36Sopenharmony_ci return ret; 84062306a36Sopenharmony_ci} 84162306a36Sopenharmony_ci 84262306a36Sopenharmony_ci/* 84362306a36Sopenharmony_ci * If we are not writing the entire block and get_block() allocated 84462306a36Sopenharmony_ci * the block for us, we need to fill-in the unused portion of the 84562306a36Sopenharmony_ci * block with zeros. This happens only if user-buffer, fileoffset or 84662306a36Sopenharmony_ci * io length is not filesystem block-size multiple. 84762306a36Sopenharmony_ci * 84862306a36Sopenharmony_ci * `end' is zero if we're doing the start of the IO, 1 at the end of the 84962306a36Sopenharmony_ci * IO. 
85062306a36Sopenharmony_ci */ 85162306a36Sopenharmony_cistatic inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, 85262306a36Sopenharmony_ci int end, struct buffer_head *map_bh) 85362306a36Sopenharmony_ci{ 85462306a36Sopenharmony_ci unsigned dio_blocks_per_fs_block; 85562306a36Sopenharmony_ci unsigned this_chunk_blocks; /* In dio_blocks */ 85662306a36Sopenharmony_ci unsigned this_chunk_bytes; 85762306a36Sopenharmony_ci struct page *page; 85862306a36Sopenharmony_ci 85962306a36Sopenharmony_ci sdio->start_zero_done = 1; 86062306a36Sopenharmony_ci if (!sdio->blkfactor || !buffer_new(map_bh)) 86162306a36Sopenharmony_ci return; 86262306a36Sopenharmony_ci 86362306a36Sopenharmony_ci dio_blocks_per_fs_block = 1 << sdio->blkfactor; 86462306a36Sopenharmony_ci this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); 86562306a36Sopenharmony_ci 86662306a36Sopenharmony_ci if (!this_chunk_blocks) 86762306a36Sopenharmony_ci return; 86862306a36Sopenharmony_ci 86962306a36Sopenharmony_ci /* 87062306a36Sopenharmony_ci * We need to zero out part of an fs block. It is either at the 87162306a36Sopenharmony_ci * beginning or the end of the fs block. 
87262306a36Sopenharmony_ci */ 87362306a36Sopenharmony_ci if (end) 87462306a36Sopenharmony_ci this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; 87562306a36Sopenharmony_ci 87662306a36Sopenharmony_ci this_chunk_bytes = this_chunk_blocks << sdio->blkbits; 87762306a36Sopenharmony_ci 87862306a36Sopenharmony_ci page = ZERO_PAGE(0); 87962306a36Sopenharmony_ci if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, 88062306a36Sopenharmony_ci sdio->next_block_for_io, map_bh)) 88162306a36Sopenharmony_ci return; 88262306a36Sopenharmony_ci 88362306a36Sopenharmony_ci sdio->next_block_for_io += this_chunk_blocks; 88462306a36Sopenharmony_ci} 88562306a36Sopenharmony_ci 88662306a36Sopenharmony_ci/* 88762306a36Sopenharmony_ci * Walk the user pages, and the file, mapping blocks to disk and generating 88862306a36Sopenharmony_ci * a sequence of (page,offset,len,block) mappings. These mappings are injected 88962306a36Sopenharmony_ci * into submit_page_section(), which takes care of the next stage of submission 89062306a36Sopenharmony_ci * 89162306a36Sopenharmony_ci * Direct IO against a blockdev is different from a file. Because we can 89262306a36Sopenharmony_ci * happily perform page-sized but 512-byte aligned IOs. It is important that 89362306a36Sopenharmony_ci * blockdev IO be able to have fine alignment and large sizes. 89462306a36Sopenharmony_ci * 89562306a36Sopenharmony_ci * So what we do is to permit the ->get_block function to populate bh.b_size 89662306a36Sopenharmony_ci * with the size of IO which is permitted at this offset and this i_blkbits. 89762306a36Sopenharmony_ci * 89862306a36Sopenharmony_ci * For best results, the blockdev should be set up with 512-byte i_blkbits and 89962306a36Sopenharmony_ci * it should set b_size to PAGE_SIZE or more inside get_block(). This gives 90062306a36Sopenharmony_ci * fine alignment but still allows this function to work in PAGE_SIZE units. 
90162306a36Sopenharmony_ci */ 90262306a36Sopenharmony_cistatic int do_direct_IO(struct dio *dio, struct dio_submit *sdio, 90362306a36Sopenharmony_ci struct buffer_head *map_bh) 90462306a36Sopenharmony_ci{ 90562306a36Sopenharmony_ci const enum req_op dio_op = dio->opf & REQ_OP_MASK; 90662306a36Sopenharmony_ci const unsigned blkbits = sdio->blkbits; 90762306a36Sopenharmony_ci const unsigned i_blkbits = blkbits + sdio->blkfactor; 90862306a36Sopenharmony_ci int ret = 0; 90962306a36Sopenharmony_ci 91062306a36Sopenharmony_ci while (sdio->block_in_file < sdio->final_block_in_request) { 91162306a36Sopenharmony_ci struct page *page; 91262306a36Sopenharmony_ci size_t from, to; 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_ci page = dio_get_page(dio, sdio); 91562306a36Sopenharmony_ci if (IS_ERR(page)) { 91662306a36Sopenharmony_ci ret = PTR_ERR(page); 91762306a36Sopenharmony_ci goto out; 91862306a36Sopenharmony_ci } 91962306a36Sopenharmony_ci from = sdio->head ? 0 : sdio->from; 92062306a36Sopenharmony_ci to = (sdio->head == sdio->tail - 1) ? 
sdio->to : PAGE_SIZE; 92162306a36Sopenharmony_ci sdio->head++; 92262306a36Sopenharmony_ci 92362306a36Sopenharmony_ci while (from < to) { 92462306a36Sopenharmony_ci unsigned this_chunk_bytes; /* # of bytes mapped */ 92562306a36Sopenharmony_ci unsigned this_chunk_blocks; /* # of blocks */ 92662306a36Sopenharmony_ci unsigned u; 92762306a36Sopenharmony_ci 92862306a36Sopenharmony_ci if (sdio->blocks_available == 0) { 92962306a36Sopenharmony_ci /* 93062306a36Sopenharmony_ci * Need to go and map some more disk 93162306a36Sopenharmony_ci */ 93262306a36Sopenharmony_ci unsigned long blkmask; 93362306a36Sopenharmony_ci unsigned long dio_remainder; 93462306a36Sopenharmony_ci 93562306a36Sopenharmony_ci ret = get_more_blocks(dio, sdio, map_bh); 93662306a36Sopenharmony_ci if (ret) { 93762306a36Sopenharmony_ci dio_unpin_page(dio, page); 93862306a36Sopenharmony_ci goto out; 93962306a36Sopenharmony_ci } 94062306a36Sopenharmony_ci if (!buffer_mapped(map_bh)) 94162306a36Sopenharmony_ci goto do_holes; 94262306a36Sopenharmony_ci 94362306a36Sopenharmony_ci sdio->blocks_available = 94462306a36Sopenharmony_ci map_bh->b_size >> blkbits; 94562306a36Sopenharmony_ci sdio->next_block_for_io = 94662306a36Sopenharmony_ci map_bh->b_blocknr << sdio->blkfactor; 94762306a36Sopenharmony_ci if (buffer_new(map_bh)) { 94862306a36Sopenharmony_ci clean_bdev_aliases( 94962306a36Sopenharmony_ci map_bh->b_bdev, 95062306a36Sopenharmony_ci map_bh->b_blocknr, 95162306a36Sopenharmony_ci map_bh->b_size >> i_blkbits); 95262306a36Sopenharmony_ci } 95362306a36Sopenharmony_ci 95462306a36Sopenharmony_ci if (!sdio->blkfactor) 95562306a36Sopenharmony_ci goto do_holes; 95662306a36Sopenharmony_ci 95762306a36Sopenharmony_ci blkmask = (1 << sdio->blkfactor) - 1; 95862306a36Sopenharmony_ci dio_remainder = (sdio->block_in_file & blkmask); 95962306a36Sopenharmony_ci 96062306a36Sopenharmony_ci /* 96162306a36Sopenharmony_ci * If we are at the start of IO and that IO 96262306a36Sopenharmony_ci * starts partway into a fs-block, 
96362306a36Sopenharmony_ci * dio_remainder will be non-zero. If the IO 96462306a36Sopenharmony_ci * is a read then we can simply advance the IO 96562306a36Sopenharmony_ci * cursor to the first block which is to be 96662306a36Sopenharmony_ci * read. But if the IO is a write and the 96762306a36Sopenharmony_ci * block was newly allocated we cannot do that; 96862306a36Sopenharmony_ci * the start of the fs block must be zeroed out 96962306a36Sopenharmony_ci * on-disk 97062306a36Sopenharmony_ci */ 97162306a36Sopenharmony_ci if (!buffer_new(map_bh)) 97262306a36Sopenharmony_ci sdio->next_block_for_io += dio_remainder; 97362306a36Sopenharmony_ci sdio->blocks_available -= dio_remainder; 97462306a36Sopenharmony_ci } 97562306a36Sopenharmony_cido_holes: 97662306a36Sopenharmony_ci /* Handle holes */ 97762306a36Sopenharmony_ci if (!buffer_mapped(map_bh)) { 97862306a36Sopenharmony_ci loff_t i_size_aligned; 97962306a36Sopenharmony_ci 98062306a36Sopenharmony_ci /* AKPM: eargh, -ENOTBLK is a hack */ 98162306a36Sopenharmony_ci if (dio_op == REQ_OP_WRITE) { 98262306a36Sopenharmony_ci dio_unpin_page(dio, page); 98362306a36Sopenharmony_ci return -ENOTBLK; 98462306a36Sopenharmony_ci } 98562306a36Sopenharmony_ci 98662306a36Sopenharmony_ci /* 98762306a36Sopenharmony_ci * Be sure to account for a partial block as the 98862306a36Sopenharmony_ci * last block in the file 98962306a36Sopenharmony_ci */ 99062306a36Sopenharmony_ci i_size_aligned = ALIGN(i_size_read(dio->inode), 99162306a36Sopenharmony_ci 1 << blkbits); 99262306a36Sopenharmony_ci if (sdio->block_in_file >= 99362306a36Sopenharmony_ci i_size_aligned >> blkbits) { 99462306a36Sopenharmony_ci /* We hit eof */ 99562306a36Sopenharmony_ci dio_unpin_page(dio, page); 99662306a36Sopenharmony_ci goto out; 99762306a36Sopenharmony_ci } 99862306a36Sopenharmony_ci zero_user(page, from, 1 << blkbits); 99962306a36Sopenharmony_ci sdio->block_in_file++; 100062306a36Sopenharmony_ci from += 1 << blkbits; 100162306a36Sopenharmony_ci dio->result += 1 << 
blkbits; 100262306a36Sopenharmony_ci goto next_block; 100362306a36Sopenharmony_ci } 100462306a36Sopenharmony_ci 100562306a36Sopenharmony_ci /* 100662306a36Sopenharmony_ci * If we're performing IO which has an alignment which 100762306a36Sopenharmony_ci * is finer than the underlying fs, go check to see if 100862306a36Sopenharmony_ci * we must zero out the start of this block. 100962306a36Sopenharmony_ci */ 101062306a36Sopenharmony_ci if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) 101162306a36Sopenharmony_ci dio_zero_block(dio, sdio, 0, map_bh); 101262306a36Sopenharmony_ci 101362306a36Sopenharmony_ci /* 101462306a36Sopenharmony_ci * Work out, in this_chunk_blocks, how much disk we 101562306a36Sopenharmony_ci * can add to this page 101662306a36Sopenharmony_ci */ 101762306a36Sopenharmony_ci this_chunk_blocks = sdio->blocks_available; 101862306a36Sopenharmony_ci u = (to - from) >> blkbits; 101962306a36Sopenharmony_ci if (this_chunk_blocks > u) 102062306a36Sopenharmony_ci this_chunk_blocks = u; 102162306a36Sopenharmony_ci u = sdio->final_block_in_request - sdio->block_in_file; 102262306a36Sopenharmony_ci if (this_chunk_blocks > u) 102362306a36Sopenharmony_ci this_chunk_blocks = u; 102462306a36Sopenharmony_ci this_chunk_bytes = this_chunk_blocks << blkbits; 102562306a36Sopenharmony_ci BUG_ON(this_chunk_bytes == 0); 102662306a36Sopenharmony_ci 102762306a36Sopenharmony_ci if (this_chunk_blocks == sdio->blocks_available) 102862306a36Sopenharmony_ci sdio->boundary = buffer_boundary(map_bh); 102962306a36Sopenharmony_ci ret = submit_page_section(dio, sdio, page, 103062306a36Sopenharmony_ci from, 103162306a36Sopenharmony_ci this_chunk_bytes, 103262306a36Sopenharmony_ci sdio->next_block_for_io, 103362306a36Sopenharmony_ci map_bh); 103462306a36Sopenharmony_ci if (ret) { 103562306a36Sopenharmony_ci dio_unpin_page(dio, page); 103662306a36Sopenharmony_ci goto out; 103762306a36Sopenharmony_ci } 103862306a36Sopenharmony_ci sdio->next_block_for_io += this_chunk_blocks; 
103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci sdio->block_in_file += this_chunk_blocks; 104162306a36Sopenharmony_ci from += this_chunk_bytes; 104262306a36Sopenharmony_ci dio->result += this_chunk_bytes; 104362306a36Sopenharmony_ci sdio->blocks_available -= this_chunk_blocks; 104462306a36Sopenharmony_cinext_block: 104562306a36Sopenharmony_ci BUG_ON(sdio->block_in_file > sdio->final_block_in_request); 104662306a36Sopenharmony_ci if (sdio->block_in_file == sdio->final_block_in_request) 104762306a36Sopenharmony_ci break; 104862306a36Sopenharmony_ci } 104962306a36Sopenharmony_ci 105062306a36Sopenharmony_ci /* Drop the pin which was taken in get_user_pages() */ 105162306a36Sopenharmony_ci dio_unpin_page(dio, page); 105262306a36Sopenharmony_ci } 105362306a36Sopenharmony_ciout: 105462306a36Sopenharmony_ci return ret; 105562306a36Sopenharmony_ci} 105662306a36Sopenharmony_ci 105762306a36Sopenharmony_cistatic inline int drop_refcount(struct dio *dio) 105862306a36Sopenharmony_ci{ 105962306a36Sopenharmony_ci int ret2; 106062306a36Sopenharmony_ci unsigned long flags; 106162306a36Sopenharmony_ci 106262306a36Sopenharmony_ci /* 106362306a36Sopenharmony_ci * Sync will always be dropping the final ref and completing the 106462306a36Sopenharmony_ci * operation. AIO can if it was a broken operation described above or 106562306a36Sopenharmony_ci * in fact if all the bios race to complete before we get here. In 106662306a36Sopenharmony_ci * that case dio_complete() translates the EIOCBQUEUED into the proper 106762306a36Sopenharmony_ci * return code that the caller will hand to ->complete(). 106862306a36Sopenharmony_ci * 106962306a36Sopenharmony_ci * This is managed by the bio_lock instead of being an atomic_t so that 107062306a36Sopenharmony_ci * completion paths can drop their ref and use the remaining count to 107162306a36Sopenharmony_ci * decide to wake the submission path atomically. 
107262306a36Sopenharmony_ci */ 107362306a36Sopenharmony_ci spin_lock_irqsave(&dio->bio_lock, flags); 107462306a36Sopenharmony_ci ret2 = --dio->refcount; 107562306a36Sopenharmony_ci spin_unlock_irqrestore(&dio->bio_lock, flags); 107662306a36Sopenharmony_ci return ret2; 107762306a36Sopenharmony_ci} 107862306a36Sopenharmony_ci 107962306a36Sopenharmony_ci/* 108062306a36Sopenharmony_ci * This is a library function for use by filesystem drivers. 108162306a36Sopenharmony_ci * 108262306a36Sopenharmony_ci * The locking rules are governed by the flags parameter: 108362306a36Sopenharmony_ci * - if the flags value contains DIO_LOCKING we use a fancy locking 108462306a36Sopenharmony_ci * scheme for dumb filesystems. 108562306a36Sopenharmony_ci * For writes this function is called under i_mutex and returns with 108662306a36Sopenharmony_ci * i_mutex held, for reads, i_mutex is not held on entry, but it is 108762306a36Sopenharmony_ci * taken and dropped again before returning. 108862306a36Sopenharmony_ci * - if the flags value does NOT contain DIO_LOCKING we don't use any 108962306a36Sopenharmony_ci * internal locking but rather rely on the filesystem to synchronize 109062306a36Sopenharmony_ci * direct I/O reads/writes versus each other and truncate. 109162306a36Sopenharmony_ci * 109262306a36Sopenharmony_ci * To help with locking against truncate we incremented the i_dio_count 109362306a36Sopenharmony_ci * counter before starting direct I/O, and decrement it once we are done. 109462306a36Sopenharmony_ci * Truncate can wait for it to reach zero to provide exclusion. It is 109562306a36Sopenharmony_ci * expected that filesystem provide exclusion between new direct I/O 109662306a36Sopenharmony_ci * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, 109762306a36Sopenharmony_ci * but other filesystems need to take care of this on their own. 
109862306a36Sopenharmony_ci * 109962306a36Sopenharmony_ci * NOTE: if you pass "sdio" to anything by pointer make sure that function 110062306a36Sopenharmony_ci * is always inlined. Otherwise gcc is unable to split the structure into 110162306a36Sopenharmony_ci * individual fields and will generate much worse code. This is important 110262306a36Sopenharmony_ci * for the whole file. 110362306a36Sopenharmony_ci */ 110462306a36Sopenharmony_cissize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, 110562306a36Sopenharmony_ci struct block_device *bdev, struct iov_iter *iter, 110662306a36Sopenharmony_ci get_block_t get_block, dio_iodone_t end_io, 110762306a36Sopenharmony_ci int flags) 110862306a36Sopenharmony_ci{ 110962306a36Sopenharmony_ci unsigned i_blkbits = READ_ONCE(inode->i_blkbits); 111062306a36Sopenharmony_ci unsigned blkbits = i_blkbits; 111162306a36Sopenharmony_ci unsigned blocksize_mask = (1 << blkbits) - 1; 111262306a36Sopenharmony_ci ssize_t retval = -EINVAL; 111362306a36Sopenharmony_ci const size_t count = iov_iter_count(iter); 111462306a36Sopenharmony_ci loff_t offset = iocb->ki_pos; 111562306a36Sopenharmony_ci const loff_t end = offset + count; 111662306a36Sopenharmony_ci struct dio *dio; 111762306a36Sopenharmony_ci struct dio_submit sdio = { 0, }; 111862306a36Sopenharmony_ci struct buffer_head map_bh = { 0, }; 111962306a36Sopenharmony_ci struct blk_plug plug; 112062306a36Sopenharmony_ci unsigned long align = offset | iov_iter_alignment(iter); 112162306a36Sopenharmony_ci 112262306a36Sopenharmony_ci /* 112362306a36Sopenharmony_ci * Avoid references to bdev if not absolutely needed to give 112462306a36Sopenharmony_ci * the early prefetch in the caller enough time. 
112562306a36Sopenharmony_ci */ 112662306a36Sopenharmony_ci 112762306a36Sopenharmony_ci /* watch out for a 0 len io from a tricksy fs */ 112862306a36Sopenharmony_ci if (iov_iter_rw(iter) == READ && !count) 112962306a36Sopenharmony_ci return 0; 113062306a36Sopenharmony_ci 113162306a36Sopenharmony_ci dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); 113262306a36Sopenharmony_ci if (!dio) 113362306a36Sopenharmony_ci return -ENOMEM; 113462306a36Sopenharmony_ci /* 113562306a36Sopenharmony_ci * Believe it or not, zeroing out the page array caused a .5% 113662306a36Sopenharmony_ci * performance regression in a database benchmark. So, we take 113762306a36Sopenharmony_ci * care to only zero out what's needed. 113862306a36Sopenharmony_ci */ 113962306a36Sopenharmony_ci memset(dio, 0, offsetof(struct dio, pages)); 114062306a36Sopenharmony_ci 114162306a36Sopenharmony_ci dio->flags = flags; 114262306a36Sopenharmony_ci if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { 114362306a36Sopenharmony_ci /* will be released by direct_io_worker */ 114462306a36Sopenharmony_ci inode_lock(inode); 114562306a36Sopenharmony_ci } 114662306a36Sopenharmony_ci dio->is_pinned = iov_iter_extract_will_pin(iter); 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci /* Once we sampled i_size check for reads beyond EOF */ 114962306a36Sopenharmony_ci dio->i_size = i_size_read(inode); 115062306a36Sopenharmony_ci if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { 115162306a36Sopenharmony_ci retval = 0; 115262306a36Sopenharmony_ci goto fail_dio; 115362306a36Sopenharmony_ci } 115462306a36Sopenharmony_ci 115562306a36Sopenharmony_ci if (align & blocksize_mask) { 115662306a36Sopenharmony_ci if (bdev) 115762306a36Sopenharmony_ci blkbits = blksize_bits(bdev_logical_block_size(bdev)); 115862306a36Sopenharmony_ci blocksize_mask = (1 << blkbits) - 1; 115962306a36Sopenharmony_ci if (align & blocksize_mask) 116062306a36Sopenharmony_ci goto fail_dio; 116162306a36Sopenharmony_ci } 
116262306a36Sopenharmony_ci 116362306a36Sopenharmony_ci if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) { 116462306a36Sopenharmony_ci struct address_space *mapping = iocb->ki_filp->f_mapping; 116562306a36Sopenharmony_ci 116662306a36Sopenharmony_ci retval = filemap_write_and_wait_range(mapping, offset, end - 1); 116762306a36Sopenharmony_ci if (retval) 116862306a36Sopenharmony_ci goto fail_dio; 116962306a36Sopenharmony_ci } 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_ci /* 117262306a36Sopenharmony_ci * For file extending writes updating i_size before data writeouts 117362306a36Sopenharmony_ci * complete can expose uninitialized blocks in dumb filesystems. 117462306a36Sopenharmony_ci * In that case we need to wait for I/O completion even if asked 117562306a36Sopenharmony_ci * for an asynchronous write. 117662306a36Sopenharmony_ci */ 117762306a36Sopenharmony_ci if (is_sync_kiocb(iocb)) 117862306a36Sopenharmony_ci dio->is_async = false; 117962306a36Sopenharmony_ci else if (iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) 118062306a36Sopenharmony_ci dio->is_async = false; 118162306a36Sopenharmony_ci else 118262306a36Sopenharmony_ci dio->is_async = true; 118362306a36Sopenharmony_ci 118462306a36Sopenharmony_ci dio->inode = inode; 118562306a36Sopenharmony_ci if (iov_iter_rw(iter) == WRITE) { 118662306a36Sopenharmony_ci dio->opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; 118762306a36Sopenharmony_ci if (iocb->ki_flags & IOCB_NOWAIT) 118862306a36Sopenharmony_ci dio->opf |= REQ_NOWAIT; 118962306a36Sopenharmony_ci } else { 119062306a36Sopenharmony_ci dio->opf = REQ_OP_READ; 119162306a36Sopenharmony_ci } 119262306a36Sopenharmony_ci 119362306a36Sopenharmony_ci /* 119462306a36Sopenharmony_ci * For AIO O_(D)SYNC writes we need to defer completions to a workqueue 119562306a36Sopenharmony_ci * so that we can call ->fsync. 
119662306a36Sopenharmony_ci */ 119762306a36Sopenharmony_ci if (dio->is_async && iov_iter_rw(iter) == WRITE) { 119862306a36Sopenharmony_ci retval = 0; 119962306a36Sopenharmony_ci if (iocb_is_dsync(iocb)) 120062306a36Sopenharmony_ci retval = dio_set_defer_completion(dio); 120162306a36Sopenharmony_ci else if (!dio->inode->i_sb->s_dio_done_wq) { 120262306a36Sopenharmony_ci /* 120362306a36Sopenharmony_ci * In case of AIO write racing with buffered read we 120462306a36Sopenharmony_ci * need to defer completion. We can't decide this now, 120562306a36Sopenharmony_ci * however the workqueue needs to be initialized here. 120662306a36Sopenharmony_ci */ 120762306a36Sopenharmony_ci retval = sb_init_dio_done_wq(dio->inode->i_sb); 120862306a36Sopenharmony_ci } 120962306a36Sopenharmony_ci if (retval) 121062306a36Sopenharmony_ci goto fail_dio; 121162306a36Sopenharmony_ci } 121262306a36Sopenharmony_ci 121362306a36Sopenharmony_ci /* 121462306a36Sopenharmony_ci * Will be decremented at I/O completion time. 
121562306a36Sopenharmony_ci */ 121662306a36Sopenharmony_ci inode_dio_begin(inode); 121762306a36Sopenharmony_ci 121862306a36Sopenharmony_ci retval = 0; 121962306a36Sopenharmony_ci sdio.blkbits = blkbits; 122062306a36Sopenharmony_ci sdio.blkfactor = i_blkbits - blkbits; 122162306a36Sopenharmony_ci sdio.block_in_file = offset >> blkbits; 122262306a36Sopenharmony_ci 122362306a36Sopenharmony_ci sdio.get_block = get_block; 122462306a36Sopenharmony_ci dio->end_io = end_io; 122562306a36Sopenharmony_ci sdio.final_block_in_bio = -1; 122662306a36Sopenharmony_ci sdio.next_block_for_io = -1; 122762306a36Sopenharmony_ci 122862306a36Sopenharmony_ci dio->iocb = iocb; 122962306a36Sopenharmony_ci 123062306a36Sopenharmony_ci spin_lock_init(&dio->bio_lock); 123162306a36Sopenharmony_ci dio->refcount = 1; 123262306a36Sopenharmony_ci 123362306a36Sopenharmony_ci dio->should_dirty = user_backed_iter(iter) && iov_iter_rw(iter) == READ; 123462306a36Sopenharmony_ci sdio.iter = iter; 123562306a36Sopenharmony_ci sdio.final_block_in_request = end >> blkbits; 123662306a36Sopenharmony_ci 123762306a36Sopenharmony_ci /* 123862306a36Sopenharmony_ci * In case of non-aligned buffers, we may need 2 more 123962306a36Sopenharmony_ci * pages since we need to zero out first and last block. 
124062306a36Sopenharmony_ci */ 124162306a36Sopenharmony_ci if (unlikely(sdio.blkfactor)) 124262306a36Sopenharmony_ci sdio.pages_in_io = 2; 124362306a36Sopenharmony_ci 124462306a36Sopenharmony_ci sdio.pages_in_io += iov_iter_npages(iter, INT_MAX); 124562306a36Sopenharmony_ci 124662306a36Sopenharmony_ci blk_start_plug(&plug); 124762306a36Sopenharmony_ci 124862306a36Sopenharmony_ci retval = do_direct_IO(dio, &sdio, &map_bh); 124962306a36Sopenharmony_ci if (retval) 125062306a36Sopenharmony_ci dio_cleanup(dio, &sdio); 125162306a36Sopenharmony_ci 125262306a36Sopenharmony_ci if (retval == -ENOTBLK) { 125362306a36Sopenharmony_ci /* 125462306a36Sopenharmony_ci * The remaining part of the request will be 125562306a36Sopenharmony_ci * handled by buffered I/O when we return 125662306a36Sopenharmony_ci */ 125762306a36Sopenharmony_ci retval = 0; 125862306a36Sopenharmony_ci } 125962306a36Sopenharmony_ci /* 126062306a36Sopenharmony_ci * There may be some unwritten disk at the end of a part-written 126162306a36Sopenharmony_ci * fs-block-sized block. Go zero that now. 126262306a36Sopenharmony_ci */ 126362306a36Sopenharmony_ci dio_zero_block(dio, &sdio, 1, &map_bh); 126462306a36Sopenharmony_ci 126562306a36Sopenharmony_ci if (sdio.cur_page) { 126662306a36Sopenharmony_ci ssize_t ret2; 126762306a36Sopenharmony_ci 126862306a36Sopenharmony_ci ret2 = dio_send_cur_page(dio, &sdio, &map_bh); 126962306a36Sopenharmony_ci if (retval == 0) 127062306a36Sopenharmony_ci retval = ret2; 127162306a36Sopenharmony_ci dio_unpin_page(dio, sdio.cur_page); 127262306a36Sopenharmony_ci sdio.cur_page = NULL; 127362306a36Sopenharmony_ci } 127462306a36Sopenharmony_ci if (sdio.bio) 127562306a36Sopenharmony_ci dio_bio_submit(dio, &sdio); 127662306a36Sopenharmony_ci 127762306a36Sopenharmony_ci blk_finish_plug(&plug); 127862306a36Sopenharmony_ci 127962306a36Sopenharmony_ci /* 128062306a36Sopenharmony_ci * It is possible that, we return short IO due to end of file. 
128162306a36Sopenharmony_ci * In that case, we need to release all the pages we got hold on. 128262306a36Sopenharmony_ci */ 128362306a36Sopenharmony_ci dio_cleanup(dio, &sdio); 128462306a36Sopenharmony_ci 128562306a36Sopenharmony_ci /* 128662306a36Sopenharmony_ci * All block lookups have been performed. For READ requests 128762306a36Sopenharmony_ci * we can let i_mutex go now that its achieved its purpose 128862306a36Sopenharmony_ci * of protecting us from looking up uninitialized blocks. 128962306a36Sopenharmony_ci */ 129062306a36Sopenharmony_ci if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) 129162306a36Sopenharmony_ci inode_unlock(dio->inode); 129262306a36Sopenharmony_ci 129362306a36Sopenharmony_ci /* 129462306a36Sopenharmony_ci * The only time we want to leave bios in flight is when a successful 129562306a36Sopenharmony_ci * partial aio read or full aio write have been setup. In that case 129662306a36Sopenharmony_ci * bio completion will call aio_complete. The only time it's safe to 129762306a36Sopenharmony_ci * call aio_complete is when we return -EIOCBQUEUED, so we key on that. 129862306a36Sopenharmony_ci * This had *better* be the only place that raises -EIOCBQUEUED. 
129962306a36Sopenharmony_ci */ 130062306a36Sopenharmony_ci BUG_ON(retval == -EIOCBQUEUED); 130162306a36Sopenharmony_ci if (dio->is_async && retval == 0 && dio->result && 130262306a36Sopenharmony_ci (iov_iter_rw(iter) == READ || dio->result == count)) 130362306a36Sopenharmony_ci retval = -EIOCBQUEUED; 130462306a36Sopenharmony_ci else 130562306a36Sopenharmony_ci dio_await_completion(dio); 130662306a36Sopenharmony_ci 130762306a36Sopenharmony_ci if (drop_refcount(dio) == 0) { 130862306a36Sopenharmony_ci retval = dio_complete(dio, retval, DIO_COMPLETE_INVALIDATE); 130962306a36Sopenharmony_ci } else 131062306a36Sopenharmony_ci BUG_ON(retval != -EIOCBQUEUED); 131162306a36Sopenharmony_ci 131262306a36Sopenharmony_ci return retval; 131362306a36Sopenharmony_ci 131462306a36Sopenharmony_cifail_dio: 131562306a36Sopenharmony_ci if (dio->flags & DIO_LOCKING && iov_iter_rw(iter) == READ) 131662306a36Sopenharmony_ci inode_unlock(inode); 131762306a36Sopenharmony_ci 131862306a36Sopenharmony_ci kmem_cache_free(dio_cache, dio); 131962306a36Sopenharmony_ci return retval; 132062306a36Sopenharmony_ci} 132162306a36Sopenharmony_ciEXPORT_SYMBOL(__blockdev_direct_IO); 132262306a36Sopenharmony_ci /* dio_init() - create the slab cache that backs every struct dio. It is registered via module_init() right after its definition. It builds the cache with KMEM_CACHE(dio, ...) and stores it in dio_cache, the same cache this file uses for kmem_cache_alloc() and kmem_cache_free() of struct dio. Always returns 0: the result of cache creation is not checked here. NOTE(review): SLAB_PANIC presumably makes a failed cache creation fatal inside the allocator, which would explain the missing NULL check - confirm against the slab API documentation. */ 132362306a36Sopenharmony_cistatic __init int dio_init(void) 132462306a36Sopenharmony_ci{ 132562306a36Sopenharmony_ci dio_cache = KMEM_CACHE(dio, SLAB_PANIC); 132662306a36Sopenharmony_ci return 0; 132762306a36Sopenharmony_ci} 132862306a36Sopenharmony_cimodule_init(dio_init) 1329