// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include <linux/fsverity.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
#include "ioctl.h"
#include "file.h"
#include "super.h"

/*
 * Simple helper to copy data from an iov_iter into already prepared pages.
 * This should go away and be replaced with calls into generic code.
 *
 * @pos:            file position of the write; only its offset within a page
 *                  is used, as the starting offset in prepared_pages[0]
 * @write_bytes:    number of bytes to copy
 * @prepared_pages: destination pages, filled in order
 * @i:              source iterator
 *
 * Returns the number of bytes copied, which may be less than @write_bytes
 * (including 0) if a copy came up short.
 */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = offset_in_page(pos);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 */
		copied = copy_page_from_iter_atomic(page, offset, count, i);

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * If we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried: for a page that
		 * is not uptodate the iterator is rewound and the short copy
		 * is counted as zero bytes.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (unlikely(copied < count)) {
			if (!PageUptodate(page)) {
				iov_iter_revert(i, copied);
				copied = 0;
			}
			if (!copied)
				break;
		}

		write_bytes -= copied;
		total_copied += copied;
		offset += copied;
		/* Current page is full, continue at the start of the next one */
		if (offset == PAGE_SIZE) {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * Unlocks pages after btrfs_file_write is done with them and drops the
 * reference held on each of them.
 *
 * The checked state is cleared over the sector aligned range that covers
 * [@pos, @pos + @copied).
 */
static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
			     struct page **pages, size_t num_pages,
			     u64 pos, u64 copied)
{
	size_t i;
	u64 block_start = round_down(pos, fs_info->sectorsize);
	u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;

	ASSERT(block_len <= U32_MAX);
	for (i = 0; i < num_pages; i++) {
		/*
		 * Page checked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here.  There should be no
		 * need to mark the pages accessed, as prepare_pages should
		 * have marked them accessed via find_or_create_page().
		 */
		btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
					       block_len);
		unlock_page(pages[i]);
		put_page(pages[i]);
	}
}

/*
 * After btrfs_copy_from_user(), update the following things for delalloc:
 * - Mark newly dirtied pages as DELALLOC in the io tree.
 *   Used to advise which range is to be written back.
 * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
 * - Update inode size for past EOF write
 *
 * The range handled is [@pos, @pos + @write_bytes) expanded to sector
 * boundaries.  If @noreserve is true, EXTENT_NORESERVE is passed along with
 * the delalloc bits.
 *
 * Returns 0 on success (also when @write_bytes is 0, in which case nothing is
 * done), or the error returned by btrfs_set_extent_delalloc().
 */
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
		      struct extent_state **cached, bool noreserve)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

	if (write_bytes == 0)
		return 0;

	if (noreserve)
		extra_bits |= EXTENT_NORESERVE;

	start_pos = round_down(pos, fs_info->sectorsize);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);
	ASSERT(num_bytes <= U32_MAX);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 cached);

	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];

		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
		btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}

/*
 * This is very complex, but the basic idea is to drop all file extent items
 * of the inode in the range [args->start, args->end).
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.  An inline extent can only be dropped as a whole;
 * if the range covers it only partially, -EOPNOTSUPP is returned.
 *
 * Fields of @args read here: start, end, path, drop_cache, replace_extent and
 * extent_item_size.  Fields written here:
 * - bytes_found:     reset to 0, then increased as extents are dropped
 * - extent_inserted: set to true if, for ->replace_extent, room for a new
 *                    item of ->extent_item_size with offset ->start was
 *                    reserved in the leaf; the path is then left pointing at
 *                    it instead of being released
 * - drop_end:        min(args->end, end of the last extent processed) if an
 *                    overlapping extent was found, args->end otherwise
 *
 * If args->path is NULL a path is allocated and freed internally.
 *
 * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 * to deal with that. We set the field 'bytes_found' of the arguments structure
 * with the number of allocated bytes found in the target range, so that the
 * caller can update the inode's number of bytes in an atomic way when
 * replacing extents in a range to avoid races with stat(2).
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode,
		       struct btrfs_drop_extents_args *args)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 ino = btrfs_ino(inode);
	u64 search_start = args->start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = args->start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	struct btrfs_path *path = args->path;

	args->bytes_found = 0;
	args->extent_inserted = false;

	/* Must always have a path if ->replace_extent is true */
	ASSERT(!(args->replace_extent && !args->path));

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	if (args->drop_cache)
		btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);

	/*
	 * Start with modify_tree == 0 when the range begins at or beyond
	 * disk_i_size and nothing is going to be inserted.  It is switched
	 * back to -1, and the search restarted, as soon as an overlapping
	 * extent is found (see the 'recow || !modify_tree' check below).
	 */
	if (args->start >= inode->disk_i_size && !args->replace_extent)
		modify_tree = 0;

	/* No extent reference updates or byte accounting for the log tree */
	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		/*
		 * No exact match on the first search: step back to the
		 * previous item if it is a file extent item of this inode.
		 */
		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			/* Moved to another leaf, force a new search before modifying */
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to btrfs_setup_item_for_insert()
		 * later in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		/* Extent ends before the range we care about, skip it */
		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, args->start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 *
		 * Split the extent in two at args->start.  The second half is
		 * then trimmed at its front by the next case below.
		 */
		if (args->start > key.offset && args->end < extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			/* First half: the original item, shortened */
			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);

			/* Second half: the duplicate, starting at args->start */
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += args->start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->start);
			btrfs_mark_buffer_dirty(trans, leaf);

			/* Two items now point at the same disk extent */
			if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						new_key.objectid,
						args->start - extent_offset,
						0, false);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
			}
			key.offset = args->start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 *
		 * Trim the front of the extent: move its key to args->end.
		 */
		if (args->start <= key.offset && args->end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = args->end;
			btrfs_set_item_key_safe(trans, path, &new_key);

			extent_offset += args->end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - args->end);
			btrfs_mark_buffer_dirty(trans, leaf);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += args->end - key.offset;
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 *
		 * Trim the tail of the extent so that it ends at args->start.
		 */
		if (args->start > key.offset && args->end >= extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							args->start - key.offset);
			btrfs_mark_buffer_dirty(trans, leaf);
			if (update_refs && disk_bytenr > 0)
				args->bytes_found += extent_end - args->start;
			if (args->end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 *
		 * The whole extent is inside the range.  Consecutive slots to
		 * delete are batched in [del_slot, del_slot + del_nr).
		 */
		if (args->start <= key.offset && args->end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				args->bytes_found += extent_end - key.offset;
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_DROP_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key.objectid,
						key.offset - extent_offset, 0,
						false);
				ret = btrfs_free_extent(trans, &ref);
				if (ret) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				args->bytes_found += extent_end - key.offset;
			}

			if (args->end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			/* Last slot of the leaf: flush the batch and search again */
			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are moved off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
		 * path->slots[0] for our insertion (if args->replace_extent).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && args->replace_extent &&
	    path->locks[0] == BTRFS_WRITE_LOCK &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + args->extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = args->start;
		/*
		 * Nothing was deleted at the end: if the item at the current
		 * slot sorts before the new key, insert after it.
		 */
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		btrfs_setup_item_for_insert(trans, root, path, &key,
					    args->extent_item_size);
		args->extent_inserted = true;
	}

	/* A caller supplied path stays held only if an item was reserved */
	if (!args->path)
		btrfs_free_path(path);
	else if (!args->extent_inserted)
		btrfs_release_path(path);
out:
	args->drop_end = found ? min(args->end, last_end) : args->end;

	return ret;
}

/*
 * Check whether the item at @slot of @leaf is a file extent item that can be
 * merged with a neighbouring range.
 *
 * The item must belong to @objectid, be a regular (BTRFS_FILE_EXTENT_REG)
 * extent without compression, encryption or other encoding, point at disk
 * extent @bytenr and have an extent offset equal to its key offset minus
 * @orig_offset.
 *
 * @start and @end are in/out: a non-zero *@start or *@end must match the
 * item's start or end respectively, while a zero value matches anything.  On
 * success both are set to the item's actual start and end.
 *
 * Returns 1 if the item is mergeable, 0 otherwise (including when @slot is
 * outside of the leaf).
 */
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'.  If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (key.offset > start || extent_end < end) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(trans, leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
69762306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, 69862306a36Sopenharmony_ci trans->transid); 69962306a36Sopenharmony_ci path->slots[0]++; 70062306a36Sopenharmony_ci new_key.offset = start; 70162306a36Sopenharmony_ci btrfs_set_item_key_safe(trans, path, &new_key); 70262306a36Sopenharmony_ci 70362306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, path->slots[0], 70462306a36Sopenharmony_ci struct btrfs_file_extent_item); 70562306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, 70662306a36Sopenharmony_ci trans->transid); 70762306a36Sopenharmony_ci btrfs_set_file_extent_num_bytes(leaf, fi, 70862306a36Sopenharmony_ci other_end - start); 70962306a36Sopenharmony_ci btrfs_set_file_extent_offset(leaf, fi, 71062306a36Sopenharmony_ci start - orig_offset); 71162306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 71262306a36Sopenharmony_ci goto out; 71362306a36Sopenharmony_ci } 71462306a36Sopenharmony_ci } 71562306a36Sopenharmony_ci 71662306a36Sopenharmony_ci while (start > key.offset || end < extent_end) { 71762306a36Sopenharmony_ci if (key.offset == start) 71862306a36Sopenharmony_ci split = end; 71962306a36Sopenharmony_ci 72062306a36Sopenharmony_ci new_key.offset = split; 72162306a36Sopenharmony_ci ret = btrfs_duplicate_item(trans, root, path, &new_key); 72262306a36Sopenharmony_ci if (ret == -EAGAIN) { 72362306a36Sopenharmony_ci btrfs_release_path(path); 72462306a36Sopenharmony_ci goto again; 72562306a36Sopenharmony_ci } 72662306a36Sopenharmony_ci if (ret < 0) { 72762306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 72862306a36Sopenharmony_ci goto out; 72962306a36Sopenharmony_ci } 73062306a36Sopenharmony_ci 73162306a36Sopenharmony_ci leaf = path->nodes[0]; 73262306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, path->slots[0] - 1, 73362306a36Sopenharmony_ci struct btrfs_file_extent_item); 73462306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, trans->transid); 73562306a36Sopenharmony_ci 
btrfs_set_file_extent_num_bytes(leaf, fi, 73662306a36Sopenharmony_ci split - key.offset); 73762306a36Sopenharmony_ci 73862306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, path->slots[0], 73962306a36Sopenharmony_ci struct btrfs_file_extent_item); 74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, trans->transid); 74262306a36Sopenharmony_ci btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); 74362306a36Sopenharmony_ci btrfs_set_file_extent_num_bytes(leaf, fi, 74462306a36Sopenharmony_ci extent_end - split); 74562306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 74662306a36Sopenharmony_ci 74762306a36Sopenharmony_ci btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, 74862306a36Sopenharmony_ci num_bytes, 0); 74962306a36Sopenharmony_ci btrfs_init_data_ref(&ref, root->root_key.objectid, ino, 75062306a36Sopenharmony_ci orig_offset, 0, false); 75162306a36Sopenharmony_ci ret = btrfs_inc_extent_ref(trans, &ref); 75262306a36Sopenharmony_ci if (ret) { 75362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 75462306a36Sopenharmony_ci goto out; 75562306a36Sopenharmony_ci } 75662306a36Sopenharmony_ci 75762306a36Sopenharmony_ci if (split == start) { 75862306a36Sopenharmony_ci key.offset = start; 75962306a36Sopenharmony_ci } else { 76062306a36Sopenharmony_ci if (start != key.offset) { 76162306a36Sopenharmony_ci ret = -EINVAL; 76262306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 76362306a36Sopenharmony_ci goto out; 76462306a36Sopenharmony_ci } 76562306a36Sopenharmony_ci path->slots[0]--; 76662306a36Sopenharmony_ci extent_end = end; 76762306a36Sopenharmony_ci } 76862306a36Sopenharmony_ci recow = 1; 76962306a36Sopenharmony_ci } 77062306a36Sopenharmony_ci 77162306a36Sopenharmony_ci other_start = end; 77262306a36Sopenharmony_ci other_end = 0; 77362306a36Sopenharmony_ci btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, 77462306a36Sopenharmony_ci num_bytes, 0); 77562306a36Sopenharmony_ci 
btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 77662306a36Sopenharmony_ci 0, false); 77762306a36Sopenharmony_ci if (extent_mergeable(leaf, path->slots[0] + 1, 77862306a36Sopenharmony_ci ino, bytenr, orig_offset, 77962306a36Sopenharmony_ci &other_start, &other_end)) { 78062306a36Sopenharmony_ci if (recow) { 78162306a36Sopenharmony_ci btrfs_release_path(path); 78262306a36Sopenharmony_ci goto again; 78362306a36Sopenharmony_ci } 78462306a36Sopenharmony_ci extent_end = other_end; 78562306a36Sopenharmony_ci del_slot = path->slots[0] + 1; 78662306a36Sopenharmony_ci del_nr++; 78762306a36Sopenharmony_ci ret = btrfs_free_extent(trans, &ref); 78862306a36Sopenharmony_ci if (ret) { 78962306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 79062306a36Sopenharmony_ci goto out; 79162306a36Sopenharmony_ci } 79262306a36Sopenharmony_ci } 79362306a36Sopenharmony_ci other_start = 0; 79462306a36Sopenharmony_ci other_end = start; 79562306a36Sopenharmony_ci if (extent_mergeable(leaf, path->slots[0] - 1, 79662306a36Sopenharmony_ci ino, bytenr, orig_offset, 79762306a36Sopenharmony_ci &other_start, &other_end)) { 79862306a36Sopenharmony_ci if (recow) { 79962306a36Sopenharmony_ci btrfs_release_path(path); 80062306a36Sopenharmony_ci goto again; 80162306a36Sopenharmony_ci } 80262306a36Sopenharmony_ci key.offset = other_start; 80362306a36Sopenharmony_ci del_slot = path->slots[0]; 80462306a36Sopenharmony_ci del_nr++; 80562306a36Sopenharmony_ci ret = btrfs_free_extent(trans, &ref); 80662306a36Sopenharmony_ci if (ret) { 80762306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 80862306a36Sopenharmony_ci goto out; 80962306a36Sopenharmony_ci } 81062306a36Sopenharmony_ci } 81162306a36Sopenharmony_ci if (del_nr == 0) { 81262306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, path->slots[0], 81362306a36Sopenharmony_ci struct btrfs_file_extent_item); 81462306a36Sopenharmony_ci btrfs_set_file_extent_type(leaf, fi, 81562306a36Sopenharmony_ci BTRFS_FILE_EXTENT_REG); 
81662306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, trans->transid); 81762306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 81862306a36Sopenharmony_ci } else { 81962306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, del_slot - 1, 82062306a36Sopenharmony_ci struct btrfs_file_extent_item); 82162306a36Sopenharmony_ci btrfs_set_file_extent_type(leaf, fi, 82262306a36Sopenharmony_ci BTRFS_FILE_EXTENT_REG); 82362306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, trans->transid); 82462306a36Sopenharmony_ci btrfs_set_file_extent_num_bytes(leaf, fi, 82562306a36Sopenharmony_ci extent_end - key.offset); 82662306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 82762306a36Sopenharmony_ci 82862306a36Sopenharmony_ci ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 82962306a36Sopenharmony_ci if (ret < 0) { 83062306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 83162306a36Sopenharmony_ci goto out; 83262306a36Sopenharmony_ci } 83362306a36Sopenharmony_ci } 83462306a36Sopenharmony_ciout: 83562306a36Sopenharmony_ci btrfs_free_path(path); 83662306a36Sopenharmony_ci return ret; 83762306a36Sopenharmony_ci} 83862306a36Sopenharmony_ci
/*
 * Make sure @page holds valid data before a partial overwrite.
 *
 * A read is only issued when the page is not already up to date and either
 * @pos is not page aligned or @force_uptodate is set.
 *
 * Returns 0 with the page locked on success.  On failure the page is left
 * unlocked and a negative errno is returned (-EIO if the read did not bring
 * the page up to date, -EAGAIN if the page was released meanwhile).
 */
static int prepare_uptodate_page(struct inode *inode,
				 struct page *page, u64 pos,
				 bool force_uptodate)
{
	int err;

	/* Aligned position and no forced read: nothing to fill in. */
	if (!(pos & (PAGE_SIZE - 1)) && !force_uptodate)
		return 0;
	if (PageUptodate(page))
		return 0;

	err = btrfs_read_folio(NULL, page_folio(page));
	if (err)
		return err;

	lock_page(page);
	if (!PageUptodate(page)) {
		unlock_page(page);
		return -EIO;
	}

	/*
	 * btrfs_read_folio() unlocks the folio before returning, which opens
	 * a window for btrfs_release_folio() to release the page.  Verify
	 * both the mapping and PagePrivate() to detect that.
	 *
	 * Checking the private flag matters for subpage, where an extra
	 * bitmap is kept in page->private.
	 */
	if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
		unlock_page(page);
		return -EAGAIN;
	}

	return 0;
}

/*
 * Page cache lookup flags used by prepare_pages(): lock, mark accessed and
 * create the page if missing, plus FGP_NOWAIT for non-blocking writes.
 */
static fgf_t get_prepare_fgp_flags(bool nowait)
{
	const fgf_t base = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;

	return nowait ? (base | FGP_NOWAIT) : base;
}

/*
 * Allocation mask used by prepare_pages().  Starts from the mapping's write
 * mask; for non-blocking writes direct reclaim is cleared and GFP_NOWAIT is
 * added.
 */
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
{
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);

	if (!nowait)
		return mask;

	mask &= ~__GFP_DIRECT_RECLAIM;
	mask |= GFP_NOWAIT;
	return mask;
}

/*
 * Get @num_pages pages starting at the page containing @pos into the page
 * cache and lock them down, storing them in @pages.
 *
 * The first page is brought up to date for @pos (or unconditionally when
 * @force_uptodate is set) and the last page for @pos + @write_bytes.
 *
 * Returns 0 on success.  On failure every page obtained so far is unlocked
 * and released and a negative errno is returned; a failed page lookup yields
 * -EAGAIN when @nowait is set and -ENOMEM otherwise.
 */
static noinline int prepare_pages(struct inode *inode, struct page **pages,
				  size_t num_pages, loff_t pos,
				  size_t write_bytes, bool force_uptodate,
				  bool nowait)
{
	const unsigned long first_index = pos >> PAGE_SHIFT;
	const gfp_t gfp = get_prepare_gfp_flags(inode, nowait);
	const fgf_t fgp = get_prepare_fgp_flags(nowait);
	int ret = 0;
	int last_held;
	int idx;

	for (idx = 0; idx < num_pages; idx++) {
retry:
		pages[idx] = pagecache_get_page(inode->i_mapping,
						first_index + idx, fgp,
						gfp | __GFP_WRITE);
		if (!pages[idx]) {
			ret = nowait ? -EAGAIN : -ENOMEM;
			last_held = idx - 1;
			goto release;
		}

		ret = set_page_extent_mapped(pages[idx]);
		if (ret < 0) {
			/* The current page is still held, release it too. */
			last_held = idx;
			goto release;
		}

		if (idx == 0)
			ret = prepare_uptodate_page(inode, pages[idx], pos,
						    force_uptodate);
		if (!ret && idx == num_pages - 1)
			ret = prepare_uptodate_page(inode, pages[idx],
						    pos + write_bytes, false);
		if (ret) {
			/* prepare_uptodate_page() already unlocked the page. */
			put_page(pages[idx]);
			if (!nowait && ret == -EAGAIN) {
				ret = 0;
				goto retry;
			}
			last_held = idx - 1;
			goto release;
		}
		wait_on_page_writeback(pages[idx]);
	}

	return 0;

release:
	for (; last_held >= 0; last_held--) {
		unlock_page(pages[last_held]);
		put_page(pages[last_held]);
	}
	return ret;
}
 96462306a36Sopenharmony_ci 96562306a36Sopenharmony_ci/* 96662306a36Sopenharmony_ci * This function locks the extent and properly waits for data=ordered extents 96762306a36Sopenharmony_ci * to finish before allowing the pages to be modified if need.
96862306a36Sopenharmony_ci * 96962306a36Sopenharmony_ci * The return value: 97062306a36Sopenharmony_ci * 1 - the extent is locked 97162306a36Sopenharmony_ci * 0 - the extent is not locked, and everything is OK 97262306a36Sopenharmony_ci * -EAGAIN - need re-prepare the pages 97362306a36Sopenharmony_ci * the other < 0 number - Something wrong happens 97462306a36Sopenharmony_ci */ 97562306a36Sopenharmony_cistatic noinline int 97662306a36Sopenharmony_cilock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, 97762306a36Sopenharmony_ci size_t num_pages, loff_t pos, 97862306a36Sopenharmony_ci size_t write_bytes, 97962306a36Sopenharmony_ci u64 *lockstart, u64 *lockend, bool nowait, 98062306a36Sopenharmony_ci struct extent_state **cached_state) 98162306a36Sopenharmony_ci{ 98262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 98362306a36Sopenharmony_ci u64 start_pos; 98462306a36Sopenharmony_ci u64 last_pos; 98562306a36Sopenharmony_ci int i; 98662306a36Sopenharmony_ci int ret = 0; 98762306a36Sopenharmony_ci 98862306a36Sopenharmony_ci start_pos = round_down(pos, fs_info->sectorsize); 98962306a36Sopenharmony_ci last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; 99062306a36Sopenharmony_ci 99162306a36Sopenharmony_ci if (start_pos < inode->vfs_inode.i_size) { 99262306a36Sopenharmony_ci struct btrfs_ordered_extent *ordered; 99362306a36Sopenharmony_ci 99462306a36Sopenharmony_ci if (nowait) { 99562306a36Sopenharmony_ci if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, 99662306a36Sopenharmony_ci cached_state)) { 99762306a36Sopenharmony_ci for (i = 0; i < num_pages; i++) { 99862306a36Sopenharmony_ci unlock_page(pages[i]); 99962306a36Sopenharmony_ci put_page(pages[i]); 100062306a36Sopenharmony_ci pages[i] = NULL; 100162306a36Sopenharmony_ci } 100262306a36Sopenharmony_ci 100362306a36Sopenharmony_ci return -EAGAIN; 100462306a36Sopenharmony_ci } 100562306a36Sopenharmony_ci } else { 100662306a36Sopenharmony_ci 
lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); 100762306a36Sopenharmony_ci } 100862306a36Sopenharmony_ci 100962306a36Sopenharmony_ci ordered = btrfs_lookup_ordered_range(inode, start_pos, 101062306a36Sopenharmony_ci last_pos - start_pos + 1); 101162306a36Sopenharmony_ci if (ordered && 101262306a36Sopenharmony_ci ordered->file_offset + ordered->num_bytes > start_pos && 101362306a36Sopenharmony_ci ordered->file_offset <= last_pos) { 101462306a36Sopenharmony_ci unlock_extent(&inode->io_tree, start_pos, last_pos, 101562306a36Sopenharmony_ci cached_state); 101662306a36Sopenharmony_ci for (i = 0; i < num_pages; i++) { 101762306a36Sopenharmony_ci unlock_page(pages[i]); 101862306a36Sopenharmony_ci put_page(pages[i]); 101962306a36Sopenharmony_ci } 102062306a36Sopenharmony_ci btrfs_start_ordered_extent(ordered); 102162306a36Sopenharmony_ci btrfs_put_ordered_extent(ordered); 102262306a36Sopenharmony_ci return -EAGAIN; 102362306a36Sopenharmony_ci } 102462306a36Sopenharmony_ci if (ordered) 102562306a36Sopenharmony_ci btrfs_put_ordered_extent(ordered); 102662306a36Sopenharmony_ci 102762306a36Sopenharmony_ci *lockstart = start_pos; 102862306a36Sopenharmony_ci *lockend = last_pos; 102962306a36Sopenharmony_ci ret = 1; 103062306a36Sopenharmony_ci } 103162306a36Sopenharmony_ci 103262306a36Sopenharmony_ci /* 103362306a36Sopenharmony_ci * We should be called after prepare_pages() which should have locked 103462306a36Sopenharmony_ci * all pages in the range. 103562306a36Sopenharmony_ci */ 103662306a36Sopenharmony_ci for (i = 0; i < num_pages; i++) 103762306a36Sopenharmony_ci WARN_ON(!PageLocked(pages[i])); 103862306a36Sopenharmony_ci 103962306a36Sopenharmony_ci return ret; 104062306a36Sopenharmony_ci} 104162306a36Sopenharmony_ci 104262306a36Sopenharmony_ci/* 104362306a36Sopenharmony_ci * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) 104462306a36Sopenharmony_ci * 104562306a36Sopenharmony_ci * @pos: File offset. 
104662306a36Sopenharmony_ci * @write_bytes: The length to write, will be updated to the nocow writeable 104762306a36Sopenharmony_ci * range. 104862306a36Sopenharmony_ci * 104962306a36Sopenharmony_ci * This function will flush ordered extents in the range to ensure proper 105062306a36Sopenharmony_ci * nocow checks. 105162306a36Sopenharmony_ci * 105262306a36Sopenharmony_ci * Return: 105362306a36Sopenharmony_ci * > 0 If we can nocow, and updates @write_bytes. 105462306a36Sopenharmony_ci * 0 If we can't do a nocow write. 105562306a36Sopenharmony_ci * -EAGAIN If we can't do a nocow write because snapshoting of the inode's 105662306a36Sopenharmony_ci * root is in progress. 105762306a36Sopenharmony_ci * < 0 If an error happened. 105862306a36Sopenharmony_ci * 105962306a36Sopenharmony_ci * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 106062306a36Sopenharmony_ci */ 106162306a36Sopenharmony_ciint btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, 106262306a36Sopenharmony_ci size_t *write_bytes, bool nowait) 106362306a36Sopenharmony_ci{ 106462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 106562306a36Sopenharmony_ci struct btrfs_root *root = inode->root; 106662306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 106762306a36Sopenharmony_ci u64 lockstart, lockend; 106862306a36Sopenharmony_ci u64 num_bytes; 106962306a36Sopenharmony_ci int ret; 107062306a36Sopenharmony_ci 107162306a36Sopenharmony_ci if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) 107262306a36Sopenharmony_ci return 0; 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) 107562306a36Sopenharmony_ci return -EAGAIN; 107662306a36Sopenharmony_ci 107762306a36Sopenharmony_ci lockstart = round_down(pos, fs_info->sectorsize); 107862306a36Sopenharmony_ci lockend = round_up(pos + *write_bytes, 107962306a36Sopenharmony_ci fs_info->sectorsize) - 1; 
108062306a36Sopenharmony_ci num_bytes = lockend - lockstart + 1; 108162306a36Sopenharmony_ci 108262306a36Sopenharmony_ci if (nowait) { 108362306a36Sopenharmony_ci if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, 108462306a36Sopenharmony_ci &cached_state)) { 108562306a36Sopenharmony_ci btrfs_drew_write_unlock(&root->snapshot_lock); 108662306a36Sopenharmony_ci return -EAGAIN; 108762306a36Sopenharmony_ci } 108862306a36Sopenharmony_ci } else { 108962306a36Sopenharmony_ci btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, 109062306a36Sopenharmony_ci &cached_state); 109162306a36Sopenharmony_ci } 109262306a36Sopenharmony_ci ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, 109362306a36Sopenharmony_ci NULL, NULL, NULL, nowait, false); 109462306a36Sopenharmony_ci if (ret <= 0) 109562306a36Sopenharmony_ci btrfs_drew_write_unlock(&root->snapshot_lock); 109662306a36Sopenharmony_ci else 109762306a36Sopenharmony_ci *write_bytes = min_t(size_t, *write_bytes , 109862306a36Sopenharmony_ci num_bytes - pos + lockstart); 109962306a36Sopenharmony_ci unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); 110062306a36Sopenharmony_ci 110162306a36Sopenharmony_ci return ret; 110262306a36Sopenharmony_ci} 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_civoid btrfs_check_nocow_unlock(struct btrfs_inode *inode) 110562306a36Sopenharmony_ci{ 110662306a36Sopenharmony_ci btrfs_drew_write_unlock(&inode->root->snapshot_lock); 110762306a36Sopenharmony_ci} 110862306a36Sopenharmony_ci 110962306a36Sopenharmony_cistatic void update_time_for_write(struct inode *inode) 111062306a36Sopenharmony_ci{ 111162306a36Sopenharmony_ci struct timespec64 now, ctime; 111262306a36Sopenharmony_ci 111362306a36Sopenharmony_ci if (IS_NOCMTIME(inode)) 111462306a36Sopenharmony_ci return; 111562306a36Sopenharmony_ci 111662306a36Sopenharmony_ci now = current_time(inode); 111762306a36Sopenharmony_ci if (!timespec64_equal(&inode->i_mtime, &now)) 111862306a36Sopenharmony_ci 
inode->i_mtime = now; 111962306a36Sopenharmony_ci 112062306a36Sopenharmony_ci ctime = inode_get_ctime(inode); 112162306a36Sopenharmony_ci if (!timespec64_equal(&ctime, &now)) 112262306a36Sopenharmony_ci inode_set_ctime_to_ts(inode, now); 112362306a36Sopenharmony_ci 112462306a36Sopenharmony_ci if (IS_I_VERSION(inode)) 112562306a36Sopenharmony_ci inode_inc_iversion(inode); 112662306a36Sopenharmony_ci} 112762306a36Sopenharmony_ci 112862306a36Sopenharmony_cistatic int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, 112962306a36Sopenharmony_ci size_t count) 113062306a36Sopenharmony_ci{ 113162306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 113262306a36Sopenharmony_ci struct inode *inode = file_inode(file); 113362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 113462306a36Sopenharmony_ci loff_t pos = iocb->ki_pos; 113562306a36Sopenharmony_ci int ret; 113662306a36Sopenharmony_ci loff_t oldsize; 113762306a36Sopenharmony_ci loff_t start_pos; 113862306a36Sopenharmony_ci 113962306a36Sopenharmony_ci /* 114062306a36Sopenharmony_ci * Quickly bail out on NOWAIT writes if we don't have the nodatacow or 114162306a36Sopenharmony_ci * prealloc flags, as without those flags we always have to COW. We will 114262306a36Sopenharmony_ci * later check if we can really COW into the target range (using 114362306a36Sopenharmony_ci * can_nocow_extent() at btrfs_get_blocks_direct_write()). 
114462306a36Sopenharmony_ci */ 114562306a36Sopenharmony_ci if ((iocb->ki_flags & IOCB_NOWAIT) && 114662306a36Sopenharmony_ci !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) 114762306a36Sopenharmony_ci return -EAGAIN; 114862306a36Sopenharmony_ci 114962306a36Sopenharmony_ci ret = file_remove_privs(file); 115062306a36Sopenharmony_ci if (ret) 115162306a36Sopenharmony_ci return ret; 115262306a36Sopenharmony_ci 115362306a36Sopenharmony_ci /* 115462306a36Sopenharmony_ci * We reserve space for updating the inode when we reserve space for the 115562306a36Sopenharmony_ci * extent we are going to write, so we will enospc out there. We don't 115662306a36Sopenharmony_ci * need to start yet another transaction to update the inode as we will 115762306a36Sopenharmony_ci * update the inode when we finish writing whatever data we write. 115862306a36Sopenharmony_ci */ 115962306a36Sopenharmony_ci update_time_for_write(inode); 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_ci start_pos = round_down(pos, fs_info->sectorsize); 116262306a36Sopenharmony_ci oldsize = i_size_read(inode); 116362306a36Sopenharmony_ci if (start_pos > oldsize) { 116462306a36Sopenharmony_ci /* Expand hole size to cover write data, preventing empty gap */ 116562306a36Sopenharmony_ci loff_t end_pos = round_up(pos + count, fs_info->sectorsize); 116662306a36Sopenharmony_ci 116762306a36Sopenharmony_ci ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); 116862306a36Sopenharmony_ci if (ret) 116962306a36Sopenharmony_ci return ret; 117062306a36Sopenharmony_ci } 117162306a36Sopenharmony_ci 117262306a36Sopenharmony_ci return 0; 117362306a36Sopenharmony_ci} 117462306a36Sopenharmony_ci 117562306a36Sopenharmony_cistatic noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, 117662306a36Sopenharmony_ci struct iov_iter *i) 117762306a36Sopenharmony_ci{ 117862306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 117962306a36Sopenharmony_ci loff_t pos; 118062306a36Sopenharmony_ci struct 
inode *inode = file_inode(file); 118162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 118262306a36Sopenharmony_ci struct page **pages = NULL; 118362306a36Sopenharmony_ci struct extent_changeset *data_reserved = NULL; 118462306a36Sopenharmony_ci u64 release_bytes = 0; 118562306a36Sopenharmony_ci u64 lockstart; 118662306a36Sopenharmony_ci u64 lockend; 118762306a36Sopenharmony_ci size_t num_written = 0; 118862306a36Sopenharmony_ci int nrptrs; 118962306a36Sopenharmony_ci ssize_t ret; 119062306a36Sopenharmony_ci bool only_release_metadata = false; 119162306a36Sopenharmony_ci bool force_page_uptodate = false; 119262306a36Sopenharmony_ci loff_t old_isize = i_size_read(inode); 119362306a36Sopenharmony_ci unsigned int ilock_flags = 0; 119462306a36Sopenharmony_ci const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); 119562306a36Sopenharmony_ci unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); 119662306a36Sopenharmony_ci 119762306a36Sopenharmony_ci if (nowait) 119862306a36Sopenharmony_ci ilock_flags |= BTRFS_ILOCK_TRY; 119962306a36Sopenharmony_ci 120062306a36Sopenharmony_ci ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); 120162306a36Sopenharmony_ci if (ret < 0) 120262306a36Sopenharmony_ci return ret; 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci ret = generic_write_checks(iocb, i); 120562306a36Sopenharmony_ci if (ret <= 0) 120662306a36Sopenharmony_ci goto out; 120762306a36Sopenharmony_ci 120862306a36Sopenharmony_ci ret = btrfs_write_check(iocb, i, ret); 120962306a36Sopenharmony_ci if (ret < 0) 121062306a36Sopenharmony_ci goto out; 121162306a36Sopenharmony_ci 121262306a36Sopenharmony_ci pos = iocb->ki_pos; 121362306a36Sopenharmony_ci nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), 121462306a36Sopenharmony_ci PAGE_SIZE / (sizeof(struct page *))); 121562306a36Sopenharmony_ci nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 121662306a36Sopenharmony_ci nrptrs = max(nrptrs, 8); 121762306a36Sopenharmony_ci 
pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); 121862306a36Sopenharmony_ci if (!pages) { 121962306a36Sopenharmony_ci ret = -ENOMEM; 122062306a36Sopenharmony_ci goto out; 122162306a36Sopenharmony_ci } 122262306a36Sopenharmony_ci 122362306a36Sopenharmony_ci while (iov_iter_count(i) > 0) { 122462306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 122562306a36Sopenharmony_ci size_t offset = offset_in_page(pos); 122662306a36Sopenharmony_ci size_t sector_offset; 122762306a36Sopenharmony_ci size_t write_bytes = min(iov_iter_count(i), 122862306a36Sopenharmony_ci nrptrs * (size_t)PAGE_SIZE - 122962306a36Sopenharmony_ci offset); 123062306a36Sopenharmony_ci size_t num_pages; 123162306a36Sopenharmony_ci size_t reserve_bytes; 123262306a36Sopenharmony_ci size_t dirty_pages; 123362306a36Sopenharmony_ci size_t copied; 123462306a36Sopenharmony_ci size_t dirty_sectors; 123562306a36Sopenharmony_ci size_t num_sectors; 123662306a36Sopenharmony_ci int extents_locked; 123762306a36Sopenharmony_ci 123862306a36Sopenharmony_ci /* 123962306a36Sopenharmony_ci * Fault pages before locking them in prepare_pages 124062306a36Sopenharmony_ci * to avoid recursive lock 124162306a36Sopenharmony_ci */ 124262306a36Sopenharmony_ci if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { 124362306a36Sopenharmony_ci ret = -EFAULT; 124462306a36Sopenharmony_ci break; 124562306a36Sopenharmony_ci } 124662306a36Sopenharmony_ci 124762306a36Sopenharmony_ci only_release_metadata = false; 124862306a36Sopenharmony_ci sector_offset = pos & (fs_info->sectorsize - 1); 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_ci extent_changeset_release(data_reserved); 125162306a36Sopenharmony_ci ret = btrfs_check_data_free_space(BTRFS_I(inode), 125262306a36Sopenharmony_ci &data_reserved, pos, 125362306a36Sopenharmony_ci write_bytes, nowait); 125462306a36Sopenharmony_ci if (ret < 0) { 125562306a36Sopenharmony_ci int can_nocow; 125662306a36Sopenharmony_ci 125762306a36Sopenharmony_ci if 
(nowait && (ret == -ENOSPC || ret == -EAGAIN)) { 125862306a36Sopenharmony_ci ret = -EAGAIN; 125962306a36Sopenharmony_ci break; 126062306a36Sopenharmony_ci } 126162306a36Sopenharmony_ci 126262306a36Sopenharmony_ci /* 126362306a36Sopenharmony_ci * If we don't have to COW at the offset, reserve 126462306a36Sopenharmony_ci * metadata only. write_bytes may get smaller than 126562306a36Sopenharmony_ci * requested here. 126662306a36Sopenharmony_ci */ 126762306a36Sopenharmony_ci can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, 126862306a36Sopenharmony_ci &write_bytes, nowait); 126962306a36Sopenharmony_ci if (can_nocow < 0) 127062306a36Sopenharmony_ci ret = can_nocow; 127162306a36Sopenharmony_ci if (can_nocow > 0) 127262306a36Sopenharmony_ci ret = 0; 127362306a36Sopenharmony_ci if (ret) 127462306a36Sopenharmony_ci break; 127562306a36Sopenharmony_ci only_release_metadata = true; 127662306a36Sopenharmony_ci } 127762306a36Sopenharmony_ci 127862306a36Sopenharmony_ci num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); 127962306a36Sopenharmony_ci WARN_ON(num_pages > nrptrs); 128062306a36Sopenharmony_ci reserve_bytes = round_up(write_bytes + sector_offset, 128162306a36Sopenharmony_ci fs_info->sectorsize); 128262306a36Sopenharmony_ci WARN_ON(reserve_bytes == 0); 128362306a36Sopenharmony_ci ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), 128462306a36Sopenharmony_ci reserve_bytes, 128562306a36Sopenharmony_ci reserve_bytes, nowait); 128662306a36Sopenharmony_ci if (ret) { 128762306a36Sopenharmony_ci if (!only_release_metadata) 128862306a36Sopenharmony_ci btrfs_free_reserved_data_space(BTRFS_I(inode), 128962306a36Sopenharmony_ci data_reserved, pos, 129062306a36Sopenharmony_ci write_bytes); 129162306a36Sopenharmony_ci else 129262306a36Sopenharmony_ci btrfs_check_nocow_unlock(BTRFS_I(inode)); 129362306a36Sopenharmony_ci 129462306a36Sopenharmony_ci if (nowait && ret == -ENOSPC) 129562306a36Sopenharmony_ci ret = -EAGAIN; 129662306a36Sopenharmony_ci break; 
129762306a36Sopenharmony_ci } 129862306a36Sopenharmony_ci 129962306a36Sopenharmony_ci release_bytes = reserve_bytes; 130062306a36Sopenharmony_ciagain: 130162306a36Sopenharmony_ci ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); 130262306a36Sopenharmony_ci if (ret) { 130362306a36Sopenharmony_ci btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); 130462306a36Sopenharmony_ci break; 130562306a36Sopenharmony_ci } 130662306a36Sopenharmony_ci 130762306a36Sopenharmony_ci /* 130862306a36Sopenharmony_ci * This is going to setup the pages array with the number of 130962306a36Sopenharmony_ci * pages we want, so we don't really need to worry about the 131062306a36Sopenharmony_ci * contents of pages from loop to loop 131162306a36Sopenharmony_ci */ 131262306a36Sopenharmony_ci ret = prepare_pages(inode, pages, num_pages, 131362306a36Sopenharmony_ci pos, write_bytes, force_page_uptodate, false); 131462306a36Sopenharmony_ci if (ret) { 131562306a36Sopenharmony_ci btrfs_delalloc_release_extents(BTRFS_I(inode), 131662306a36Sopenharmony_ci reserve_bytes); 131762306a36Sopenharmony_ci break; 131862306a36Sopenharmony_ci } 131962306a36Sopenharmony_ci 132062306a36Sopenharmony_ci extents_locked = lock_and_cleanup_extent_if_need( 132162306a36Sopenharmony_ci BTRFS_I(inode), pages, 132262306a36Sopenharmony_ci num_pages, pos, write_bytes, &lockstart, 132362306a36Sopenharmony_ci &lockend, nowait, &cached_state); 132462306a36Sopenharmony_ci if (extents_locked < 0) { 132562306a36Sopenharmony_ci if (!nowait && extents_locked == -EAGAIN) 132662306a36Sopenharmony_ci goto again; 132762306a36Sopenharmony_ci 132862306a36Sopenharmony_ci btrfs_delalloc_release_extents(BTRFS_I(inode), 132962306a36Sopenharmony_ci reserve_bytes); 133062306a36Sopenharmony_ci ret = extents_locked; 133162306a36Sopenharmony_ci break; 133262306a36Sopenharmony_ci } 133362306a36Sopenharmony_ci 133462306a36Sopenharmony_ci copied = btrfs_copy_from_user(pos, write_bytes, pages, i); 
133562306a36Sopenharmony_ci 133662306a36Sopenharmony_ci num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); 133762306a36Sopenharmony_ci dirty_sectors = round_up(copied + sector_offset, 133862306a36Sopenharmony_ci fs_info->sectorsize); 133962306a36Sopenharmony_ci dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); 134062306a36Sopenharmony_ci 134162306a36Sopenharmony_ci /* 134262306a36Sopenharmony_ci * if we have trouble faulting in the pages, fall 134362306a36Sopenharmony_ci * back to one page at a time 134462306a36Sopenharmony_ci */ 134562306a36Sopenharmony_ci if (copied < write_bytes) 134662306a36Sopenharmony_ci nrptrs = 1; 134762306a36Sopenharmony_ci 134862306a36Sopenharmony_ci if (copied == 0) { 134962306a36Sopenharmony_ci force_page_uptodate = true; 135062306a36Sopenharmony_ci dirty_sectors = 0; 135162306a36Sopenharmony_ci dirty_pages = 0; 135262306a36Sopenharmony_ci } else { 135362306a36Sopenharmony_ci force_page_uptodate = false; 135462306a36Sopenharmony_ci dirty_pages = DIV_ROUND_UP(copied + offset, 135562306a36Sopenharmony_ci PAGE_SIZE); 135662306a36Sopenharmony_ci } 135762306a36Sopenharmony_ci 135862306a36Sopenharmony_ci if (num_sectors > dirty_sectors) { 135962306a36Sopenharmony_ci /* release everything except the sectors we dirtied */ 136062306a36Sopenharmony_ci release_bytes -= dirty_sectors << fs_info->sectorsize_bits; 136162306a36Sopenharmony_ci if (only_release_metadata) { 136262306a36Sopenharmony_ci btrfs_delalloc_release_metadata(BTRFS_I(inode), 136362306a36Sopenharmony_ci release_bytes, true); 136462306a36Sopenharmony_ci } else { 136562306a36Sopenharmony_ci u64 __pos; 136662306a36Sopenharmony_ci 136762306a36Sopenharmony_ci __pos = round_down(pos, 136862306a36Sopenharmony_ci fs_info->sectorsize) + 136962306a36Sopenharmony_ci (dirty_pages << PAGE_SHIFT); 137062306a36Sopenharmony_ci btrfs_delalloc_release_space(BTRFS_I(inode), 137162306a36Sopenharmony_ci data_reserved, __pos, 137262306a36Sopenharmony_ci release_bytes, true); 
137362306a36Sopenharmony_ci } 137462306a36Sopenharmony_ci } 137562306a36Sopenharmony_ci 137662306a36Sopenharmony_ci release_bytes = round_up(copied + sector_offset, 137762306a36Sopenharmony_ci fs_info->sectorsize); 137862306a36Sopenharmony_ci 137962306a36Sopenharmony_ci ret = btrfs_dirty_pages(BTRFS_I(inode), pages, 138062306a36Sopenharmony_ci dirty_pages, pos, copied, 138162306a36Sopenharmony_ci &cached_state, only_release_metadata); 138262306a36Sopenharmony_ci 138362306a36Sopenharmony_ci /* 138462306a36Sopenharmony_ci * If we have not locked the extent range, because the range's 138562306a36Sopenharmony_ci * start offset is >= i_size, we might still have a non-NULL 138662306a36Sopenharmony_ci * cached extent state, acquired while marking the extent range 138762306a36Sopenharmony_ci * as delalloc through btrfs_dirty_pages(). Therefore free any 138862306a36Sopenharmony_ci * possible cached extent state to avoid a memory leak. 138962306a36Sopenharmony_ci */ 139062306a36Sopenharmony_ci if (extents_locked) 139162306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, 139262306a36Sopenharmony_ci lockend, &cached_state); 139362306a36Sopenharmony_ci else 139462306a36Sopenharmony_ci free_extent_state(cached_state); 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_ci btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); 139762306a36Sopenharmony_ci if (ret) { 139862306a36Sopenharmony_ci btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); 139962306a36Sopenharmony_ci break; 140062306a36Sopenharmony_ci } 140162306a36Sopenharmony_ci 140262306a36Sopenharmony_ci release_bytes = 0; 140362306a36Sopenharmony_ci if (only_release_metadata) 140462306a36Sopenharmony_ci btrfs_check_nocow_unlock(BTRFS_I(inode)); 140562306a36Sopenharmony_ci 140662306a36Sopenharmony_ci btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci cond_resched(); 140962306a36Sopenharmony_ci 141062306a36Sopenharmony_ci pos 
+= copied; 141162306a36Sopenharmony_ci num_written += copied; 141262306a36Sopenharmony_ci } 141362306a36Sopenharmony_ci 141462306a36Sopenharmony_ci kfree(pages); 141562306a36Sopenharmony_ci 141662306a36Sopenharmony_ci if (release_bytes) { 141762306a36Sopenharmony_ci if (only_release_metadata) { 141862306a36Sopenharmony_ci btrfs_check_nocow_unlock(BTRFS_I(inode)); 141962306a36Sopenharmony_ci btrfs_delalloc_release_metadata(BTRFS_I(inode), 142062306a36Sopenharmony_ci release_bytes, true); 142162306a36Sopenharmony_ci } else { 142262306a36Sopenharmony_ci btrfs_delalloc_release_space(BTRFS_I(inode), 142362306a36Sopenharmony_ci data_reserved, 142462306a36Sopenharmony_ci round_down(pos, fs_info->sectorsize), 142562306a36Sopenharmony_ci release_bytes, true); 142662306a36Sopenharmony_ci } 142762306a36Sopenharmony_ci } 142862306a36Sopenharmony_ci 142962306a36Sopenharmony_ci extent_changeset_free(data_reserved); 143062306a36Sopenharmony_ci if (num_written > 0) { 143162306a36Sopenharmony_ci pagecache_isize_extended(inode, old_isize, iocb->ki_pos); 143262306a36Sopenharmony_ci iocb->ki_pos += num_written; 143362306a36Sopenharmony_ci } 143462306a36Sopenharmony_ciout: 143562306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 143662306a36Sopenharmony_ci return num_written ? 
num_written : ret; 143762306a36Sopenharmony_ci} 143862306a36Sopenharmony_ci 143962306a36Sopenharmony_cistatic ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, 144062306a36Sopenharmony_ci const struct iov_iter *iter, loff_t offset) 144162306a36Sopenharmony_ci{ 144262306a36Sopenharmony_ci const u32 blocksize_mask = fs_info->sectorsize - 1; 144362306a36Sopenharmony_ci 144462306a36Sopenharmony_ci if (offset & blocksize_mask) 144562306a36Sopenharmony_ci return -EINVAL; 144662306a36Sopenharmony_ci 144762306a36Sopenharmony_ci if (iov_iter_alignment(iter) & blocksize_mask) 144862306a36Sopenharmony_ci return -EINVAL; 144962306a36Sopenharmony_ci 145062306a36Sopenharmony_ci return 0; 145162306a36Sopenharmony_ci} 145262306a36Sopenharmony_ci 145362306a36Sopenharmony_cistatic ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) 145462306a36Sopenharmony_ci{ 145562306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 145662306a36Sopenharmony_ci struct inode *inode = file_inode(file); 145762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 145862306a36Sopenharmony_ci loff_t pos; 145962306a36Sopenharmony_ci ssize_t written = 0; 146062306a36Sopenharmony_ci ssize_t written_buffered; 146162306a36Sopenharmony_ci size_t prev_left = 0; 146262306a36Sopenharmony_ci loff_t endbyte; 146362306a36Sopenharmony_ci ssize_t err; 146462306a36Sopenharmony_ci unsigned int ilock_flags = 0; 146562306a36Sopenharmony_ci struct iomap_dio *dio; 146662306a36Sopenharmony_ci 146762306a36Sopenharmony_ci if (iocb->ki_flags & IOCB_NOWAIT) 146862306a36Sopenharmony_ci ilock_flags |= BTRFS_ILOCK_TRY; 146962306a36Sopenharmony_ci 147062306a36Sopenharmony_ci /* 147162306a36Sopenharmony_ci * If the write DIO is within EOF, use a shared lock and also only if 147262306a36Sopenharmony_ci * security bits will likely not be dropped by file_remove_privs() called 147362306a36Sopenharmony_ci * from btrfs_write_check(). 
Either will need to be rechecked after the 147462306a36Sopenharmony_ci * lock was acquired. 147562306a36Sopenharmony_ci */ 147662306a36Sopenharmony_ci if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) 147762306a36Sopenharmony_ci ilock_flags |= BTRFS_ILOCK_SHARED; 147862306a36Sopenharmony_ci 147962306a36Sopenharmony_cirelock: 148062306a36Sopenharmony_ci err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); 148162306a36Sopenharmony_ci if (err < 0) 148262306a36Sopenharmony_ci return err; 148362306a36Sopenharmony_ci 148462306a36Sopenharmony_ci /* Shared lock cannot be used with security bits set. */ 148562306a36Sopenharmony_ci if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { 148662306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 148762306a36Sopenharmony_ci ilock_flags &= ~BTRFS_ILOCK_SHARED; 148862306a36Sopenharmony_ci goto relock; 148962306a36Sopenharmony_ci } 149062306a36Sopenharmony_ci 149162306a36Sopenharmony_ci err = generic_write_checks(iocb, from); 149262306a36Sopenharmony_ci if (err <= 0) { 149362306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 149462306a36Sopenharmony_ci return err; 149562306a36Sopenharmony_ci } 149662306a36Sopenharmony_ci 149762306a36Sopenharmony_ci err = btrfs_write_check(iocb, from, err); 149862306a36Sopenharmony_ci if (err < 0) { 149962306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 150062306a36Sopenharmony_ci goto out; 150162306a36Sopenharmony_ci } 150262306a36Sopenharmony_ci 150362306a36Sopenharmony_ci pos = iocb->ki_pos; 150462306a36Sopenharmony_ci /* 150562306a36Sopenharmony_ci * Re-check since file size may have changed just before taking the 150662306a36Sopenharmony_ci * lock or pos may have changed because of O_APPEND in generic_write_check() 150762306a36Sopenharmony_ci */ 150862306a36Sopenharmony_ci if ((ilock_flags & BTRFS_ILOCK_SHARED) && 150962306a36Sopenharmony_ci pos + iov_iter_count(from) > i_size_read(inode)) { 
151062306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 151162306a36Sopenharmony_ci ilock_flags &= ~BTRFS_ILOCK_SHARED; 151262306a36Sopenharmony_ci goto relock; 151362306a36Sopenharmony_ci } 151462306a36Sopenharmony_ci 151562306a36Sopenharmony_ci if (check_direct_IO(fs_info, from, pos)) { 151662306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 151762306a36Sopenharmony_ci goto buffered; 151862306a36Sopenharmony_ci } 151962306a36Sopenharmony_ci 152062306a36Sopenharmony_ci /* 152162306a36Sopenharmony_ci * The iov_iter can be mapped to the same file range we are writing to. 152262306a36Sopenharmony_ci * If that's the case, then we will deadlock in the iomap code, because 152362306a36Sopenharmony_ci * it first calls our callback btrfs_dio_iomap_begin(), which will create 152462306a36Sopenharmony_ci * an ordered extent, and after that it will fault in the pages that the 152562306a36Sopenharmony_ci * iov_iter refers to. During the fault in we end up in the readahead 152662306a36Sopenharmony_ci * pages code (starting at btrfs_readahead()), which will lock the range, 152762306a36Sopenharmony_ci * find that ordered extent and then wait for it to complete (at 152862306a36Sopenharmony_ci * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since 152962306a36Sopenharmony_ci * obviously the ordered extent can never complete as we didn't submit 153062306a36Sopenharmony_ci * yet the respective bio(s). This always happens when the buffer is 153162306a36Sopenharmony_ci * memory mapped to the same file range, since the iomap DIO code always 153262306a36Sopenharmony_ci * invalidates pages in the target file range (after starting and waiting 153362306a36Sopenharmony_ci * for any writeback). 153462306a36Sopenharmony_ci * 153562306a36Sopenharmony_ci * So here we disable page faults in the iov_iter and then retry if we 153662306a36Sopenharmony_ci * got -EFAULT, faulting in the pages before the retry. 
153762306a36Sopenharmony_ci */ 153862306a36Sopenharmony_ci from->nofault = true; 153962306a36Sopenharmony_ci dio = btrfs_dio_write(iocb, from, written); 154062306a36Sopenharmony_ci from->nofault = false; 154162306a36Sopenharmony_ci 154262306a36Sopenharmony_ci /* 154362306a36Sopenharmony_ci * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync 154462306a36Sopenharmony_ci * iocb, and that needs to lock the inode. So unlock it before calling 154562306a36Sopenharmony_ci * iomap_dio_complete() to avoid a deadlock. 154662306a36Sopenharmony_ci */ 154762306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); 154862306a36Sopenharmony_ci 154962306a36Sopenharmony_ci if (IS_ERR_OR_NULL(dio)) 155062306a36Sopenharmony_ci err = PTR_ERR_OR_ZERO(dio); 155162306a36Sopenharmony_ci else 155262306a36Sopenharmony_ci err = iomap_dio_complete(dio); 155362306a36Sopenharmony_ci 155462306a36Sopenharmony_ci /* No increment (+=) because iomap returns a cumulative value. */ 155562306a36Sopenharmony_ci if (err > 0) 155662306a36Sopenharmony_ci written = err; 155762306a36Sopenharmony_ci 155862306a36Sopenharmony_ci if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) { 155962306a36Sopenharmony_ci const size_t left = iov_iter_count(from); 156062306a36Sopenharmony_ci /* 156162306a36Sopenharmony_ci * We have more data left to write. Try to fault in as many as 156262306a36Sopenharmony_ci * possible of the remainder pages and retry. We do this without 156362306a36Sopenharmony_ci * releasing and locking again the inode, to prevent races with 156462306a36Sopenharmony_ci * truncate. 
156562306a36Sopenharmony_ci * 156662306a36Sopenharmony_ci * Also, in case the iov refers to pages in the file range of the 156762306a36Sopenharmony_ci * file we want to write to (due to a mmap), we could enter an 156862306a36Sopenharmony_ci * infinite loop if we retry after faulting the pages in, since 156962306a36Sopenharmony_ci * iomap will invalidate any pages in the range early on, before 157062306a36Sopenharmony_ci * it tries to fault in the pages of the iov. So we keep track of 157162306a36Sopenharmony_ci * how much was left of iov in the previous EFAULT and fallback 157262306a36Sopenharmony_ci * to buffered IO in case we haven't made any progress. 157362306a36Sopenharmony_ci */ 157462306a36Sopenharmony_ci if (left == prev_left) { 157562306a36Sopenharmony_ci err = -ENOTBLK; 157662306a36Sopenharmony_ci } else { 157762306a36Sopenharmony_ci fault_in_iov_iter_readable(from, left); 157862306a36Sopenharmony_ci prev_left = left; 157962306a36Sopenharmony_ci goto relock; 158062306a36Sopenharmony_ci } 158162306a36Sopenharmony_ci } 158262306a36Sopenharmony_ci 158362306a36Sopenharmony_ci /* 158462306a36Sopenharmony_ci * If 'err' is -ENOTBLK or we have not written all data, then it means 158562306a36Sopenharmony_ci * we must fallback to buffered IO. 158662306a36Sopenharmony_ci */ 158762306a36Sopenharmony_ci if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) 158862306a36Sopenharmony_ci goto out; 158962306a36Sopenharmony_ci 159062306a36Sopenharmony_cibuffered: 159162306a36Sopenharmony_ci /* 159262306a36Sopenharmony_ci * If we are in a NOWAIT context, then return -EAGAIN to signal the caller 159362306a36Sopenharmony_ci * it must retry the operation in a context where blocking is acceptable, 159462306a36Sopenharmony_ci * because even if we end up not blocking during the buffered IO attempt 159562306a36Sopenharmony_ci * below, we will block when flushing and waiting for the IO. 
159662306a36Sopenharmony_ci */ 159762306a36Sopenharmony_ci if (iocb->ki_flags & IOCB_NOWAIT) { 159862306a36Sopenharmony_ci err = -EAGAIN; 159962306a36Sopenharmony_ci goto out; 160062306a36Sopenharmony_ci } 160162306a36Sopenharmony_ci 160262306a36Sopenharmony_ci pos = iocb->ki_pos; 160362306a36Sopenharmony_ci written_buffered = btrfs_buffered_write(iocb, from); 160462306a36Sopenharmony_ci if (written_buffered < 0) { 160562306a36Sopenharmony_ci err = written_buffered; 160662306a36Sopenharmony_ci goto out; 160762306a36Sopenharmony_ci } 160862306a36Sopenharmony_ci /* 160962306a36Sopenharmony_ci * Ensure all data is persisted. We want the next direct IO read to be 161062306a36Sopenharmony_ci * able to read what was just written. 161162306a36Sopenharmony_ci */ 161262306a36Sopenharmony_ci endbyte = pos + written_buffered - 1; 161362306a36Sopenharmony_ci err = btrfs_fdatawrite_range(inode, pos, endbyte); 161462306a36Sopenharmony_ci if (err) 161562306a36Sopenharmony_ci goto out; 161662306a36Sopenharmony_ci err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); 161762306a36Sopenharmony_ci if (err) 161862306a36Sopenharmony_ci goto out; 161962306a36Sopenharmony_ci written += written_buffered; 162062306a36Sopenharmony_ci iocb->ki_pos = pos + written_buffered; 162162306a36Sopenharmony_ci invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT, 162262306a36Sopenharmony_ci endbyte >> PAGE_SHIFT); 162362306a36Sopenharmony_ciout: 162462306a36Sopenharmony_ci return err < 0 ? 
err : written; 162562306a36Sopenharmony_ci} 162662306a36Sopenharmony_ci 162762306a36Sopenharmony_cistatic ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, 162862306a36Sopenharmony_ci const struct btrfs_ioctl_encoded_io_args *encoded) 162962306a36Sopenharmony_ci{ 163062306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 163162306a36Sopenharmony_ci struct inode *inode = file_inode(file); 163262306a36Sopenharmony_ci loff_t count; 163362306a36Sopenharmony_ci ssize_t ret; 163462306a36Sopenharmony_ci 163562306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(inode), 0); 163662306a36Sopenharmony_ci count = encoded->len; 163762306a36Sopenharmony_ci ret = generic_write_checks_count(iocb, &count); 163862306a36Sopenharmony_ci if (ret == 0 && count != encoded->len) { 163962306a36Sopenharmony_ci /* 164062306a36Sopenharmony_ci * The write got truncated by generic_write_checks_count(). We 164162306a36Sopenharmony_ci * can't do a partial encoded write. 164262306a36Sopenharmony_ci */ 164362306a36Sopenharmony_ci ret = -EFBIG; 164462306a36Sopenharmony_ci } 164562306a36Sopenharmony_ci if (ret || encoded->len == 0) 164662306a36Sopenharmony_ci goto out; 164762306a36Sopenharmony_ci 164862306a36Sopenharmony_ci ret = btrfs_write_check(iocb, from, encoded->len); 164962306a36Sopenharmony_ci if (ret < 0) 165062306a36Sopenharmony_ci goto out; 165162306a36Sopenharmony_ci 165262306a36Sopenharmony_ci ret = btrfs_do_encoded_write(iocb, from, encoded); 165362306a36Sopenharmony_ciout: 165462306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), 0); 165562306a36Sopenharmony_ci return ret; 165662306a36Sopenharmony_ci} 165762306a36Sopenharmony_ci 165862306a36Sopenharmony_cissize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, 165962306a36Sopenharmony_ci const struct btrfs_ioctl_encoded_io_args *encoded) 166062306a36Sopenharmony_ci{ 166162306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 166262306a36Sopenharmony_ci struct btrfs_inode *inode = 
BTRFS_I(file_inode(file)); 166362306a36Sopenharmony_ci ssize_t num_written, num_sync; 166462306a36Sopenharmony_ci 166562306a36Sopenharmony_ci /* 166662306a36Sopenharmony_ci * If the fs flips readonly due to some impossible error, although we 166762306a36Sopenharmony_ci * have opened a file as writable, we have to stop this write operation 166862306a36Sopenharmony_ci * to ensure consistency. 166962306a36Sopenharmony_ci */ 167062306a36Sopenharmony_ci if (BTRFS_FS_ERROR(inode->root->fs_info)) 167162306a36Sopenharmony_ci return -EROFS; 167262306a36Sopenharmony_ci 167362306a36Sopenharmony_ci if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) 167462306a36Sopenharmony_ci return -EOPNOTSUPP; 167562306a36Sopenharmony_ci 167662306a36Sopenharmony_ci if (encoded) { 167762306a36Sopenharmony_ci num_written = btrfs_encoded_write(iocb, from, encoded); 167862306a36Sopenharmony_ci num_sync = encoded->len; 167962306a36Sopenharmony_ci } else if (iocb->ki_flags & IOCB_DIRECT) { 168062306a36Sopenharmony_ci num_written = btrfs_direct_write(iocb, from); 168162306a36Sopenharmony_ci num_sync = num_written; 168262306a36Sopenharmony_ci } else { 168362306a36Sopenharmony_ci num_written = btrfs_buffered_write(iocb, from); 168462306a36Sopenharmony_ci num_sync = num_written; 168562306a36Sopenharmony_ci } 168662306a36Sopenharmony_ci 168762306a36Sopenharmony_ci btrfs_set_inode_last_sub_trans(inode); 168862306a36Sopenharmony_ci 168962306a36Sopenharmony_ci if (num_sync > 0) { 169062306a36Sopenharmony_ci num_sync = generic_write_sync(iocb, num_sync); 169162306a36Sopenharmony_ci if (num_sync < 0) 169262306a36Sopenharmony_ci num_written = num_sync; 169362306a36Sopenharmony_ci } 169462306a36Sopenharmony_ci 169562306a36Sopenharmony_ci return num_written; 169662306a36Sopenharmony_ci} 169762306a36Sopenharmony_ci 169862306a36Sopenharmony_cistatic ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 169962306a36Sopenharmony_ci{ 170062306a36Sopenharmony_ci return btrfs_do_write_iter(iocb, from, 
NULL); 170162306a36Sopenharmony_ci} 170262306a36Sopenharmony_ci 170362306a36Sopenharmony_ciint btrfs_release_file(struct inode *inode, struct file *filp) 170462306a36Sopenharmony_ci{ 170562306a36Sopenharmony_ci struct btrfs_file_private *private = filp->private_data; 170662306a36Sopenharmony_ci 170762306a36Sopenharmony_ci if (private) { 170862306a36Sopenharmony_ci kfree(private->filldir_buf); 170962306a36Sopenharmony_ci free_extent_state(private->llseek_cached_state); 171062306a36Sopenharmony_ci kfree(private); 171162306a36Sopenharmony_ci filp->private_data = NULL; 171262306a36Sopenharmony_ci } 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci /* 171562306a36Sopenharmony_ci * Set by setattr when we are about to truncate a file from a non-zero 171662306a36Sopenharmony_ci * size to a zero size. This tries to flush down new bytes that may 171762306a36Sopenharmony_ci * have been written if the application were using truncate to replace 171862306a36Sopenharmony_ci * a file in place. 171962306a36Sopenharmony_ci */ 172062306a36Sopenharmony_ci if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 172162306a36Sopenharmony_ci &BTRFS_I(inode)->runtime_flags)) 172262306a36Sopenharmony_ci filemap_flush(inode->i_mapping); 172362306a36Sopenharmony_ci return 0; 172462306a36Sopenharmony_ci} 172562306a36Sopenharmony_ci 172662306a36Sopenharmony_cistatic int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) 172762306a36Sopenharmony_ci{ 172862306a36Sopenharmony_ci int ret; 172962306a36Sopenharmony_ci struct blk_plug plug; 173062306a36Sopenharmony_ci 173162306a36Sopenharmony_ci /* 173262306a36Sopenharmony_ci * This is only called in fsync, which would do synchronous writes, so 173362306a36Sopenharmony_ci * a plug can merge adjacent IOs as much as possible. Esp. in case of 173462306a36Sopenharmony_ci * multiple disks using raid profile, a large IO can be split to 173562306a36Sopenharmony_ci * several segments of stripe length (currently 64K). 
173662306a36Sopenharmony_ci */ 173762306a36Sopenharmony_ci blk_start_plug(&plug); 173862306a36Sopenharmony_ci ret = btrfs_fdatawrite_range(inode, start, end); 173962306a36Sopenharmony_ci blk_finish_plug(&plug); 174062306a36Sopenharmony_ci 174162306a36Sopenharmony_ci return ret; 174262306a36Sopenharmony_ci} 174362306a36Sopenharmony_ci 174462306a36Sopenharmony_cistatic inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) 174562306a36Sopenharmony_ci{ 174662306a36Sopenharmony_ci struct btrfs_inode *inode = BTRFS_I(ctx->inode); 174762306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 174862306a36Sopenharmony_ci 174962306a36Sopenharmony_ci if (btrfs_inode_in_log(inode, fs_info->generation) && 175062306a36Sopenharmony_ci list_empty(&ctx->ordered_extents)) 175162306a36Sopenharmony_ci return true; 175262306a36Sopenharmony_ci 175362306a36Sopenharmony_ci /* 175462306a36Sopenharmony_ci * If we are doing a fast fsync we can not bail out if the inode's 175562306a36Sopenharmony_ci * last_trans is <= then the last committed transaction, because we only 175662306a36Sopenharmony_ci * update the last_trans of the inode during ordered extent completion, 175762306a36Sopenharmony_ci * and for a fast fsync we don't wait for that, we only wait for the 175862306a36Sopenharmony_ci * writeback to complete. 175962306a36Sopenharmony_ci */ 176062306a36Sopenharmony_ci if (inode->last_trans <= fs_info->last_trans_committed && 176162306a36Sopenharmony_ci (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || 176262306a36Sopenharmony_ci list_empty(&ctx->ordered_extents))) 176362306a36Sopenharmony_ci return true; 176462306a36Sopenharmony_ci 176562306a36Sopenharmony_ci return false; 176662306a36Sopenharmony_ci} 176762306a36Sopenharmony_ci 176862306a36Sopenharmony_ci/* 176962306a36Sopenharmony_ci * fsync call for both files and directories. 
This logs the inode into 177062306a36Sopenharmony_ci * the tree log instead of forcing full commits whenever possible. 177162306a36Sopenharmony_ci * 177262306a36Sopenharmony_ci * It needs to call filemap_fdatawait so that all ordered extent updates are 177362306a36Sopenharmony_ci * in the metadata btree are up to date for copying to the log. 177462306a36Sopenharmony_ci * 177562306a36Sopenharmony_ci * It drops the inode mutex before doing the tree log commit. This is an 177662306a36Sopenharmony_ci * important optimization for directories because holding the mutex prevents 177762306a36Sopenharmony_ci * new operations on the dir while we write to disk. 177862306a36Sopenharmony_ci */ 177962306a36Sopenharmony_ciint btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 178062306a36Sopenharmony_ci{ 178162306a36Sopenharmony_ci struct dentry *dentry = file_dentry(file); 178262306a36Sopenharmony_ci struct inode *inode = d_inode(dentry); 178362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 178462306a36Sopenharmony_ci struct btrfs_root *root = BTRFS_I(inode)->root; 178562306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 178662306a36Sopenharmony_ci struct btrfs_log_ctx ctx; 178762306a36Sopenharmony_ci int ret = 0, err; 178862306a36Sopenharmony_ci u64 len; 178962306a36Sopenharmony_ci bool full_sync; 179062306a36Sopenharmony_ci 179162306a36Sopenharmony_ci trace_btrfs_sync_file(file, datasync); 179262306a36Sopenharmony_ci 179362306a36Sopenharmony_ci btrfs_init_log_ctx(&ctx, inode); 179462306a36Sopenharmony_ci 179562306a36Sopenharmony_ci /* 179662306a36Sopenharmony_ci * Always set the range to a full range, otherwise we can get into 179762306a36Sopenharmony_ci * several problems, from missing file extent items to represent holes 179862306a36Sopenharmony_ci * when not using the NO_HOLES feature, to log tree corruption due to 179962306a36Sopenharmony_ci * races between hole detection during logging and completion of ordered 
180062306a36Sopenharmony_ci * extents outside the range, to missing checksums due to ordered extents 180162306a36Sopenharmony_ci * for which we flushed only a subset of their pages. 180262306a36Sopenharmony_ci */ 180362306a36Sopenharmony_ci start = 0; 180462306a36Sopenharmony_ci end = LLONG_MAX; 180562306a36Sopenharmony_ci len = (u64)LLONG_MAX + 1; 180662306a36Sopenharmony_ci 180762306a36Sopenharmony_ci /* 180862306a36Sopenharmony_ci * We write the dirty pages in the range and wait until they complete 180962306a36Sopenharmony_ci * out of the ->i_mutex. If so, we can flush the dirty pages by 181062306a36Sopenharmony_ci * multi-task, and make the performance up. See 181162306a36Sopenharmony_ci * btrfs_wait_ordered_range for an explanation of the ASYNC check. 181262306a36Sopenharmony_ci */ 181362306a36Sopenharmony_ci ret = start_ordered_ops(inode, start, end); 181462306a36Sopenharmony_ci if (ret) 181562306a36Sopenharmony_ci goto out; 181662306a36Sopenharmony_ci 181762306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 181862306a36Sopenharmony_ci 181962306a36Sopenharmony_ci atomic_inc(&root->log_batch); 182062306a36Sopenharmony_ci 182162306a36Sopenharmony_ci /* 182262306a36Sopenharmony_ci * Before we acquired the inode's lock and the mmap lock, someone may 182362306a36Sopenharmony_ci * have dirtied more pages in the target range. We need to make sure 182462306a36Sopenharmony_ci * that writeback for any such pages does not start while we are logging 182562306a36Sopenharmony_ci * the inode, because if it does, any of the following might happen when 182662306a36Sopenharmony_ci * we are not doing a full inode sync: 182762306a36Sopenharmony_ci * 182862306a36Sopenharmony_ci * 1) We log an extent after its writeback finishes but before its 182962306a36Sopenharmony_ci * checksums are added to the csum tree, leading to -EIO errors 183062306a36Sopenharmony_ci * when attempting to read the extent after a log replay. 
183162306a36Sopenharmony_ci * 183262306a36Sopenharmony_ci * 2) We can end up logging an extent before its writeback finishes. 183362306a36Sopenharmony_ci * Therefore after the log replay we will have a file extent item 183462306a36Sopenharmony_ci * pointing to an unwritten extent (and no data checksums as well). 183562306a36Sopenharmony_ci * 183662306a36Sopenharmony_ci * So trigger writeback for any eventual new dirty pages and then we 183762306a36Sopenharmony_ci * wait for all ordered extents to complete below. 183862306a36Sopenharmony_ci */ 183962306a36Sopenharmony_ci ret = start_ordered_ops(inode, start, end); 184062306a36Sopenharmony_ci if (ret) { 184162306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 184262306a36Sopenharmony_ci goto out; 184362306a36Sopenharmony_ci } 184462306a36Sopenharmony_ci 184562306a36Sopenharmony_ci /* 184662306a36Sopenharmony_ci * Always check for the full sync flag while holding the inode's lock, 184762306a36Sopenharmony_ci * to avoid races with other tasks. The flag must be either set all the 184862306a36Sopenharmony_ci * time during logging or always off all the time while logging. 184962306a36Sopenharmony_ci * We check the flag here after starting delalloc above, because when 185062306a36Sopenharmony_ci * running delalloc the full sync flag may be set if we need to drop 185162306a36Sopenharmony_ci * extra extent map ranges due to temporary memory allocation failures. 185262306a36Sopenharmony_ci */ 185362306a36Sopenharmony_ci full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 185462306a36Sopenharmony_ci &BTRFS_I(inode)->runtime_flags); 185562306a36Sopenharmony_ci 185662306a36Sopenharmony_ci /* 185762306a36Sopenharmony_ci * We have to do this here to avoid the priority inversion of waiting on 185862306a36Sopenharmony_ci * IO of a lower priority task while holding a transaction open. 
185962306a36Sopenharmony_ci * 186062306a36Sopenharmony_ci * For a full fsync we wait for the ordered extents to complete while 186162306a36Sopenharmony_ci * for a fast fsync we wait just for writeback to complete, and then 186262306a36Sopenharmony_ci * attach the ordered extents to the transaction so that a transaction 186362306a36Sopenharmony_ci * commit waits for their completion, to avoid data loss if we fsync, 186462306a36Sopenharmony_ci * the current transaction commits before the ordered extents complete 186562306a36Sopenharmony_ci * and a power failure happens right after that. 186662306a36Sopenharmony_ci * 186762306a36Sopenharmony_ci * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the 186862306a36Sopenharmony_ci * logical address recorded in the ordered extent may change. We need 186962306a36Sopenharmony_ci * to wait for the IO to stabilize the logical address. 187062306a36Sopenharmony_ci */ 187162306a36Sopenharmony_ci if (full_sync || btrfs_is_zoned(fs_info)) { 187262306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode, start, len); 187362306a36Sopenharmony_ci } else { 187462306a36Sopenharmony_ci /* 187562306a36Sopenharmony_ci * Get our ordered extents as soon as possible to avoid doing 187662306a36Sopenharmony_ci * checksum lookups in the csum tree, and use instead the 187762306a36Sopenharmony_ci * checksums attached to the ordered extents. 
187862306a36Sopenharmony_ci */ 187962306a36Sopenharmony_ci btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), 188062306a36Sopenharmony_ci &ctx.ordered_extents); 188162306a36Sopenharmony_ci ret = filemap_fdatawait_range(inode->i_mapping, start, end); 188262306a36Sopenharmony_ci } 188362306a36Sopenharmony_ci 188462306a36Sopenharmony_ci if (ret) 188562306a36Sopenharmony_ci goto out_release_extents; 188662306a36Sopenharmony_ci 188762306a36Sopenharmony_ci atomic_inc(&root->log_batch); 188862306a36Sopenharmony_ci 188962306a36Sopenharmony_ci smp_mb(); 189062306a36Sopenharmony_ci if (skip_inode_logging(&ctx)) { 189162306a36Sopenharmony_ci /* 189262306a36Sopenharmony_ci * We've had everything committed since the last time we were 189362306a36Sopenharmony_ci * modified so clear this flag in case it was set for whatever 189462306a36Sopenharmony_ci * reason, it's no longer relevant. 189562306a36Sopenharmony_ci */ 189662306a36Sopenharmony_ci clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 189762306a36Sopenharmony_ci &BTRFS_I(inode)->runtime_flags); 189862306a36Sopenharmony_ci /* 189962306a36Sopenharmony_ci * An ordered extent might have started before and completed 190062306a36Sopenharmony_ci * already with io errors, in which case the inode was not 190162306a36Sopenharmony_ci * updated and we end up here. So check the inode's mapping 190262306a36Sopenharmony_ci * for any errors that might have happened since we last 190362306a36Sopenharmony_ci * checked called fsync. 
190462306a36Sopenharmony_ci */ 190562306a36Sopenharmony_ci ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); 190662306a36Sopenharmony_ci goto out_release_extents; 190762306a36Sopenharmony_ci } 190862306a36Sopenharmony_ci 190962306a36Sopenharmony_ci /* 191062306a36Sopenharmony_ci * We use start here because we will need to wait on the IO to complete 191162306a36Sopenharmony_ci * in btrfs_sync_log, which could require joining a transaction (for 191262306a36Sopenharmony_ci * example checking cross references in the nocow path). If we use join 191362306a36Sopenharmony_ci * here we could get into a situation where we're waiting on IO to 191462306a36Sopenharmony_ci * happen that is blocked on a transaction trying to commit. With start 191562306a36Sopenharmony_ci * we inc the extwriter counter, so we wait for all extwriters to exit 191662306a36Sopenharmony_ci * before we start blocking joiners. This comment is to keep somebody 191762306a36Sopenharmony_ci * from thinking they are super smart and changing this to 191862306a36Sopenharmony_ci * btrfs_join_transaction *cough*Josef*cough*. 191962306a36Sopenharmony_ci */ 192062306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 0); 192162306a36Sopenharmony_ci if (IS_ERR(trans)) { 192262306a36Sopenharmony_ci ret = PTR_ERR(trans); 192362306a36Sopenharmony_ci goto out_release_extents; 192462306a36Sopenharmony_ci } 192562306a36Sopenharmony_ci trans->in_fsync = true; 192662306a36Sopenharmony_ci 192762306a36Sopenharmony_ci ret = btrfs_log_dentry_safe(trans, dentry, &ctx); 192862306a36Sopenharmony_ci btrfs_release_log_ctx_extents(&ctx); 192962306a36Sopenharmony_ci if (ret < 0) { 193062306a36Sopenharmony_ci /* Fallthrough and commit/free transaction. */ 193162306a36Sopenharmony_ci ret = BTRFS_LOG_FORCE_COMMIT; 193262306a36Sopenharmony_ci } 193362306a36Sopenharmony_ci 193462306a36Sopenharmony_ci /* we've logged all the items and now have a consistent 193562306a36Sopenharmony_ci * version of the file in the log. 
It is possible that 193662306a36Sopenharmony_ci * someone will come in and modify the file, but that's 193762306a36Sopenharmony_ci * fine because the log is consistent on disk, and we 193862306a36Sopenharmony_ci * have references to all of the file's extents 193962306a36Sopenharmony_ci * 194062306a36Sopenharmony_ci * It is possible that someone will come in and log the 194162306a36Sopenharmony_ci * file again, but that will end up using the synchronization 194262306a36Sopenharmony_ci * inside btrfs_sync_log to keep things safe. 194362306a36Sopenharmony_ci */ 194462306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 194562306a36Sopenharmony_ci 194662306a36Sopenharmony_ci if (ret == BTRFS_NO_LOG_SYNC) { 194762306a36Sopenharmony_ci ret = btrfs_end_transaction(trans); 194862306a36Sopenharmony_ci goto out; 194962306a36Sopenharmony_ci } 195062306a36Sopenharmony_ci 195162306a36Sopenharmony_ci /* We successfully logged the inode, attempt to sync the log. */ 195262306a36Sopenharmony_ci if (!ret) { 195362306a36Sopenharmony_ci ret = btrfs_sync_log(trans, root, &ctx); 195462306a36Sopenharmony_ci if (!ret) { 195562306a36Sopenharmony_ci ret = btrfs_end_transaction(trans); 195662306a36Sopenharmony_ci goto out; 195762306a36Sopenharmony_ci } 195862306a36Sopenharmony_ci } 195962306a36Sopenharmony_ci 196062306a36Sopenharmony_ci /* 196162306a36Sopenharmony_ci * At this point we need to commit the transaction because we had 196262306a36Sopenharmony_ci * btrfs_need_log_full_commit() or some other error. 196362306a36Sopenharmony_ci * 196462306a36Sopenharmony_ci * If we didn't do a full sync we have to stop the trans handle, wait on 196562306a36Sopenharmony_ci * the ordered extents, start it again and commit the transaction. 
If 196662306a36Sopenharmony_ci * we attempt to wait on the ordered extents here we could deadlock with 196762306a36Sopenharmony_ci * something like fallocate() that is holding the extent lock trying to 196862306a36Sopenharmony_ci * start a transaction while some other thread is trying to commit the 196962306a36Sopenharmony_ci * transaction while we (fsync) are currently holding the transaction 197062306a36Sopenharmony_ci * open. 197162306a36Sopenharmony_ci */ 197262306a36Sopenharmony_ci if (!full_sync) { 197362306a36Sopenharmony_ci ret = btrfs_end_transaction(trans); 197462306a36Sopenharmony_ci if (ret) 197562306a36Sopenharmony_ci goto out; 197662306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode, start, len); 197762306a36Sopenharmony_ci if (ret) 197862306a36Sopenharmony_ci goto out; 197962306a36Sopenharmony_ci 198062306a36Sopenharmony_ci /* 198162306a36Sopenharmony_ci * This is safe to use here because we're only interested in 198262306a36Sopenharmony_ci * making sure the transaction that had the ordered extents is 198362306a36Sopenharmony_ci * committed. We aren't waiting on anything past this point, 198462306a36Sopenharmony_ci * we're purely getting the transaction and committing it. 198562306a36Sopenharmony_ci */ 198662306a36Sopenharmony_ci trans = btrfs_attach_transaction_barrier(root); 198762306a36Sopenharmony_ci if (IS_ERR(trans)) { 198862306a36Sopenharmony_ci ret = PTR_ERR(trans); 198962306a36Sopenharmony_ci 199062306a36Sopenharmony_ci /* 199162306a36Sopenharmony_ci * We committed the transaction and there's no currently 199262306a36Sopenharmony_ci * running transaction, this means everything we care 199362306a36Sopenharmony_ci * about made it to disk and we are done. 
199462306a36Sopenharmony_ci */ 199562306a36Sopenharmony_ci if (ret == -ENOENT) 199662306a36Sopenharmony_ci ret = 0; 199762306a36Sopenharmony_ci goto out; 199862306a36Sopenharmony_ci } 199962306a36Sopenharmony_ci } 200062306a36Sopenharmony_ci 200162306a36Sopenharmony_ci ret = btrfs_commit_transaction(trans); 200262306a36Sopenharmony_ciout: 200362306a36Sopenharmony_ci ASSERT(list_empty(&ctx.list)); 200462306a36Sopenharmony_ci ASSERT(list_empty(&ctx.conflict_inodes)); 200562306a36Sopenharmony_ci err = file_check_and_advance_wb_err(file); 200662306a36Sopenharmony_ci if (!ret) 200762306a36Sopenharmony_ci ret = err; 200862306a36Sopenharmony_ci return ret > 0 ? -EIO : ret; 200962306a36Sopenharmony_ci 201062306a36Sopenharmony_ciout_release_extents: 201162306a36Sopenharmony_ci btrfs_release_log_ctx_extents(&ctx); 201262306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 201362306a36Sopenharmony_ci goto out; 201462306a36Sopenharmony_ci} 201562306a36Sopenharmony_ci 201662306a36Sopenharmony_cistatic const struct vm_operations_struct btrfs_file_vm_ops = { 201762306a36Sopenharmony_ci .fault = filemap_fault, 201862306a36Sopenharmony_ci .map_pages = filemap_map_pages, 201962306a36Sopenharmony_ci .page_mkwrite = btrfs_page_mkwrite, 202062306a36Sopenharmony_ci}; 202162306a36Sopenharmony_ci 202262306a36Sopenharmony_cistatic int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 202362306a36Sopenharmony_ci{ 202462306a36Sopenharmony_ci struct address_space *mapping = filp->f_mapping; 202562306a36Sopenharmony_ci 202662306a36Sopenharmony_ci if (!mapping->a_ops->read_folio) 202762306a36Sopenharmony_ci return -ENOEXEC; 202862306a36Sopenharmony_ci 202962306a36Sopenharmony_ci file_accessed(filp); 203062306a36Sopenharmony_ci vma->vm_ops = &btrfs_file_vm_ops; 203162306a36Sopenharmony_ci 203262306a36Sopenharmony_ci return 0; 203362306a36Sopenharmony_ci} 203462306a36Sopenharmony_ci 203562306a36Sopenharmony_cistatic int hole_mergeable(struct btrfs_inode *inode, 
struct extent_buffer *leaf, 203662306a36Sopenharmony_ci int slot, u64 start, u64 end) 203762306a36Sopenharmony_ci{ 203862306a36Sopenharmony_ci struct btrfs_file_extent_item *fi; 203962306a36Sopenharmony_ci struct btrfs_key key; 204062306a36Sopenharmony_ci 204162306a36Sopenharmony_ci if (slot < 0 || slot >= btrfs_header_nritems(leaf)) 204262306a36Sopenharmony_ci return 0; 204362306a36Sopenharmony_ci 204462306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, slot); 204562306a36Sopenharmony_ci if (key.objectid != btrfs_ino(inode) || 204662306a36Sopenharmony_ci key.type != BTRFS_EXTENT_DATA_KEY) 204762306a36Sopenharmony_ci return 0; 204862306a36Sopenharmony_ci 204962306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 205062306a36Sopenharmony_ci 205162306a36Sopenharmony_ci if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) 205262306a36Sopenharmony_ci return 0; 205362306a36Sopenharmony_ci 205462306a36Sopenharmony_ci if (btrfs_file_extent_disk_bytenr(leaf, fi)) 205562306a36Sopenharmony_ci return 0; 205662306a36Sopenharmony_ci 205762306a36Sopenharmony_ci if (key.offset == end) 205862306a36Sopenharmony_ci return 1; 205962306a36Sopenharmony_ci if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) 206062306a36Sopenharmony_ci return 1; 206162306a36Sopenharmony_ci return 0; 206262306a36Sopenharmony_ci} 206362306a36Sopenharmony_ci 206462306a36Sopenharmony_cistatic int fill_holes(struct btrfs_trans_handle *trans, 206562306a36Sopenharmony_ci struct btrfs_inode *inode, 206662306a36Sopenharmony_ci struct btrfs_path *path, u64 offset, u64 end) 206762306a36Sopenharmony_ci{ 206862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 206962306a36Sopenharmony_ci struct btrfs_root *root = inode->root; 207062306a36Sopenharmony_ci struct extent_buffer *leaf; 207162306a36Sopenharmony_ci struct btrfs_file_extent_item *fi; 207262306a36Sopenharmony_ci struct extent_map *hole_em; 207362306a36Sopenharmony_ci struct 
btrfs_key key; 207462306a36Sopenharmony_ci int ret; 207562306a36Sopenharmony_ci 207662306a36Sopenharmony_ci if (btrfs_fs_incompat(fs_info, NO_HOLES)) 207762306a36Sopenharmony_ci goto out; 207862306a36Sopenharmony_ci 207962306a36Sopenharmony_ci key.objectid = btrfs_ino(inode); 208062306a36Sopenharmony_ci key.type = BTRFS_EXTENT_DATA_KEY; 208162306a36Sopenharmony_ci key.offset = offset; 208262306a36Sopenharmony_ci 208362306a36Sopenharmony_ci ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 208462306a36Sopenharmony_ci if (ret <= 0) { 208562306a36Sopenharmony_ci /* 208662306a36Sopenharmony_ci * We should have dropped this offset, so if we find it then 208762306a36Sopenharmony_ci * something has gone horribly wrong. 208862306a36Sopenharmony_ci */ 208962306a36Sopenharmony_ci if (ret == 0) 209062306a36Sopenharmony_ci ret = -EINVAL; 209162306a36Sopenharmony_ci return ret; 209262306a36Sopenharmony_ci } 209362306a36Sopenharmony_ci 209462306a36Sopenharmony_ci leaf = path->nodes[0]; 209562306a36Sopenharmony_ci if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { 209662306a36Sopenharmony_ci u64 num_bytes; 209762306a36Sopenharmony_ci 209862306a36Sopenharmony_ci path->slots[0]--; 209962306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, path->slots[0], 210062306a36Sopenharmony_ci struct btrfs_file_extent_item); 210162306a36Sopenharmony_ci num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + 210262306a36Sopenharmony_ci end - offset; 210362306a36Sopenharmony_ci btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 210462306a36Sopenharmony_ci btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 210562306a36Sopenharmony_ci btrfs_set_file_extent_offset(leaf, fi, 0); 210662306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, trans->transid); 210762306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 210862306a36Sopenharmony_ci goto out; 210962306a36Sopenharmony_ci } 211062306a36Sopenharmony_ci 211162306a36Sopenharmony_ci if (hole_mergeable(inode, 
leaf, path->slots[0], offset, end)) { 211262306a36Sopenharmony_ci u64 num_bytes; 211362306a36Sopenharmony_ci 211462306a36Sopenharmony_ci key.offset = offset; 211562306a36Sopenharmony_ci btrfs_set_item_key_safe(trans, path, &key); 211662306a36Sopenharmony_ci fi = btrfs_item_ptr(leaf, path->slots[0], 211762306a36Sopenharmony_ci struct btrfs_file_extent_item); 211862306a36Sopenharmony_ci num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 211962306a36Sopenharmony_ci offset; 212062306a36Sopenharmony_ci btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); 212162306a36Sopenharmony_ci btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); 212262306a36Sopenharmony_ci btrfs_set_file_extent_offset(leaf, fi, 0); 212362306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, fi, trans->transid); 212462306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 212562306a36Sopenharmony_ci goto out; 212662306a36Sopenharmony_ci } 212762306a36Sopenharmony_ci btrfs_release_path(path); 212862306a36Sopenharmony_ci 212962306a36Sopenharmony_ci ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, 213062306a36Sopenharmony_ci end - offset); 213162306a36Sopenharmony_ci if (ret) 213262306a36Sopenharmony_ci return ret; 213362306a36Sopenharmony_ci 213462306a36Sopenharmony_ciout: 213562306a36Sopenharmony_ci btrfs_release_path(path); 213662306a36Sopenharmony_ci 213762306a36Sopenharmony_ci hole_em = alloc_extent_map(); 213862306a36Sopenharmony_ci if (!hole_em) { 213962306a36Sopenharmony_ci btrfs_drop_extent_map_range(inode, offset, end - 1, false); 214062306a36Sopenharmony_ci btrfs_set_inode_full_sync(inode); 214162306a36Sopenharmony_ci } else { 214262306a36Sopenharmony_ci hole_em->start = offset; 214362306a36Sopenharmony_ci hole_em->len = end - offset; 214462306a36Sopenharmony_ci hole_em->ram_bytes = hole_em->len; 214562306a36Sopenharmony_ci hole_em->orig_start = offset; 214662306a36Sopenharmony_ci 214762306a36Sopenharmony_ci hole_em->block_start = EXTENT_MAP_HOLE; 
214862306a36Sopenharmony_ci hole_em->block_len = 0; 214962306a36Sopenharmony_ci hole_em->orig_block_len = 0; 215062306a36Sopenharmony_ci hole_em->compress_type = BTRFS_COMPRESS_NONE; 215162306a36Sopenharmony_ci hole_em->generation = trans->transid; 215262306a36Sopenharmony_ci 215362306a36Sopenharmony_ci ret = btrfs_replace_extent_map_range(inode, hole_em, true); 215462306a36Sopenharmony_ci free_extent_map(hole_em); 215562306a36Sopenharmony_ci if (ret) 215662306a36Sopenharmony_ci btrfs_set_inode_full_sync(inode); 215762306a36Sopenharmony_ci } 215862306a36Sopenharmony_ci 215962306a36Sopenharmony_ci return 0; 216062306a36Sopenharmony_ci} 216162306a36Sopenharmony_ci 216262306a36Sopenharmony_ci/* 216362306a36Sopenharmony_ci * Find a hole extent on given inode and change start/len to the end of hole 216462306a36Sopenharmony_ci * extent.(hole/vacuum extent whose em->start <= start && 216562306a36Sopenharmony_ci * em->start + em->len > start) 216662306a36Sopenharmony_ci * When a hole extent is found, return 1 and modify start/len. 216762306a36Sopenharmony_ci */ 216862306a36Sopenharmony_cistatic int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 216962306a36Sopenharmony_ci{ 217062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 217162306a36Sopenharmony_ci struct extent_map *em; 217262306a36Sopenharmony_ci int ret = 0; 217362306a36Sopenharmony_ci 217462306a36Sopenharmony_ci em = btrfs_get_extent(inode, NULL, 0, 217562306a36Sopenharmony_ci round_down(*start, fs_info->sectorsize), 217662306a36Sopenharmony_ci round_up(*len, fs_info->sectorsize)); 217762306a36Sopenharmony_ci if (IS_ERR(em)) 217862306a36Sopenharmony_ci return PTR_ERR(em); 217962306a36Sopenharmony_ci 218062306a36Sopenharmony_ci /* Hole or vacuum extent(only exists in no-hole mode) */ 218162306a36Sopenharmony_ci if (em->block_start == EXTENT_MAP_HOLE) { 218262306a36Sopenharmony_ci ret = 1; 218362306a36Sopenharmony_ci *len = em->start + em->len > *start + *len ? 
218462306a36Sopenharmony_ci 0 : *start + *len - em->start - em->len; 218562306a36Sopenharmony_ci *start = em->start + em->len; 218662306a36Sopenharmony_ci } 218762306a36Sopenharmony_ci free_extent_map(em); 218862306a36Sopenharmony_ci return ret; 218962306a36Sopenharmony_ci} 219062306a36Sopenharmony_ci 219162306a36Sopenharmony_cistatic void btrfs_punch_hole_lock_range(struct inode *inode, 219262306a36Sopenharmony_ci const u64 lockstart, 219362306a36Sopenharmony_ci const u64 lockend, 219462306a36Sopenharmony_ci struct extent_state **cached_state) 219562306a36Sopenharmony_ci{ 219662306a36Sopenharmony_ci /* 219762306a36Sopenharmony_ci * For subpage case, if the range is not at page boundary, we could 219862306a36Sopenharmony_ci * have pages at the leading/tailing part of the range. 219962306a36Sopenharmony_ci * This could lead to dead loop since filemap_range_has_page() 220062306a36Sopenharmony_ci * will always return true. 220162306a36Sopenharmony_ci * So here we need to do extra page alignment for 220262306a36Sopenharmony_ci * filemap_range_has_page(). 
220362306a36Sopenharmony_ci */ 220462306a36Sopenharmony_ci const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); 220562306a36Sopenharmony_ci const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; 220662306a36Sopenharmony_ci 220762306a36Sopenharmony_ci while (1) { 220862306a36Sopenharmony_ci truncate_pagecache_range(inode, lockstart, lockend); 220962306a36Sopenharmony_ci 221062306a36Sopenharmony_ci lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 221162306a36Sopenharmony_ci cached_state); 221262306a36Sopenharmony_ci /* 221362306a36Sopenharmony_ci * We can't have ordered extents in the range, nor dirty/writeback 221462306a36Sopenharmony_ci * pages, because we have locked the inode's VFS lock in exclusive 221562306a36Sopenharmony_ci * mode, we have locked the inode's i_mmap_lock in exclusive mode, 221662306a36Sopenharmony_ci * we have flushed all delalloc in the range and we have waited 221762306a36Sopenharmony_ci * for any ordered extents in the range to complete. 221862306a36Sopenharmony_ci * We can race with anyone reading pages from this range, so after 221962306a36Sopenharmony_ci * locking the range check if we have pages in the range, and if 222062306a36Sopenharmony_ci * we do, unlock the range and retry. 
222162306a36Sopenharmony_ci */ 222262306a36Sopenharmony_ci if (!filemap_range_has_page(inode->i_mapping, page_lockstart, 222362306a36Sopenharmony_ci page_lockend)) 222462306a36Sopenharmony_ci break; 222562306a36Sopenharmony_ci 222662306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 222762306a36Sopenharmony_ci cached_state); 222862306a36Sopenharmony_ci } 222962306a36Sopenharmony_ci 223062306a36Sopenharmony_ci btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); 223162306a36Sopenharmony_ci} 223262306a36Sopenharmony_ci 223362306a36Sopenharmony_cistatic int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, 223462306a36Sopenharmony_ci struct btrfs_inode *inode, 223562306a36Sopenharmony_ci struct btrfs_path *path, 223662306a36Sopenharmony_ci struct btrfs_replace_extent_info *extent_info, 223762306a36Sopenharmony_ci const u64 replace_len, 223862306a36Sopenharmony_ci const u64 bytes_to_drop) 223962306a36Sopenharmony_ci{ 224062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = trans->fs_info; 224162306a36Sopenharmony_ci struct btrfs_root *root = inode->root; 224262306a36Sopenharmony_ci struct btrfs_file_extent_item *extent; 224362306a36Sopenharmony_ci struct extent_buffer *leaf; 224462306a36Sopenharmony_ci struct btrfs_key key; 224562306a36Sopenharmony_ci int slot; 224662306a36Sopenharmony_ci struct btrfs_ref ref = { 0 }; 224762306a36Sopenharmony_ci int ret; 224862306a36Sopenharmony_ci 224962306a36Sopenharmony_ci if (replace_len == 0) 225062306a36Sopenharmony_ci return 0; 225162306a36Sopenharmony_ci 225262306a36Sopenharmony_ci if (extent_info->disk_offset == 0 && 225362306a36Sopenharmony_ci btrfs_fs_incompat(fs_info, NO_HOLES)) { 225462306a36Sopenharmony_ci btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 225562306a36Sopenharmony_ci return 0; 225662306a36Sopenharmony_ci } 225762306a36Sopenharmony_ci 225862306a36Sopenharmony_ci key.objectid = btrfs_ino(inode); 225962306a36Sopenharmony_ci key.type = 
BTRFS_EXTENT_DATA_KEY; 226062306a36Sopenharmony_ci key.offset = extent_info->file_offset; 226162306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, root, path, &key, 226262306a36Sopenharmony_ci sizeof(struct btrfs_file_extent_item)); 226362306a36Sopenharmony_ci if (ret) 226462306a36Sopenharmony_ci return ret; 226562306a36Sopenharmony_ci leaf = path->nodes[0]; 226662306a36Sopenharmony_ci slot = path->slots[0]; 226762306a36Sopenharmony_ci write_extent_buffer(leaf, extent_info->extent_buf, 226862306a36Sopenharmony_ci btrfs_item_ptr_offset(leaf, slot), 226962306a36Sopenharmony_ci sizeof(struct btrfs_file_extent_item)); 227062306a36Sopenharmony_ci extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); 227162306a36Sopenharmony_ci ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); 227262306a36Sopenharmony_ci btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); 227362306a36Sopenharmony_ci btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); 227462306a36Sopenharmony_ci if (extent_info->is_new_extent) 227562306a36Sopenharmony_ci btrfs_set_file_extent_generation(leaf, extent, trans->transid); 227662306a36Sopenharmony_ci btrfs_mark_buffer_dirty(trans, leaf); 227762306a36Sopenharmony_ci btrfs_release_path(path); 227862306a36Sopenharmony_ci 227962306a36Sopenharmony_ci ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, 228062306a36Sopenharmony_ci replace_len); 228162306a36Sopenharmony_ci if (ret) 228262306a36Sopenharmony_ci return ret; 228362306a36Sopenharmony_ci 228462306a36Sopenharmony_ci /* If it's a hole, nothing more needs to be done. 
*/ 228562306a36Sopenharmony_ci if (extent_info->disk_offset == 0) { 228662306a36Sopenharmony_ci btrfs_update_inode_bytes(inode, 0, bytes_to_drop); 228762306a36Sopenharmony_ci return 0; 228862306a36Sopenharmony_ci } 228962306a36Sopenharmony_ci 229062306a36Sopenharmony_ci btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); 229162306a36Sopenharmony_ci 229262306a36Sopenharmony_ci if (extent_info->is_new_extent && extent_info->insertions == 0) { 229362306a36Sopenharmony_ci key.objectid = extent_info->disk_offset; 229462306a36Sopenharmony_ci key.type = BTRFS_EXTENT_ITEM_KEY; 229562306a36Sopenharmony_ci key.offset = extent_info->disk_len; 229662306a36Sopenharmony_ci ret = btrfs_alloc_reserved_file_extent(trans, root, 229762306a36Sopenharmony_ci btrfs_ino(inode), 229862306a36Sopenharmony_ci extent_info->file_offset, 229962306a36Sopenharmony_ci extent_info->qgroup_reserved, 230062306a36Sopenharmony_ci &key); 230162306a36Sopenharmony_ci } else { 230262306a36Sopenharmony_ci u64 ref_offset; 230362306a36Sopenharmony_ci 230462306a36Sopenharmony_ci btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, 230562306a36Sopenharmony_ci extent_info->disk_offset, 230662306a36Sopenharmony_ci extent_info->disk_len, 0); 230762306a36Sopenharmony_ci ref_offset = extent_info->file_offset - extent_info->data_offset; 230862306a36Sopenharmony_ci btrfs_init_data_ref(&ref, root->root_key.objectid, 230962306a36Sopenharmony_ci btrfs_ino(inode), ref_offset, 0, false); 231062306a36Sopenharmony_ci ret = btrfs_inc_extent_ref(trans, &ref); 231162306a36Sopenharmony_ci } 231262306a36Sopenharmony_ci 231362306a36Sopenharmony_ci extent_info->insertions++; 231462306a36Sopenharmony_ci 231562306a36Sopenharmony_ci return ret; 231662306a36Sopenharmony_ci} 231762306a36Sopenharmony_ci 231862306a36Sopenharmony_ci/* 231962306a36Sopenharmony_ci * The respective range must have been previously locked, as well as the inode. 232062306a36Sopenharmony_ci * The end offset is inclusive (last byte of the range). 
232162306a36Sopenharmony_ci * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing 232262306a36Sopenharmony_ci * the file range with an extent. 232362306a36Sopenharmony_ci * When not punching a hole, we don't want to end up in a state where we dropped 232462306a36Sopenharmony_ci * extents without inserting a new one, so we must abort the transaction to avoid 232562306a36Sopenharmony_ci * a corruption. 232662306a36Sopenharmony_ci */ 232762306a36Sopenharmony_ciint btrfs_replace_file_extents(struct btrfs_inode *inode, 232862306a36Sopenharmony_ci struct btrfs_path *path, const u64 start, 232962306a36Sopenharmony_ci const u64 end, 233062306a36Sopenharmony_ci struct btrfs_replace_extent_info *extent_info, 233162306a36Sopenharmony_ci struct btrfs_trans_handle **trans_out) 233262306a36Sopenharmony_ci{ 233362306a36Sopenharmony_ci struct btrfs_drop_extents_args drop_args = { 0 }; 233462306a36Sopenharmony_ci struct btrfs_root *root = inode->root; 233562306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = root->fs_info; 233662306a36Sopenharmony_ci u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); 233762306a36Sopenharmony_ci u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); 233862306a36Sopenharmony_ci struct btrfs_trans_handle *trans = NULL; 233962306a36Sopenharmony_ci struct btrfs_block_rsv *rsv; 234062306a36Sopenharmony_ci unsigned int rsv_count; 234162306a36Sopenharmony_ci u64 cur_offset; 234262306a36Sopenharmony_ci u64 len = end - start; 234362306a36Sopenharmony_ci int ret = 0; 234462306a36Sopenharmony_ci 234562306a36Sopenharmony_ci if (end <= start) 234662306a36Sopenharmony_ci return -EINVAL; 234762306a36Sopenharmony_ci 234862306a36Sopenharmony_ci rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); 234962306a36Sopenharmony_ci if (!rsv) { 235062306a36Sopenharmony_ci ret = -ENOMEM; 235162306a36Sopenharmony_ci goto out; 235262306a36Sopenharmony_ci } 235362306a36Sopenharmony_ci rsv->size = 
btrfs_calc_insert_metadata_size(fs_info, 1); 235462306a36Sopenharmony_ci rsv->failfast = true; 235562306a36Sopenharmony_ci 235662306a36Sopenharmony_ci /* 235762306a36Sopenharmony_ci * 1 - update the inode 235862306a36Sopenharmony_ci * 1 - removing the extents in the range 235962306a36Sopenharmony_ci * 1 - adding the hole extent if no_holes isn't set or if we are 236062306a36Sopenharmony_ci * replacing the range with a new extent 236162306a36Sopenharmony_ci */ 236262306a36Sopenharmony_ci if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) 236362306a36Sopenharmony_ci rsv_count = 3; 236462306a36Sopenharmony_ci else 236562306a36Sopenharmony_ci rsv_count = 2; 236662306a36Sopenharmony_ci 236762306a36Sopenharmony_ci trans = btrfs_start_transaction(root, rsv_count); 236862306a36Sopenharmony_ci if (IS_ERR(trans)) { 236962306a36Sopenharmony_ci ret = PTR_ERR(trans); 237062306a36Sopenharmony_ci trans = NULL; 237162306a36Sopenharmony_ci goto out_free; 237262306a36Sopenharmony_ci } 237362306a36Sopenharmony_ci 237462306a36Sopenharmony_ci ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, 237562306a36Sopenharmony_ci min_size, false); 237662306a36Sopenharmony_ci if (WARN_ON(ret)) 237762306a36Sopenharmony_ci goto out_trans; 237862306a36Sopenharmony_ci trans->block_rsv = rsv; 237962306a36Sopenharmony_ci 238062306a36Sopenharmony_ci cur_offset = start; 238162306a36Sopenharmony_ci drop_args.path = path; 238262306a36Sopenharmony_ci drop_args.end = end + 1; 238362306a36Sopenharmony_ci drop_args.drop_cache = true; 238462306a36Sopenharmony_ci while (cur_offset < end) { 238562306a36Sopenharmony_ci drop_args.start = cur_offset; 238662306a36Sopenharmony_ci ret = btrfs_drop_extents(trans, root, inode, &drop_args); 238762306a36Sopenharmony_ci /* If we are punching a hole decrement the inode's byte count */ 238862306a36Sopenharmony_ci if (!extent_info) 238962306a36Sopenharmony_ci btrfs_update_inode_bytes(inode, 0, 239062306a36Sopenharmony_ci drop_args.bytes_found); 
239162306a36Sopenharmony_ci if (ret != -ENOSPC) { 239262306a36Sopenharmony_ci /* 239362306a36Sopenharmony_ci * The only time we don't want to abort is if we are 239462306a36Sopenharmony_ci * attempting to clone a partial inline extent, in which 239562306a36Sopenharmony_ci * case we'll get EOPNOTSUPP. However if we aren't 239662306a36Sopenharmony_ci * clone we need to abort no matter what, because if we 239762306a36Sopenharmony_ci * got EOPNOTSUPP via prealloc then we messed up and 239862306a36Sopenharmony_ci * need to abort. 239962306a36Sopenharmony_ci */ 240062306a36Sopenharmony_ci if (ret && 240162306a36Sopenharmony_ci (ret != -EOPNOTSUPP || 240262306a36Sopenharmony_ci (extent_info && extent_info->is_new_extent))) 240362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 240462306a36Sopenharmony_ci break; 240562306a36Sopenharmony_ci } 240662306a36Sopenharmony_ci 240762306a36Sopenharmony_ci trans->block_rsv = &fs_info->trans_block_rsv; 240862306a36Sopenharmony_ci 240962306a36Sopenharmony_ci if (!extent_info && cur_offset < drop_args.drop_end && 241062306a36Sopenharmony_ci cur_offset < ino_size) { 241162306a36Sopenharmony_ci ret = fill_holes(trans, inode, path, cur_offset, 241262306a36Sopenharmony_ci drop_args.drop_end); 241362306a36Sopenharmony_ci if (ret) { 241462306a36Sopenharmony_ci /* 241562306a36Sopenharmony_ci * If we failed then we didn't insert our hole 241662306a36Sopenharmony_ci * entries for the area we dropped, so now the 241762306a36Sopenharmony_ci * fs is corrupted, so we must abort the 241862306a36Sopenharmony_ci * transaction. 
241962306a36Sopenharmony_ci */ 242062306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 242162306a36Sopenharmony_ci break; 242262306a36Sopenharmony_ci } 242362306a36Sopenharmony_ci } else if (!extent_info && cur_offset < drop_args.drop_end) { 242462306a36Sopenharmony_ci /* 242562306a36Sopenharmony_ci * We are past the i_size here, but since we didn't 242662306a36Sopenharmony_ci * insert holes we need to clear the mapped area so we 242762306a36Sopenharmony_ci * know to not set disk_i_size in this area until a new 242862306a36Sopenharmony_ci * file extent is inserted here. 242962306a36Sopenharmony_ci */ 243062306a36Sopenharmony_ci ret = btrfs_inode_clear_file_extent_range(inode, 243162306a36Sopenharmony_ci cur_offset, 243262306a36Sopenharmony_ci drop_args.drop_end - cur_offset); 243362306a36Sopenharmony_ci if (ret) { 243462306a36Sopenharmony_ci /* 243562306a36Sopenharmony_ci * We couldn't clear our area, so we could 243662306a36Sopenharmony_ci * presumably adjust up and corrupt the fs, so 243762306a36Sopenharmony_ci * we need to abort. 
243862306a36Sopenharmony_ci */ 243962306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 244062306a36Sopenharmony_ci break; 244162306a36Sopenharmony_ci } 244262306a36Sopenharmony_ci } 244362306a36Sopenharmony_ci 244462306a36Sopenharmony_ci if (extent_info && 244562306a36Sopenharmony_ci drop_args.drop_end > extent_info->file_offset) { 244662306a36Sopenharmony_ci u64 replace_len = drop_args.drop_end - 244762306a36Sopenharmony_ci extent_info->file_offset; 244862306a36Sopenharmony_ci 244962306a36Sopenharmony_ci ret = btrfs_insert_replace_extent(trans, inode, path, 245062306a36Sopenharmony_ci extent_info, replace_len, 245162306a36Sopenharmony_ci drop_args.bytes_found); 245262306a36Sopenharmony_ci if (ret) { 245362306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 245462306a36Sopenharmony_ci break; 245562306a36Sopenharmony_ci } 245662306a36Sopenharmony_ci extent_info->data_len -= replace_len; 245762306a36Sopenharmony_ci extent_info->data_offset += replace_len; 245862306a36Sopenharmony_ci extent_info->file_offset += replace_len; 245962306a36Sopenharmony_ci } 246062306a36Sopenharmony_ci 246162306a36Sopenharmony_ci /* 246262306a36Sopenharmony_ci * We are releasing our handle on the transaction, balance the 246362306a36Sopenharmony_ci * dirty pages of the btree inode and flush delayed items, and 246462306a36Sopenharmony_ci * then get a new transaction handle, which may now point to a 246562306a36Sopenharmony_ci * new transaction in case someone else may have committed the 246662306a36Sopenharmony_ci * transaction we used to replace/drop file extent items. So 246762306a36Sopenharmony_ci * bump the inode's iversion and update mtime and ctime except 246862306a36Sopenharmony_ci * if we are called from a dedupe context. This is because a 246962306a36Sopenharmony_ci * power failure/crash may happen after the transaction is 247062306a36Sopenharmony_ci * committed and before we finish replacing/dropping all the 247162306a36Sopenharmony_ci * file extent items we need. 
247262306a36Sopenharmony_ci */ 247362306a36Sopenharmony_ci inode_inc_iversion(&inode->vfs_inode); 247462306a36Sopenharmony_ci 247562306a36Sopenharmony_ci if (!extent_info || extent_info->update_times) 247662306a36Sopenharmony_ci inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode); 247762306a36Sopenharmony_ci 247862306a36Sopenharmony_ci ret = btrfs_update_inode(trans, root, inode); 247962306a36Sopenharmony_ci if (ret) 248062306a36Sopenharmony_ci break; 248162306a36Sopenharmony_ci 248262306a36Sopenharmony_ci btrfs_end_transaction(trans); 248362306a36Sopenharmony_ci btrfs_btree_balance_dirty(fs_info); 248462306a36Sopenharmony_ci 248562306a36Sopenharmony_ci trans = btrfs_start_transaction(root, rsv_count); 248662306a36Sopenharmony_ci if (IS_ERR(trans)) { 248762306a36Sopenharmony_ci ret = PTR_ERR(trans); 248862306a36Sopenharmony_ci trans = NULL; 248962306a36Sopenharmony_ci break; 249062306a36Sopenharmony_ci } 249162306a36Sopenharmony_ci 249262306a36Sopenharmony_ci ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, 249362306a36Sopenharmony_ci rsv, min_size, false); 249462306a36Sopenharmony_ci if (WARN_ON(ret)) 249562306a36Sopenharmony_ci break; 249662306a36Sopenharmony_ci trans->block_rsv = rsv; 249762306a36Sopenharmony_ci 249862306a36Sopenharmony_ci cur_offset = drop_args.drop_end; 249962306a36Sopenharmony_ci len = end - cur_offset; 250062306a36Sopenharmony_ci if (!extent_info && len) { 250162306a36Sopenharmony_ci ret = find_first_non_hole(inode, &cur_offset, &len); 250262306a36Sopenharmony_ci if (unlikely(ret < 0)) 250362306a36Sopenharmony_ci break; 250462306a36Sopenharmony_ci if (ret && !len) { 250562306a36Sopenharmony_ci ret = 0; 250662306a36Sopenharmony_ci break; 250762306a36Sopenharmony_ci } 250862306a36Sopenharmony_ci } 250962306a36Sopenharmony_ci } 251062306a36Sopenharmony_ci 251162306a36Sopenharmony_ci /* 251262306a36Sopenharmony_ci * If we were cloning, force the next fsync to be a full one since we 251362306a36Sopenharmony_ci * we 
replaced (or just dropped in the case of cloning holes when 251462306a36Sopenharmony_ci * NO_HOLES is enabled) file extent items and did not setup new extent 251562306a36Sopenharmony_ci * maps for the replacement extents (or holes). 251662306a36Sopenharmony_ci */ 251762306a36Sopenharmony_ci if (extent_info && !extent_info->is_new_extent) 251862306a36Sopenharmony_ci btrfs_set_inode_full_sync(inode); 251962306a36Sopenharmony_ci 252062306a36Sopenharmony_ci if (ret) 252162306a36Sopenharmony_ci goto out_trans; 252262306a36Sopenharmony_ci 252362306a36Sopenharmony_ci trans->block_rsv = &fs_info->trans_block_rsv; 252462306a36Sopenharmony_ci /* 252562306a36Sopenharmony_ci * If we are using the NO_HOLES feature we might have had already an 252662306a36Sopenharmony_ci * hole that overlaps a part of the region [lockstart, lockend] and 252762306a36Sopenharmony_ci * ends at (or beyond) lockend. Since we have no file extent items to 252862306a36Sopenharmony_ci * represent holes, drop_end can be less than lockend and so we must 252962306a36Sopenharmony_ci * make sure we have an extent map representing the existing hole (the 253062306a36Sopenharmony_ci * call to __btrfs_drop_extents() might have dropped the existing extent 253162306a36Sopenharmony_ci * map representing the existing hole), otherwise the fast fsync path 253262306a36Sopenharmony_ci * will not record the existence of the hole region 253362306a36Sopenharmony_ci * [existing_hole_start, lockend]. 253462306a36Sopenharmony_ci */ 253562306a36Sopenharmony_ci if (drop_args.drop_end <= end) 253662306a36Sopenharmony_ci drop_args.drop_end = end + 1; 253762306a36Sopenharmony_ci /* 253862306a36Sopenharmony_ci * Don't insert file hole extent item if it's for a range beyond eof 253962306a36Sopenharmony_ci * (because it's useless) or if it represents a 0 bytes range (when 254062306a36Sopenharmony_ci * cur_offset == drop_end). 
254162306a36Sopenharmony_ci */ 254262306a36Sopenharmony_ci if (!extent_info && cur_offset < ino_size && 254362306a36Sopenharmony_ci cur_offset < drop_args.drop_end) { 254462306a36Sopenharmony_ci ret = fill_holes(trans, inode, path, cur_offset, 254562306a36Sopenharmony_ci drop_args.drop_end); 254662306a36Sopenharmony_ci if (ret) { 254762306a36Sopenharmony_ci /* Same comment as above. */ 254862306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 254962306a36Sopenharmony_ci goto out_trans; 255062306a36Sopenharmony_ci } 255162306a36Sopenharmony_ci } else if (!extent_info && cur_offset < drop_args.drop_end) { 255262306a36Sopenharmony_ci /* See the comment in the loop above for the reasoning here. */ 255362306a36Sopenharmony_ci ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, 255462306a36Sopenharmony_ci drop_args.drop_end - cur_offset); 255562306a36Sopenharmony_ci if (ret) { 255662306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 255762306a36Sopenharmony_ci goto out_trans; 255862306a36Sopenharmony_ci } 255962306a36Sopenharmony_ci 256062306a36Sopenharmony_ci } 256162306a36Sopenharmony_ci if (extent_info) { 256262306a36Sopenharmony_ci ret = btrfs_insert_replace_extent(trans, inode, path, 256362306a36Sopenharmony_ci extent_info, extent_info->data_len, 256462306a36Sopenharmony_ci drop_args.bytes_found); 256562306a36Sopenharmony_ci if (ret) { 256662306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 256762306a36Sopenharmony_ci goto out_trans; 256862306a36Sopenharmony_ci } 256962306a36Sopenharmony_ci } 257062306a36Sopenharmony_ci 257162306a36Sopenharmony_ciout_trans: 257262306a36Sopenharmony_ci if (!trans) 257362306a36Sopenharmony_ci goto out_free; 257462306a36Sopenharmony_ci 257562306a36Sopenharmony_ci trans->block_rsv = &fs_info->trans_block_rsv; 257662306a36Sopenharmony_ci if (ret) 257762306a36Sopenharmony_ci btrfs_end_transaction(trans); 257862306a36Sopenharmony_ci else 257962306a36Sopenharmony_ci *trans_out = trans; 
258062306a36Sopenharmony_ciout_free: 258162306a36Sopenharmony_ci btrfs_free_block_rsv(fs_info, rsv); 258262306a36Sopenharmony_ciout: 258362306a36Sopenharmony_ci return ret; 258462306a36Sopenharmony_ci} 258562306a36Sopenharmony_ci 258662306a36Sopenharmony_cistatic int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) 258762306a36Sopenharmony_ci{ 258862306a36Sopenharmony_ci struct inode *inode = file_inode(file); 258962306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 259062306a36Sopenharmony_ci struct btrfs_root *root = BTRFS_I(inode)->root; 259162306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 259262306a36Sopenharmony_ci struct btrfs_path *path; 259362306a36Sopenharmony_ci struct btrfs_trans_handle *trans = NULL; 259462306a36Sopenharmony_ci u64 lockstart; 259562306a36Sopenharmony_ci u64 lockend; 259662306a36Sopenharmony_ci u64 tail_start; 259762306a36Sopenharmony_ci u64 tail_len; 259862306a36Sopenharmony_ci u64 orig_start = offset; 259962306a36Sopenharmony_ci int ret = 0; 260062306a36Sopenharmony_ci bool same_block; 260162306a36Sopenharmony_ci u64 ino_size; 260262306a36Sopenharmony_ci bool truncated_block = false; 260362306a36Sopenharmony_ci bool updated_inode = false; 260462306a36Sopenharmony_ci 260562306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 260662306a36Sopenharmony_ci 260762306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode, offset, len); 260862306a36Sopenharmony_ci if (ret) 260962306a36Sopenharmony_ci goto out_only_mutex; 261062306a36Sopenharmony_ci 261162306a36Sopenharmony_ci ino_size = round_up(inode->i_size, fs_info->sectorsize); 261262306a36Sopenharmony_ci ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 261362306a36Sopenharmony_ci if (ret < 0) 261462306a36Sopenharmony_ci goto out_only_mutex; 261562306a36Sopenharmony_ci if (ret && !len) { 261662306a36Sopenharmony_ci /* Already in a large hole */ 261762306a36Sopenharmony_ci ret = 0; 
261862306a36Sopenharmony_ci goto out_only_mutex; 261962306a36Sopenharmony_ci } 262062306a36Sopenharmony_ci 262162306a36Sopenharmony_ci ret = file_modified(file); 262262306a36Sopenharmony_ci if (ret) 262362306a36Sopenharmony_ci goto out_only_mutex; 262462306a36Sopenharmony_ci 262562306a36Sopenharmony_ci lockstart = round_up(offset, fs_info->sectorsize); 262662306a36Sopenharmony_ci lockend = round_down(offset + len, fs_info->sectorsize) - 1; 262762306a36Sopenharmony_ci same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) 262862306a36Sopenharmony_ci == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); 262962306a36Sopenharmony_ci /* 263062306a36Sopenharmony_ci * We needn't truncate any block which is beyond the end of the file 263162306a36Sopenharmony_ci * because we are sure there is no data there. 263262306a36Sopenharmony_ci */ 263362306a36Sopenharmony_ci /* 263462306a36Sopenharmony_ci * Only do this if we are in the same block and we aren't doing the 263562306a36Sopenharmony_ci * entire block. 
263662306a36Sopenharmony_ci */ 263762306a36Sopenharmony_ci if (same_block && len < fs_info->sectorsize) { 263862306a36Sopenharmony_ci if (offset < ino_size) { 263962306a36Sopenharmony_ci truncated_block = true; 264062306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, 264162306a36Sopenharmony_ci 0); 264262306a36Sopenharmony_ci } else { 264362306a36Sopenharmony_ci ret = 0; 264462306a36Sopenharmony_ci } 264562306a36Sopenharmony_ci goto out_only_mutex; 264662306a36Sopenharmony_ci } 264762306a36Sopenharmony_ci 264862306a36Sopenharmony_ci /* zero back part of the first block */ 264962306a36Sopenharmony_ci if (offset < ino_size) { 265062306a36Sopenharmony_ci truncated_block = true; 265162306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); 265262306a36Sopenharmony_ci if (ret) { 265362306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 265462306a36Sopenharmony_ci return ret; 265562306a36Sopenharmony_ci } 265662306a36Sopenharmony_ci } 265762306a36Sopenharmony_ci 265862306a36Sopenharmony_ci /* Check the aligned pages after the first unaligned page, 265962306a36Sopenharmony_ci * if offset != orig_start, which means the first unaligned page 266062306a36Sopenharmony_ci * including several following pages are already in holes, 266162306a36Sopenharmony_ci * the extra check can be skipped */ 266262306a36Sopenharmony_ci if (offset == orig_start) { 266362306a36Sopenharmony_ci /* after truncate page, check hole again */ 266462306a36Sopenharmony_ci len = offset + len - lockstart; 266562306a36Sopenharmony_ci offset = lockstart; 266662306a36Sopenharmony_ci ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); 266762306a36Sopenharmony_ci if (ret < 0) 266862306a36Sopenharmony_ci goto out_only_mutex; 266962306a36Sopenharmony_ci if (ret && !len) { 267062306a36Sopenharmony_ci ret = 0; 267162306a36Sopenharmony_ci goto out_only_mutex; 267262306a36Sopenharmony_ci } 267362306a36Sopenharmony_ci lockstart = offset; 
267462306a36Sopenharmony_ci } 267562306a36Sopenharmony_ci 267662306a36Sopenharmony_ci /* Check the tail unaligned part is in a hole */ 267762306a36Sopenharmony_ci tail_start = lockend + 1; 267862306a36Sopenharmony_ci tail_len = offset + len - tail_start; 267962306a36Sopenharmony_ci if (tail_len) { 268062306a36Sopenharmony_ci ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); 268162306a36Sopenharmony_ci if (unlikely(ret < 0)) 268262306a36Sopenharmony_ci goto out_only_mutex; 268362306a36Sopenharmony_ci if (!ret) { 268462306a36Sopenharmony_ci /* zero the front end of the last page */ 268562306a36Sopenharmony_ci if (tail_start + tail_len < ino_size) { 268662306a36Sopenharmony_ci truncated_block = true; 268762306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), 268862306a36Sopenharmony_ci tail_start + tail_len, 268962306a36Sopenharmony_ci 0, 1); 269062306a36Sopenharmony_ci if (ret) 269162306a36Sopenharmony_ci goto out_only_mutex; 269262306a36Sopenharmony_ci } 269362306a36Sopenharmony_ci } 269462306a36Sopenharmony_ci } 269562306a36Sopenharmony_ci 269662306a36Sopenharmony_ci if (lockend < lockstart) { 269762306a36Sopenharmony_ci ret = 0; 269862306a36Sopenharmony_ci goto out_only_mutex; 269962306a36Sopenharmony_ci } 270062306a36Sopenharmony_ci 270162306a36Sopenharmony_ci btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); 270262306a36Sopenharmony_ci 270362306a36Sopenharmony_ci path = btrfs_alloc_path(); 270462306a36Sopenharmony_ci if (!path) { 270562306a36Sopenharmony_ci ret = -ENOMEM; 270662306a36Sopenharmony_ci goto out; 270762306a36Sopenharmony_ci } 270862306a36Sopenharmony_ci 270962306a36Sopenharmony_ci ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, 271062306a36Sopenharmony_ci lockend, NULL, &trans); 271162306a36Sopenharmony_ci btrfs_free_path(path); 271262306a36Sopenharmony_ci if (ret) 271362306a36Sopenharmony_ci goto out; 271462306a36Sopenharmony_ci 271562306a36Sopenharmony_ci ASSERT(trans != NULL); 
271662306a36Sopenharmony_ci inode_inc_iversion(inode); 271762306a36Sopenharmony_ci inode->i_mtime = inode_set_ctime_current(inode); 271862306a36Sopenharmony_ci ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 271962306a36Sopenharmony_ci updated_inode = true; 272062306a36Sopenharmony_ci btrfs_end_transaction(trans); 272162306a36Sopenharmony_ci btrfs_btree_balance_dirty(fs_info); 272262306a36Sopenharmony_ciout: 272362306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 272462306a36Sopenharmony_ci &cached_state); 272562306a36Sopenharmony_ciout_only_mutex: 272662306a36Sopenharmony_ci if (!updated_inode && truncated_block && !ret) { 272762306a36Sopenharmony_ci /* 272862306a36Sopenharmony_ci * If we only end up zeroing part of a page, we still need to 272962306a36Sopenharmony_ci * update the inode item, so that all the time fields are 273062306a36Sopenharmony_ci * updated as well as the necessary btrfs inode in memory fields 273162306a36Sopenharmony_ci * for detecting, at fsync time, if the inode isn't yet in the 273262306a36Sopenharmony_ci * log tree or it's there but not up to date. 
273362306a36Sopenharmony_ci */ 273462306a36Sopenharmony_ci struct timespec64 now = inode_set_ctime_current(inode); 273562306a36Sopenharmony_ci 273662306a36Sopenharmony_ci inode_inc_iversion(inode); 273762306a36Sopenharmony_ci inode->i_mtime = now; 273862306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 1); 273962306a36Sopenharmony_ci if (IS_ERR(trans)) { 274062306a36Sopenharmony_ci ret = PTR_ERR(trans); 274162306a36Sopenharmony_ci } else { 274262306a36Sopenharmony_ci int ret2; 274362306a36Sopenharmony_ci 274462306a36Sopenharmony_ci ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 274562306a36Sopenharmony_ci ret2 = btrfs_end_transaction(trans); 274662306a36Sopenharmony_ci if (!ret) 274762306a36Sopenharmony_ci ret = ret2; 274862306a36Sopenharmony_ci } 274962306a36Sopenharmony_ci } 275062306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 275162306a36Sopenharmony_ci return ret; 275262306a36Sopenharmony_ci} 275362306a36Sopenharmony_ci 275462306a36Sopenharmony_ci/* Helper structure to record which range is already reserved */ 275562306a36Sopenharmony_cistruct falloc_range { 275662306a36Sopenharmony_ci struct list_head list; 275762306a36Sopenharmony_ci u64 start; 275862306a36Sopenharmony_ci u64 len; 275962306a36Sopenharmony_ci}; 276062306a36Sopenharmony_ci 276162306a36Sopenharmony_ci/* 276262306a36Sopenharmony_ci * Helper function to add falloc range 276362306a36Sopenharmony_ci * 276462306a36Sopenharmony_ci * Caller should have locked the larger range of extent containing 276562306a36Sopenharmony_ci * [start, len) 276662306a36Sopenharmony_ci */ 276762306a36Sopenharmony_cistatic int add_falloc_range(struct list_head *head, u64 start, u64 len) 276862306a36Sopenharmony_ci{ 276962306a36Sopenharmony_ci struct falloc_range *range = NULL; 277062306a36Sopenharmony_ci 277162306a36Sopenharmony_ci if (!list_empty(head)) { 277262306a36Sopenharmony_ci /* 277362306a36Sopenharmony_ci * As fallocate iterates by bytenr order, we only need to check 
277462306a36Sopenharmony_ci * the last range. 277562306a36Sopenharmony_ci */ 277662306a36Sopenharmony_ci range = list_last_entry(head, struct falloc_range, list); 277762306a36Sopenharmony_ci if (range->start + range->len == start) { 277862306a36Sopenharmony_ci range->len += len; 277962306a36Sopenharmony_ci return 0; 278062306a36Sopenharmony_ci } 278162306a36Sopenharmony_ci } 278262306a36Sopenharmony_ci 278362306a36Sopenharmony_ci range = kmalloc(sizeof(*range), GFP_KERNEL); 278462306a36Sopenharmony_ci if (!range) 278562306a36Sopenharmony_ci return -ENOMEM; 278662306a36Sopenharmony_ci range->start = start; 278762306a36Sopenharmony_ci range->len = len; 278862306a36Sopenharmony_ci list_add_tail(&range->list, head); 278962306a36Sopenharmony_ci return 0; 279062306a36Sopenharmony_ci} 279162306a36Sopenharmony_ci 279262306a36Sopenharmony_cistatic int btrfs_fallocate_update_isize(struct inode *inode, 279362306a36Sopenharmony_ci const u64 end, 279462306a36Sopenharmony_ci const int mode) 279562306a36Sopenharmony_ci{ 279662306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 279762306a36Sopenharmony_ci struct btrfs_root *root = BTRFS_I(inode)->root; 279862306a36Sopenharmony_ci int ret; 279962306a36Sopenharmony_ci int ret2; 280062306a36Sopenharmony_ci 280162306a36Sopenharmony_ci if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) 280262306a36Sopenharmony_ci return 0; 280362306a36Sopenharmony_ci 280462306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 1); 280562306a36Sopenharmony_ci if (IS_ERR(trans)) 280662306a36Sopenharmony_ci return PTR_ERR(trans); 280762306a36Sopenharmony_ci 280862306a36Sopenharmony_ci inode_set_ctime_current(inode); 280962306a36Sopenharmony_ci i_size_write(inode, end); 281062306a36Sopenharmony_ci btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 281162306a36Sopenharmony_ci ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 281262306a36Sopenharmony_ci ret2 = btrfs_end_transaction(trans); 281362306a36Sopenharmony_ci 
281462306a36Sopenharmony_ci return ret ? ret : ret2; 281562306a36Sopenharmony_ci} 281662306a36Sopenharmony_ci 281762306a36Sopenharmony_cienum { 281862306a36Sopenharmony_ci RANGE_BOUNDARY_WRITTEN_EXTENT, 281962306a36Sopenharmony_ci RANGE_BOUNDARY_PREALLOC_EXTENT, 282062306a36Sopenharmony_ci RANGE_BOUNDARY_HOLE, 282162306a36Sopenharmony_ci}; 282262306a36Sopenharmony_ci 282362306a36Sopenharmony_cistatic int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, 282462306a36Sopenharmony_ci u64 offset) 282562306a36Sopenharmony_ci{ 282662306a36Sopenharmony_ci const u64 sectorsize = inode->root->fs_info->sectorsize; 282762306a36Sopenharmony_ci struct extent_map *em; 282862306a36Sopenharmony_ci int ret; 282962306a36Sopenharmony_ci 283062306a36Sopenharmony_ci offset = round_down(offset, sectorsize); 283162306a36Sopenharmony_ci em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); 283262306a36Sopenharmony_ci if (IS_ERR(em)) 283362306a36Sopenharmony_ci return PTR_ERR(em); 283462306a36Sopenharmony_ci 283562306a36Sopenharmony_ci if (em->block_start == EXTENT_MAP_HOLE) 283662306a36Sopenharmony_ci ret = RANGE_BOUNDARY_HOLE; 283762306a36Sopenharmony_ci else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) 283862306a36Sopenharmony_ci ret = RANGE_BOUNDARY_PREALLOC_EXTENT; 283962306a36Sopenharmony_ci else 284062306a36Sopenharmony_ci ret = RANGE_BOUNDARY_WRITTEN_EXTENT; 284162306a36Sopenharmony_ci 284262306a36Sopenharmony_ci free_extent_map(em); 284362306a36Sopenharmony_ci return ret; 284462306a36Sopenharmony_ci} 284562306a36Sopenharmony_ci 284662306a36Sopenharmony_cistatic int btrfs_zero_range(struct inode *inode, 284762306a36Sopenharmony_ci loff_t offset, 284862306a36Sopenharmony_ci loff_t len, 284962306a36Sopenharmony_ci const int mode) 285062306a36Sopenharmony_ci{ 285162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 285262306a36Sopenharmony_ci struct extent_map *em; 285362306a36Sopenharmony_ci struct extent_changeset 
*data_reserved = NULL; 285462306a36Sopenharmony_ci int ret; 285562306a36Sopenharmony_ci u64 alloc_hint = 0; 285662306a36Sopenharmony_ci const u64 sectorsize = fs_info->sectorsize; 285762306a36Sopenharmony_ci u64 alloc_start = round_down(offset, sectorsize); 285862306a36Sopenharmony_ci u64 alloc_end = round_up(offset + len, sectorsize); 285962306a36Sopenharmony_ci u64 bytes_to_reserve = 0; 286062306a36Sopenharmony_ci bool space_reserved = false; 286162306a36Sopenharmony_ci 286262306a36Sopenharmony_ci em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, 286362306a36Sopenharmony_ci alloc_end - alloc_start); 286462306a36Sopenharmony_ci if (IS_ERR(em)) { 286562306a36Sopenharmony_ci ret = PTR_ERR(em); 286662306a36Sopenharmony_ci goto out; 286762306a36Sopenharmony_ci } 286862306a36Sopenharmony_ci 286962306a36Sopenharmony_ci /* 287062306a36Sopenharmony_ci * Avoid hole punching and extent allocation for some cases. More cases 287162306a36Sopenharmony_ci * could be considered, but these are unlikely common and we keep things 287262306a36Sopenharmony_ci * as simple as possible for now. Also, intentionally, if the target 287362306a36Sopenharmony_ci * range contains one or more prealloc extents together with regular 287462306a36Sopenharmony_ci * extents and holes, we drop all the existing extents and allocate a 287562306a36Sopenharmony_ci * new prealloc extent, so that we get a larger contiguous disk extent. 287662306a36Sopenharmony_ci */ 287762306a36Sopenharmony_ci if (em->start <= alloc_start && 287862306a36Sopenharmony_ci test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 287962306a36Sopenharmony_ci const u64 em_end = em->start + em->len; 288062306a36Sopenharmony_ci 288162306a36Sopenharmony_ci if (em_end >= offset + len) { 288262306a36Sopenharmony_ci /* 288362306a36Sopenharmony_ci * The whole range is already a prealloc extent, 288462306a36Sopenharmony_ci * do nothing except updating the inode's i_size if 288562306a36Sopenharmony_ci * needed. 
288662306a36Sopenharmony_ci */ 288762306a36Sopenharmony_ci free_extent_map(em); 288862306a36Sopenharmony_ci ret = btrfs_fallocate_update_isize(inode, offset + len, 288962306a36Sopenharmony_ci mode); 289062306a36Sopenharmony_ci goto out; 289162306a36Sopenharmony_ci } 289262306a36Sopenharmony_ci /* 289362306a36Sopenharmony_ci * Part of the range is already a prealloc extent, so operate 289462306a36Sopenharmony_ci * only on the remaining part of the range. 289562306a36Sopenharmony_ci */ 289662306a36Sopenharmony_ci alloc_start = em_end; 289762306a36Sopenharmony_ci ASSERT(IS_ALIGNED(alloc_start, sectorsize)); 289862306a36Sopenharmony_ci len = offset + len - alloc_start; 289962306a36Sopenharmony_ci offset = alloc_start; 290062306a36Sopenharmony_ci alloc_hint = em->block_start + em->len; 290162306a36Sopenharmony_ci } 290262306a36Sopenharmony_ci free_extent_map(em); 290362306a36Sopenharmony_ci 290462306a36Sopenharmony_ci if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == 290562306a36Sopenharmony_ci BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { 290662306a36Sopenharmony_ci em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, 290762306a36Sopenharmony_ci sectorsize); 290862306a36Sopenharmony_ci if (IS_ERR(em)) { 290962306a36Sopenharmony_ci ret = PTR_ERR(em); 291062306a36Sopenharmony_ci goto out; 291162306a36Sopenharmony_ci } 291262306a36Sopenharmony_ci 291362306a36Sopenharmony_ci if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 291462306a36Sopenharmony_ci free_extent_map(em); 291562306a36Sopenharmony_ci ret = btrfs_fallocate_update_isize(inode, offset + len, 291662306a36Sopenharmony_ci mode); 291762306a36Sopenharmony_ci goto out; 291862306a36Sopenharmony_ci } 291962306a36Sopenharmony_ci if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) { 292062306a36Sopenharmony_ci free_extent_map(em); 292162306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, 292262306a36Sopenharmony_ci 0); 292362306a36Sopenharmony_ci if (!ret) 
292462306a36Sopenharmony_ci ret = btrfs_fallocate_update_isize(inode, 292562306a36Sopenharmony_ci offset + len, 292662306a36Sopenharmony_ci mode); 292762306a36Sopenharmony_ci return ret; 292862306a36Sopenharmony_ci } 292962306a36Sopenharmony_ci free_extent_map(em); 293062306a36Sopenharmony_ci alloc_start = round_down(offset, sectorsize); 293162306a36Sopenharmony_ci alloc_end = alloc_start + sectorsize; 293262306a36Sopenharmony_ci goto reserve_space; 293362306a36Sopenharmony_ci } 293462306a36Sopenharmony_ci 293562306a36Sopenharmony_ci alloc_start = round_up(offset, sectorsize); 293662306a36Sopenharmony_ci alloc_end = round_down(offset + len, sectorsize); 293762306a36Sopenharmony_ci 293862306a36Sopenharmony_ci /* 293962306a36Sopenharmony_ci * For unaligned ranges, check the pages at the boundaries, they might 294062306a36Sopenharmony_ci * map to an extent, in which case we need to partially zero them, or 294162306a36Sopenharmony_ci * they might map to a hole, in which case we need our allocation range 294262306a36Sopenharmony_ci * to cover them. 
294362306a36Sopenharmony_ci */ 294462306a36Sopenharmony_ci if (!IS_ALIGNED(offset, sectorsize)) { 294562306a36Sopenharmony_ci ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 294662306a36Sopenharmony_ci offset); 294762306a36Sopenharmony_ci if (ret < 0) 294862306a36Sopenharmony_ci goto out; 294962306a36Sopenharmony_ci if (ret == RANGE_BOUNDARY_HOLE) { 295062306a36Sopenharmony_ci alloc_start = round_down(offset, sectorsize); 295162306a36Sopenharmony_ci ret = 0; 295262306a36Sopenharmony_ci } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 295362306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); 295462306a36Sopenharmony_ci if (ret) 295562306a36Sopenharmony_ci goto out; 295662306a36Sopenharmony_ci } else { 295762306a36Sopenharmony_ci ret = 0; 295862306a36Sopenharmony_ci } 295962306a36Sopenharmony_ci } 296062306a36Sopenharmony_ci 296162306a36Sopenharmony_ci if (!IS_ALIGNED(offset + len, sectorsize)) { 296262306a36Sopenharmony_ci ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), 296362306a36Sopenharmony_ci offset + len); 296462306a36Sopenharmony_ci if (ret < 0) 296562306a36Sopenharmony_ci goto out; 296662306a36Sopenharmony_ci if (ret == RANGE_BOUNDARY_HOLE) { 296762306a36Sopenharmony_ci alloc_end = round_up(offset + len, sectorsize); 296862306a36Sopenharmony_ci ret = 0; 296962306a36Sopenharmony_ci } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { 297062306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, 297162306a36Sopenharmony_ci 0, 1); 297262306a36Sopenharmony_ci if (ret) 297362306a36Sopenharmony_ci goto out; 297462306a36Sopenharmony_ci } else { 297562306a36Sopenharmony_ci ret = 0; 297662306a36Sopenharmony_ci } 297762306a36Sopenharmony_ci } 297862306a36Sopenharmony_ci 297962306a36Sopenharmony_cireserve_space: 298062306a36Sopenharmony_ci if (alloc_start < alloc_end) { 298162306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 298262306a36Sopenharmony_ci const u64 lockstart = 
alloc_start; 298362306a36Sopenharmony_ci const u64 lockend = alloc_end - 1; 298462306a36Sopenharmony_ci 298562306a36Sopenharmony_ci bytes_to_reserve = alloc_end - alloc_start; 298662306a36Sopenharmony_ci ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 298762306a36Sopenharmony_ci bytes_to_reserve); 298862306a36Sopenharmony_ci if (ret < 0) 298962306a36Sopenharmony_ci goto out; 299062306a36Sopenharmony_ci space_reserved = true; 299162306a36Sopenharmony_ci btrfs_punch_hole_lock_range(inode, lockstart, lockend, 299262306a36Sopenharmony_ci &cached_state); 299362306a36Sopenharmony_ci ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, 299462306a36Sopenharmony_ci alloc_start, bytes_to_reserve); 299562306a36Sopenharmony_ci if (ret) { 299662306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, 299762306a36Sopenharmony_ci lockend, &cached_state); 299862306a36Sopenharmony_ci goto out; 299962306a36Sopenharmony_ci } 300062306a36Sopenharmony_ci ret = btrfs_prealloc_file_range(inode, mode, alloc_start, 300162306a36Sopenharmony_ci alloc_end - alloc_start, 300262306a36Sopenharmony_ci i_blocksize(inode), 300362306a36Sopenharmony_ci offset + len, &alloc_hint); 300462306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, 300562306a36Sopenharmony_ci &cached_state); 300662306a36Sopenharmony_ci /* btrfs_prealloc_file_range releases reserved space on error */ 300762306a36Sopenharmony_ci if (ret) { 300862306a36Sopenharmony_ci space_reserved = false; 300962306a36Sopenharmony_ci goto out; 301062306a36Sopenharmony_ci } 301162306a36Sopenharmony_ci } 301262306a36Sopenharmony_ci ret = btrfs_fallocate_update_isize(inode, offset + len, mode); 301362306a36Sopenharmony_ci out: 301462306a36Sopenharmony_ci if (ret && space_reserved) 301562306a36Sopenharmony_ci btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, 301662306a36Sopenharmony_ci alloc_start, bytes_to_reserve); 301762306a36Sopenharmony_ci 
extent_changeset_free(data_reserved); 301862306a36Sopenharmony_ci 301962306a36Sopenharmony_ci return ret; 302062306a36Sopenharmony_ci} 302162306a36Sopenharmony_ci 302262306a36Sopenharmony_cistatic long btrfs_fallocate(struct file *file, int mode, 302362306a36Sopenharmony_ci loff_t offset, loff_t len) 302462306a36Sopenharmony_ci{ 302562306a36Sopenharmony_ci struct inode *inode = file_inode(file); 302662306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 302762306a36Sopenharmony_ci struct extent_changeset *data_reserved = NULL; 302862306a36Sopenharmony_ci struct falloc_range *range; 302962306a36Sopenharmony_ci struct falloc_range *tmp; 303062306a36Sopenharmony_ci LIST_HEAD(reserve_list); 303162306a36Sopenharmony_ci u64 cur_offset; 303262306a36Sopenharmony_ci u64 last_byte; 303362306a36Sopenharmony_ci u64 alloc_start; 303462306a36Sopenharmony_ci u64 alloc_end; 303562306a36Sopenharmony_ci u64 alloc_hint = 0; 303662306a36Sopenharmony_ci u64 locked_end; 303762306a36Sopenharmony_ci u64 actual_end = 0; 303862306a36Sopenharmony_ci u64 data_space_needed = 0; 303962306a36Sopenharmony_ci u64 data_space_reserved = 0; 304062306a36Sopenharmony_ci u64 qgroup_reserved = 0; 304162306a36Sopenharmony_ci struct extent_map *em; 304262306a36Sopenharmony_ci int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; 304362306a36Sopenharmony_ci int ret; 304462306a36Sopenharmony_ci 304562306a36Sopenharmony_ci /* Do not allow fallocate in ZONED mode */ 304662306a36Sopenharmony_ci if (btrfs_is_zoned(btrfs_sb(inode->i_sb))) 304762306a36Sopenharmony_ci return -EOPNOTSUPP; 304862306a36Sopenharmony_ci 304962306a36Sopenharmony_ci alloc_start = round_down(offset, blocksize); 305062306a36Sopenharmony_ci alloc_end = round_up(offset + len, blocksize); 305162306a36Sopenharmony_ci cur_offset = alloc_start; 305262306a36Sopenharmony_ci 305362306a36Sopenharmony_ci /* Make sure we aren't being give some crap mode */ 305462306a36Sopenharmony_ci if (mode & ~(FALLOC_FL_KEEP_SIZE | 
FALLOC_FL_PUNCH_HOLE | 305562306a36Sopenharmony_ci FALLOC_FL_ZERO_RANGE)) 305662306a36Sopenharmony_ci return -EOPNOTSUPP; 305762306a36Sopenharmony_ci 305862306a36Sopenharmony_ci if (mode & FALLOC_FL_PUNCH_HOLE) 305962306a36Sopenharmony_ci return btrfs_punch_hole(file, offset, len); 306062306a36Sopenharmony_ci 306162306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 306262306a36Sopenharmony_ci 306362306a36Sopenharmony_ci if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { 306462306a36Sopenharmony_ci ret = inode_newsize_ok(inode, offset + len); 306562306a36Sopenharmony_ci if (ret) 306662306a36Sopenharmony_ci goto out; 306762306a36Sopenharmony_ci } 306862306a36Sopenharmony_ci 306962306a36Sopenharmony_ci ret = file_modified(file); 307062306a36Sopenharmony_ci if (ret) 307162306a36Sopenharmony_ci goto out; 307262306a36Sopenharmony_ci 307362306a36Sopenharmony_ci /* 307462306a36Sopenharmony_ci * TODO: Move these two operations after we have checked 307562306a36Sopenharmony_ci * accurate reserved space, or fallocate can still fail but 307662306a36Sopenharmony_ci * with page truncated or size expanded. 307762306a36Sopenharmony_ci * 307862306a36Sopenharmony_ci * But that's a minor problem and won't do much harm BTW. 307962306a36Sopenharmony_ci */ 308062306a36Sopenharmony_ci if (alloc_start > inode->i_size) { 308162306a36Sopenharmony_ci ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), 308262306a36Sopenharmony_ci alloc_start); 308362306a36Sopenharmony_ci if (ret) 308462306a36Sopenharmony_ci goto out; 308562306a36Sopenharmony_ci } else if (offset + len > inode->i_size) { 308662306a36Sopenharmony_ci /* 308762306a36Sopenharmony_ci * If we are fallocating from the end of the file onward we 308862306a36Sopenharmony_ci * need to zero out the end of the block if i_size lands in the 308962306a36Sopenharmony_ci * middle of a block. 
309062306a36Sopenharmony_ci */ 309162306a36Sopenharmony_ci ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); 309262306a36Sopenharmony_ci if (ret) 309362306a36Sopenharmony_ci goto out; 309462306a36Sopenharmony_ci } 309562306a36Sopenharmony_ci 309662306a36Sopenharmony_ci /* 309762306a36Sopenharmony_ci * We have locked the inode at the VFS level (in exclusive mode) and we 309862306a36Sopenharmony_ci * have locked the i_mmap_lock lock (in exclusive mode). Now before 309962306a36Sopenharmony_ci * locking the file range, flush all dealloc in the range and wait for 310062306a36Sopenharmony_ci * all ordered extents in the range to complete. After this we can lock 310162306a36Sopenharmony_ci * the file range and, due to the previous locking we did, we know there 310262306a36Sopenharmony_ci * can't be more delalloc or ordered extents in the range. 310362306a36Sopenharmony_ci */ 310462306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode, alloc_start, 310562306a36Sopenharmony_ci alloc_end - alloc_start); 310662306a36Sopenharmony_ci if (ret) 310762306a36Sopenharmony_ci goto out; 310862306a36Sopenharmony_ci 310962306a36Sopenharmony_ci if (mode & FALLOC_FL_ZERO_RANGE) { 311062306a36Sopenharmony_ci ret = btrfs_zero_range(inode, offset, len, mode); 311162306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 311262306a36Sopenharmony_ci return ret; 311362306a36Sopenharmony_ci } 311462306a36Sopenharmony_ci 311562306a36Sopenharmony_ci locked_end = alloc_end - 1; 311662306a36Sopenharmony_ci lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 311762306a36Sopenharmony_ci &cached_state); 311862306a36Sopenharmony_ci 311962306a36Sopenharmony_ci btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); 312062306a36Sopenharmony_ci 312162306a36Sopenharmony_ci /* First, check if we exceed the qgroup limit */ 312262306a36Sopenharmony_ci while (cur_offset < alloc_end) { 312362306a36Sopenharmony_ci em = 
btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, 312462306a36Sopenharmony_ci alloc_end - cur_offset); 312562306a36Sopenharmony_ci if (IS_ERR(em)) { 312662306a36Sopenharmony_ci ret = PTR_ERR(em); 312762306a36Sopenharmony_ci break; 312862306a36Sopenharmony_ci } 312962306a36Sopenharmony_ci last_byte = min(extent_map_end(em), alloc_end); 313062306a36Sopenharmony_ci actual_end = min_t(u64, extent_map_end(em), offset + len); 313162306a36Sopenharmony_ci last_byte = ALIGN(last_byte, blocksize); 313262306a36Sopenharmony_ci if (em->block_start == EXTENT_MAP_HOLE || 313362306a36Sopenharmony_ci (cur_offset >= inode->i_size && 313462306a36Sopenharmony_ci !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 313562306a36Sopenharmony_ci const u64 range_len = last_byte - cur_offset; 313662306a36Sopenharmony_ci 313762306a36Sopenharmony_ci ret = add_falloc_range(&reserve_list, cur_offset, range_len); 313862306a36Sopenharmony_ci if (ret < 0) { 313962306a36Sopenharmony_ci free_extent_map(em); 314062306a36Sopenharmony_ci break; 314162306a36Sopenharmony_ci } 314262306a36Sopenharmony_ci ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), 314362306a36Sopenharmony_ci &data_reserved, cur_offset, range_len); 314462306a36Sopenharmony_ci if (ret < 0) { 314562306a36Sopenharmony_ci free_extent_map(em); 314662306a36Sopenharmony_ci break; 314762306a36Sopenharmony_ci } 314862306a36Sopenharmony_ci qgroup_reserved += range_len; 314962306a36Sopenharmony_ci data_space_needed += range_len; 315062306a36Sopenharmony_ci } 315162306a36Sopenharmony_ci free_extent_map(em); 315262306a36Sopenharmony_ci cur_offset = last_byte; 315362306a36Sopenharmony_ci } 315462306a36Sopenharmony_ci 315562306a36Sopenharmony_ci if (!ret && data_space_needed > 0) { 315662306a36Sopenharmony_ci /* 315762306a36Sopenharmony_ci * We are safe to reserve space here as we can't have delalloc 315862306a36Sopenharmony_ci * in the range, see above. 
315962306a36Sopenharmony_ci */ 316062306a36Sopenharmony_ci ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), 316162306a36Sopenharmony_ci data_space_needed); 316262306a36Sopenharmony_ci if (!ret) 316362306a36Sopenharmony_ci data_space_reserved = data_space_needed; 316462306a36Sopenharmony_ci } 316562306a36Sopenharmony_ci 316662306a36Sopenharmony_ci /* 316762306a36Sopenharmony_ci * If ret is still 0, means we're OK to fallocate. 316862306a36Sopenharmony_ci * Or just cleanup the list and exit. 316962306a36Sopenharmony_ci */ 317062306a36Sopenharmony_ci list_for_each_entry_safe(range, tmp, &reserve_list, list) { 317162306a36Sopenharmony_ci if (!ret) { 317262306a36Sopenharmony_ci ret = btrfs_prealloc_file_range(inode, mode, 317362306a36Sopenharmony_ci range->start, 317462306a36Sopenharmony_ci range->len, i_blocksize(inode), 317562306a36Sopenharmony_ci offset + len, &alloc_hint); 317662306a36Sopenharmony_ci /* 317762306a36Sopenharmony_ci * btrfs_prealloc_file_range() releases space even 317862306a36Sopenharmony_ci * if it returns an error. 
317962306a36Sopenharmony_ci */ 318062306a36Sopenharmony_ci data_space_reserved -= range->len; 318162306a36Sopenharmony_ci qgroup_reserved -= range->len; 318262306a36Sopenharmony_ci } else if (data_space_reserved > 0) { 318362306a36Sopenharmony_ci btrfs_free_reserved_data_space(BTRFS_I(inode), 318462306a36Sopenharmony_ci data_reserved, range->start, 318562306a36Sopenharmony_ci range->len); 318662306a36Sopenharmony_ci data_space_reserved -= range->len; 318762306a36Sopenharmony_ci qgroup_reserved -= range->len; 318862306a36Sopenharmony_ci } else if (qgroup_reserved > 0) { 318962306a36Sopenharmony_ci btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, 319062306a36Sopenharmony_ci range->start, range->len, NULL); 319162306a36Sopenharmony_ci qgroup_reserved -= range->len; 319262306a36Sopenharmony_ci } 319362306a36Sopenharmony_ci list_del(&range->list); 319462306a36Sopenharmony_ci kfree(range); 319562306a36Sopenharmony_ci } 319662306a36Sopenharmony_ci if (ret < 0) 319762306a36Sopenharmony_ci goto out_unlock; 319862306a36Sopenharmony_ci 319962306a36Sopenharmony_ci /* 320062306a36Sopenharmony_ci * We didn't need to allocate any more space, but we still extended the 320162306a36Sopenharmony_ci * size of the file so we need to update i_size and the inode item. 320262306a36Sopenharmony_ci */ 320362306a36Sopenharmony_ci ret = btrfs_fallocate_update_isize(inode, actual_end, mode); 320462306a36Sopenharmony_ciout_unlock: 320562306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 320662306a36Sopenharmony_ci &cached_state); 320762306a36Sopenharmony_ciout: 320862306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); 320962306a36Sopenharmony_ci extent_changeset_free(data_reserved); 321062306a36Sopenharmony_ci return ret; 321162306a36Sopenharmony_ci} 321262306a36Sopenharmony_ci 321362306a36Sopenharmony_ci/* 321462306a36Sopenharmony_ci * Helper for btrfs_find_delalloc_in_range(). 
Find a subrange in a given range 321562306a36Sopenharmony_ci * that has unflushed and/or flushing delalloc. There might be other adjacent 321662306a36Sopenharmony_ci * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps 321762306a36Sopenharmony_ci * looping while it gets adjacent subranges, and merging them together. 321862306a36Sopenharmony_ci */ 321962306a36Sopenharmony_cistatic bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, 322062306a36Sopenharmony_ci struct extent_state **cached_state, 322162306a36Sopenharmony_ci bool *search_io_tree, 322262306a36Sopenharmony_ci u64 *delalloc_start_ret, u64 *delalloc_end_ret) 322362306a36Sopenharmony_ci{ 322462306a36Sopenharmony_ci u64 len = end + 1 - start; 322562306a36Sopenharmony_ci u64 delalloc_len = 0; 322662306a36Sopenharmony_ci struct btrfs_ordered_extent *oe; 322762306a36Sopenharmony_ci u64 oe_start; 322862306a36Sopenharmony_ci u64 oe_end; 322962306a36Sopenharmony_ci 323062306a36Sopenharmony_ci /* 323162306a36Sopenharmony_ci * Search the io tree first for EXTENT_DELALLOC. If we find any, it 323262306a36Sopenharmony_ci * means we have delalloc (dirty pages) for which writeback has not 323362306a36Sopenharmony_ci * started yet. 
323462306a36Sopenharmony_ci */ 323562306a36Sopenharmony_ci if (*search_io_tree) { 323662306a36Sopenharmony_ci spin_lock(&inode->lock); 323762306a36Sopenharmony_ci if (inode->delalloc_bytes > 0) { 323862306a36Sopenharmony_ci spin_unlock(&inode->lock); 323962306a36Sopenharmony_ci *delalloc_start_ret = start; 324062306a36Sopenharmony_ci delalloc_len = count_range_bits(&inode->io_tree, 324162306a36Sopenharmony_ci delalloc_start_ret, end, 324262306a36Sopenharmony_ci len, EXTENT_DELALLOC, 1, 324362306a36Sopenharmony_ci cached_state); 324462306a36Sopenharmony_ci } else { 324562306a36Sopenharmony_ci spin_unlock(&inode->lock); 324662306a36Sopenharmony_ci } 324762306a36Sopenharmony_ci } 324862306a36Sopenharmony_ci 324962306a36Sopenharmony_ci if (delalloc_len > 0) { 325062306a36Sopenharmony_ci /* 325162306a36Sopenharmony_ci * If delalloc was found then *delalloc_start_ret has a sector size 325262306a36Sopenharmony_ci * aligned value (rounded down). 325362306a36Sopenharmony_ci */ 325462306a36Sopenharmony_ci *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; 325562306a36Sopenharmony_ci 325662306a36Sopenharmony_ci if (*delalloc_start_ret == start) { 325762306a36Sopenharmony_ci /* Delalloc for the whole range, nothing more to do. */ 325862306a36Sopenharmony_ci if (*delalloc_end_ret == end) 325962306a36Sopenharmony_ci return true; 326062306a36Sopenharmony_ci /* Else trim our search range for ordered extents. */ 326162306a36Sopenharmony_ci start = *delalloc_end_ret + 1; 326262306a36Sopenharmony_ci len = end + 1 - start; 326362306a36Sopenharmony_ci } 326462306a36Sopenharmony_ci } else { 326562306a36Sopenharmony_ci /* No delalloc, future calls don't need to search again. */ 326662306a36Sopenharmony_ci *search_io_tree = false; 326762306a36Sopenharmony_ci } 326862306a36Sopenharmony_ci 326962306a36Sopenharmony_ci /* 327062306a36Sopenharmony_ci * Now also check if there's any ordered extent in the range. 
327162306a36Sopenharmony_ci * We do this because: 327262306a36Sopenharmony_ci * 327362306a36Sopenharmony_ci * 1) When delalloc is flushed, the file range is locked, we clear the 327462306a36Sopenharmony_ci * EXTENT_DELALLOC bit from the io tree and create an extent map and 327562306a36Sopenharmony_ci * an ordered extent for the write. So we might just have been called 327662306a36Sopenharmony_ci * after delalloc is flushed and before the ordered extent completes 327762306a36Sopenharmony_ci * and inserts the new file extent item in the subvolume's btree; 327862306a36Sopenharmony_ci * 327962306a36Sopenharmony_ci * 2) We may have an ordered extent created by flushing delalloc for a 328062306a36Sopenharmony_ci * subrange that starts before the subrange we found marked with 328162306a36Sopenharmony_ci * EXTENT_DELALLOC in the io tree. 328262306a36Sopenharmony_ci * 328362306a36Sopenharmony_ci * We could also use the extent map tree to find such delalloc that is 328462306a36Sopenharmony_ci * being flushed, but using the ordered extents tree is more efficient 328562306a36Sopenharmony_ci * because it's usually much smaller as ordered extents are removed from 328662306a36Sopenharmony_ci * the tree once they complete. With the extent maps, we mau have them 328762306a36Sopenharmony_ci * in the extent map tree for a very long time, and they were either 328862306a36Sopenharmony_ci * created by previous writes or loaded by read operations. 328962306a36Sopenharmony_ci */ 329062306a36Sopenharmony_ci oe = btrfs_lookup_first_ordered_range(inode, start, len); 329162306a36Sopenharmony_ci if (!oe) 329262306a36Sopenharmony_ci return (delalloc_len > 0); 329362306a36Sopenharmony_ci 329462306a36Sopenharmony_ci /* The ordered extent may span beyond our search range. 
*/ 329562306a36Sopenharmony_ci oe_start = max(oe->file_offset, start); 329662306a36Sopenharmony_ci oe_end = min(oe->file_offset + oe->num_bytes - 1, end); 329762306a36Sopenharmony_ci 329862306a36Sopenharmony_ci btrfs_put_ordered_extent(oe); 329962306a36Sopenharmony_ci 330062306a36Sopenharmony_ci /* Don't have unflushed delalloc, return the ordered extent range. */ 330162306a36Sopenharmony_ci if (delalloc_len == 0) { 330262306a36Sopenharmony_ci *delalloc_start_ret = oe_start; 330362306a36Sopenharmony_ci *delalloc_end_ret = oe_end; 330462306a36Sopenharmony_ci return true; 330562306a36Sopenharmony_ci } 330662306a36Sopenharmony_ci 330762306a36Sopenharmony_ci /* 330862306a36Sopenharmony_ci * We have both unflushed delalloc (io_tree) and an ordered extent. 330962306a36Sopenharmony_ci * If the ranges are adjacent returned a combined range, otherwise 331062306a36Sopenharmony_ci * return the leftmost range. 331162306a36Sopenharmony_ci */ 331262306a36Sopenharmony_ci if (oe_start < *delalloc_start_ret) { 331362306a36Sopenharmony_ci if (oe_end < *delalloc_start_ret) 331462306a36Sopenharmony_ci *delalloc_end_ret = oe_end; 331562306a36Sopenharmony_ci *delalloc_start_ret = oe_start; 331662306a36Sopenharmony_ci } else if (*delalloc_end_ret + 1 == oe_start) { 331762306a36Sopenharmony_ci *delalloc_end_ret = oe_end; 331862306a36Sopenharmony_ci } 331962306a36Sopenharmony_ci 332062306a36Sopenharmony_ci return true; 332162306a36Sopenharmony_ci} 332262306a36Sopenharmony_ci 332362306a36Sopenharmony_ci/* 332462306a36Sopenharmony_ci * Check if there's delalloc in a given range. 332562306a36Sopenharmony_ci * 332662306a36Sopenharmony_ci * @inode: The inode. 332762306a36Sopenharmony_ci * @start: The start offset of the range. It does not need to be 332862306a36Sopenharmony_ci * sector size aligned. 332962306a36Sopenharmony_ci * @end: The end offset (inclusive value) of the search range. 333062306a36Sopenharmony_ci * It does not need to be sector size aligned. 
333162306a36Sopenharmony_ci * @cached_state: Extent state record used for speeding up delalloc 333262306a36Sopenharmony_ci * searches in the inode's io_tree. Can be NULL. 333362306a36Sopenharmony_ci * @delalloc_start_ret: Output argument, set to the start offset of the 333462306a36Sopenharmony_ci * subrange found with delalloc (may not be sector size 333562306a36Sopenharmony_ci * aligned). 333662306a36Sopenharmony_ci * @delalloc_end_ret: Output argument, set to he end offset (inclusive value) 333762306a36Sopenharmony_ci * of the subrange found with delalloc. 333862306a36Sopenharmony_ci * 333962306a36Sopenharmony_ci * Returns true if a subrange with delalloc is found within the given range, and 334062306a36Sopenharmony_ci * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and 334162306a36Sopenharmony_ci * end offsets of the subrange. 334262306a36Sopenharmony_ci */ 334362306a36Sopenharmony_cibool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, 334462306a36Sopenharmony_ci struct extent_state **cached_state, 334562306a36Sopenharmony_ci u64 *delalloc_start_ret, u64 *delalloc_end_ret) 334662306a36Sopenharmony_ci{ 334762306a36Sopenharmony_ci u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); 334862306a36Sopenharmony_ci u64 prev_delalloc_end = 0; 334962306a36Sopenharmony_ci bool search_io_tree = true; 335062306a36Sopenharmony_ci bool ret = false; 335162306a36Sopenharmony_ci 335262306a36Sopenharmony_ci while (cur_offset <= end) { 335362306a36Sopenharmony_ci u64 delalloc_start; 335462306a36Sopenharmony_ci u64 delalloc_end; 335562306a36Sopenharmony_ci bool delalloc; 335662306a36Sopenharmony_ci 335762306a36Sopenharmony_ci delalloc = find_delalloc_subrange(inode, cur_offset, end, 335862306a36Sopenharmony_ci cached_state, &search_io_tree, 335962306a36Sopenharmony_ci &delalloc_start, 336062306a36Sopenharmony_ci &delalloc_end); 336162306a36Sopenharmony_ci if (!delalloc) 336262306a36Sopenharmony_ci break; 
336362306a36Sopenharmony_ci 336462306a36Sopenharmony_ci if (prev_delalloc_end == 0) { 336562306a36Sopenharmony_ci /* First subrange found. */ 336662306a36Sopenharmony_ci *delalloc_start_ret = max(delalloc_start, start); 336762306a36Sopenharmony_ci *delalloc_end_ret = delalloc_end; 336862306a36Sopenharmony_ci ret = true; 336962306a36Sopenharmony_ci } else if (delalloc_start == prev_delalloc_end + 1) { 337062306a36Sopenharmony_ci /* Subrange adjacent to the previous one, merge them. */ 337162306a36Sopenharmony_ci *delalloc_end_ret = delalloc_end; 337262306a36Sopenharmony_ci } else { 337362306a36Sopenharmony_ci /* Subrange not adjacent to the previous one, exit. */ 337462306a36Sopenharmony_ci break; 337562306a36Sopenharmony_ci } 337662306a36Sopenharmony_ci 337762306a36Sopenharmony_ci prev_delalloc_end = delalloc_end; 337862306a36Sopenharmony_ci cur_offset = delalloc_end + 1; 337962306a36Sopenharmony_ci cond_resched(); 338062306a36Sopenharmony_ci } 338162306a36Sopenharmony_ci 338262306a36Sopenharmony_ci return ret; 338362306a36Sopenharmony_ci} 338462306a36Sopenharmony_ci 338562306a36Sopenharmony_ci/* 338662306a36Sopenharmony_ci * Check if there's a hole or delalloc range in a range representing a hole (or 338762306a36Sopenharmony_ci * prealloc extent) found in the inode's subvolume btree. 338862306a36Sopenharmony_ci * 338962306a36Sopenharmony_ci * @inode: The inode. 339062306a36Sopenharmony_ci * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). 339162306a36Sopenharmony_ci * @start: Start offset of the hole region. It does not need to be sector 339262306a36Sopenharmony_ci * size aligned. 339362306a36Sopenharmony_ci * @end: End offset (inclusive value) of the hole region. It does not 339462306a36Sopenharmony_ci * need to be sector size aligned. 
339562306a36Sopenharmony_ci * @start_ret: Return parameter, used to set the start of the subrange in the 339662306a36Sopenharmony_ci * hole that matches the search criteria (seek mode), if such 339762306a36Sopenharmony_ci * subrange is found (return value of the function is true). 339862306a36Sopenharmony_ci * The value returned here may not be sector size aligned. 339962306a36Sopenharmony_ci * 340062306a36Sopenharmony_ci * Returns true if a subrange matching the given seek mode is found, and if one 340162306a36Sopenharmony_ci * is found, it updates @start_ret with the start of the subrange. 340262306a36Sopenharmony_ci */ 340362306a36Sopenharmony_cistatic bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, 340462306a36Sopenharmony_ci struct extent_state **cached_state, 340562306a36Sopenharmony_ci u64 start, u64 end, u64 *start_ret) 340662306a36Sopenharmony_ci{ 340762306a36Sopenharmony_ci u64 delalloc_start; 340862306a36Sopenharmony_ci u64 delalloc_end; 340962306a36Sopenharmony_ci bool delalloc; 341062306a36Sopenharmony_ci 341162306a36Sopenharmony_ci delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, 341262306a36Sopenharmony_ci &delalloc_start, &delalloc_end); 341362306a36Sopenharmony_ci if (delalloc && whence == SEEK_DATA) { 341462306a36Sopenharmony_ci *start_ret = delalloc_start; 341562306a36Sopenharmony_ci return true; 341662306a36Sopenharmony_ci } 341762306a36Sopenharmony_ci 341862306a36Sopenharmony_ci if (delalloc && whence == SEEK_HOLE) { 341962306a36Sopenharmony_ci /* 342062306a36Sopenharmony_ci * We found delalloc but it starts after out start offset. So we 342162306a36Sopenharmony_ci * have a hole between our start offset and the delalloc start. 
342262306a36Sopenharmony_ci */ 342362306a36Sopenharmony_ci if (start < delalloc_start) { 342462306a36Sopenharmony_ci *start_ret = start; 342562306a36Sopenharmony_ci return true; 342662306a36Sopenharmony_ci } 342762306a36Sopenharmony_ci /* 342862306a36Sopenharmony_ci * Delalloc range starts at our start offset. 342962306a36Sopenharmony_ci * If the delalloc range's length is smaller than our range, 343062306a36Sopenharmony_ci * then it means we have a hole that starts where the delalloc 343162306a36Sopenharmony_ci * subrange ends. 343262306a36Sopenharmony_ci */ 343362306a36Sopenharmony_ci if (delalloc_end < end) { 343462306a36Sopenharmony_ci *start_ret = delalloc_end + 1; 343562306a36Sopenharmony_ci return true; 343662306a36Sopenharmony_ci } 343762306a36Sopenharmony_ci 343862306a36Sopenharmony_ci /* There's delalloc for the whole range. */ 343962306a36Sopenharmony_ci return false; 344062306a36Sopenharmony_ci } 344162306a36Sopenharmony_ci 344262306a36Sopenharmony_ci if (!delalloc && whence == SEEK_HOLE) { 344362306a36Sopenharmony_ci *start_ret = start; 344462306a36Sopenharmony_ci return true; 344562306a36Sopenharmony_ci } 344662306a36Sopenharmony_ci 344762306a36Sopenharmony_ci /* 344862306a36Sopenharmony_ci * No delalloc in the range and we are seeking for data. The caller has 344962306a36Sopenharmony_ci * to iterate to the next extent item in the subvolume btree. 
345062306a36Sopenharmony_ci */ 345162306a36Sopenharmony_ci return false; 345262306a36Sopenharmony_ci} 345362306a36Sopenharmony_ci 345462306a36Sopenharmony_cistatic loff_t find_desired_extent(struct file *file, loff_t offset, int whence) 345562306a36Sopenharmony_ci{ 345662306a36Sopenharmony_ci struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); 345762306a36Sopenharmony_ci struct btrfs_file_private *private = file->private_data; 345862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 345962306a36Sopenharmony_ci struct extent_state *cached_state = NULL; 346062306a36Sopenharmony_ci struct extent_state **delalloc_cached_state; 346162306a36Sopenharmony_ci const loff_t i_size = i_size_read(&inode->vfs_inode); 346262306a36Sopenharmony_ci const u64 ino = btrfs_ino(inode); 346362306a36Sopenharmony_ci struct btrfs_root *root = inode->root; 346462306a36Sopenharmony_ci struct btrfs_path *path; 346562306a36Sopenharmony_ci struct btrfs_key key; 346662306a36Sopenharmony_ci u64 last_extent_end; 346762306a36Sopenharmony_ci u64 lockstart; 346862306a36Sopenharmony_ci u64 lockend; 346962306a36Sopenharmony_ci u64 start; 347062306a36Sopenharmony_ci int ret; 347162306a36Sopenharmony_ci bool found = false; 347262306a36Sopenharmony_ci 347362306a36Sopenharmony_ci if (i_size == 0 || offset >= i_size) 347462306a36Sopenharmony_ci return -ENXIO; 347562306a36Sopenharmony_ci 347662306a36Sopenharmony_ci /* 347762306a36Sopenharmony_ci * Quick path. If the inode has no prealloc extents and its number of 347862306a36Sopenharmony_ci * bytes used matches its i_size, then it can not have holes. 
347962306a36Sopenharmony_ci */ 348062306a36Sopenharmony_ci if (whence == SEEK_HOLE && 348162306a36Sopenharmony_ci !(inode->flags & BTRFS_INODE_PREALLOC) && 348262306a36Sopenharmony_ci inode_get_bytes(&inode->vfs_inode) == i_size) 348362306a36Sopenharmony_ci return i_size; 348462306a36Sopenharmony_ci 348562306a36Sopenharmony_ci if (!private) { 348662306a36Sopenharmony_ci private = kzalloc(sizeof(*private), GFP_KERNEL); 348762306a36Sopenharmony_ci /* 348862306a36Sopenharmony_ci * No worries if memory allocation failed. 348962306a36Sopenharmony_ci * The private structure is used only for speeding up multiple 349062306a36Sopenharmony_ci * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, 349162306a36Sopenharmony_ci * so everything will still be correct. 349262306a36Sopenharmony_ci */ 349362306a36Sopenharmony_ci file->private_data = private; 349462306a36Sopenharmony_ci } 349562306a36Sopenharmony_ci 349662306a36Sopenharmony_ci if (private) 349762306a36Sopenharmony_ci delalloc_cached_state = &private->llseek_cached_state; 349862306a36Sopenharmony_ci else 349962306a36Sopenharmony_ci delalloc_cached_state = NULL; 350062306a36Sopenharmony_ci 350162306a36Sopenharmony_ci /* 350262306a36Sopenharmony_ci * offset can be negative, in this case we start finding DATA/HOLE from 350362306a36Sopenharmony_ci * the very start of the file. 
350462306a36Sopenharmony_ci */ 350562306a36Sopenharmony_ci start = max_t(loff_t, 0, offset); 350662306a36Sopenharmony_ci 350762306a36Sopenharmony_ci lockstart = round_down(start, fs_info->sectorsize); 350862306a36Sopenharmony_ci lockend = round_up(i_size, fs_info->sectorsize); 350962306a36Sopenharmony_ci if (lockend <= lockstart) 351062306a36Sopenharmony_ci lockend = lockstart + fs_info->sectorsize; 351162306a36Sopenharmony_ci lockend--; 351262306a36Sopenharmony_ci 351362306a36Sopenharmony_ci path = btrfs_alloc_path(); 351462306a36Sopenharmony_ci if (!path) 351562306a36Sopenharmony_ci return -ENOMEM; 351662306a36Sopenharmony_ci path->reada = READA_FORWARD; 351762306a36Sopenharmony_ci 351862306a36Sopenharmony_ci key.objectid = ino; 351962306a36Sopenharmony_ci key.type = BTRFS_EXTENT_DATA_KEY; 352062306a36Sopenharmony_ci key.offset = start; 352162306a36Sopenharmony_ci 352262306a36Sopenharmony_ci last_extent_end = lockstart; 352362306a36Sopenharmony_ci 352462306a36Sopenharmony_ci lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); 352562306a36Sopenharmony_ci 352662306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 352762306a36Sopenharmony_ci if (ret < 0) { 352862306a36Sopenharmony_ci goto out; 352962306a36Sopenharmony_ci } else if (ret > 0 && path->slots[0] > 0) { 353062306a36Sopenharmony_ci btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); 353162306a36Sopenharmony_ci if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) 353262306a36Sopenharmony_ci path->slots[0]--; 353362306a36Sopenharmony_ci } 353462306a36Sopenharmony_ci 353562306a36Sopenharmony_ci while (start < i_size) { 353662306a36Sopenharmony_ci struct extent_buffer *leaf = path->nodes[0]; 353762306a36Sopenharmony_ci struct btrfs_file_extent_item *extent; 353862306a36Sopenharmony_ci u64 extent_end; 353962306a36Sopenharmony_ci u8 type; 354062306a36Sopenharmony_ci 354162306a36Sopenharmony_ci if (path->slots[0] >= btrfs_header_nritems(leaf)) { 
354262306a36Sopenharmony_ci ret = btrfs_next_leaf(root, path); 354362306a36Sopenharmony_ci if (ret < 0) 354462306a36Sopenharmony_ci goto out; 354562306a36Sopenharmony_ci else if (ret > 0) 354662306a36Sopenharmony_ci break; 354762306a36Sopenharmony_ci 354862306a36Sopenharmony_ci leaf = path->nodes[0]; 354962306a36Sopenharmony_ci } 355062306a36Sopenharmony_ci 355162306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 355262306a36Sopenharmony_ci if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) 355362306a36Sopenharmony_ci break; 355462306a36Sopenharmony_ci 355562306a36Sopenharmony_ci extent_end = btrfs_file_extent_end(path); 355662306a36Sopenharmony_ci 355762306a36Sopenharmony_ci /* 355862306a36Sopenharmony_ci * In the first iteration we may have a slot that points to an 355962306a36Sopenharmony_ci * extent that ends before our start offset, so skip it. 356062306a36Sopenharmony_ci */ 356162306a36Sopenharmony_ci if (extent_end <= start) { 356262306a36Sopenharmony_ci path->slots[0]++; 356362306a36Sopenharmony_ci continue; 356462306a36Sopenharmony_ci } 356562306a36Sopenharmony_ci 356662306a36Sopenharmony_ci /* We have an implicit hole, NO_HOLES feature is likely set. */ 356762306a36Sopenharmony_ci if (last_extent_end < key.offset) { 356862306a36Sopenharmony_ci u64 search_start = last_extent_end; 356962306a36Sopenharmony_ci u64 found_start; 357062306a36Sopenharmony_ci 357162306a36Sopenharmony_ci /* 357262306a36Sopenharmony_ci * First iteration, @start matches @offset and it's 357362306a36Sopenharmony_ci * within the hole. 
357462306a36Sopenharmony_ci */ 357562306a36Sopenharmony_ci if (start == offset) 357662306a36Sopenharmony_ci search_start = offset; 357762306a36Sopenharmony_ci 357862306a36Sopenharmony_ci found = find_desired_extent_in_hole(inode, whence, 357962306a36Sopenharmony_ci delalloc_cached_state, 358062306a36Sopenharmony_ci search_start, 358162306a36Sopenharmony_ci key.offset - 1, 358262306a36Sopenharmony_ci &found_start); 358362306a36Sopenharmony_ci if (found) { 358462306a36Sopenharmony_ci start = found_start; 358562306a36Sopenharmony_ci break; 358662306a36Sopenharmony_ci } 358762306a36Sopenharmony_ci /* 358862306a36Sopenharmony_ci * Didn't find data or a hole (due to delalloc) in the 358962306a36Sopenharmony_ci * implicit hole range, so need to analyze the extent. 359062306a36Sopenharmony_ci */ 359162306a36Sopenharmony_ci } 359262306a36Sopenharmony_ci 359362306a36Sopenharmony_ci extent = btrfs_item_ptr(leaf, path->slots[0], 359462306a36Sopenharmony_ci struct btrfs_file_extent_item); 359562306a36Sopenharmony_ci type = btrfs_file_extent_type(leaf, extent); 359662306a36Sopenharmony_ci 359762306a36Sopenharmony_ci /* 359862306a36Sopenharmony_ci * Can't access the extent's disk_bytenr field if this is an 359962306a36Sopenharmony_ci * inline extent, since at that offset, it's where the extent 360062306a36Sopenharmony_ci * data starts. 360162306a36Sopenharmony_ci */ 360262306a36Sopenharmony_ci if (type == BTRFS_FILE_EXTENT_PREALLOC || 360362306a36Sopenharmony_ci (type == BTRFS_FILE_EXTENT_REG && 360462306a36Sopenharmony_ci btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { 360562306a36Sopenharmony_ci /* 360662306a36Sopenharmony_ci * Explicit hole or prealloc extent, search for delalloc. 360762306a36Sopenharmony_ci * A prealloc extent is treated like a hole. 
360862306a36Sopenharmony_ci */ 360962306a36Sopenharmony_ci u64 search_start = key.offset; 361062306a36Sopenharmony_ci u64 found_start; 361162306a36Sopenharmony_ci 361262306a36Sopenharmony_ci /* 361362306a36Sopenharmony_ci * First iteration, @start matches @offset and it's 361462306a36Sopenharmony_ci * within the hole. 361562306a36Sopenharmony_ci */ 361662306a36Sopenharmony_ci if (start == offset) 361762306a36Sopenharmony_ci search_start = offset; 361862306a36Sopenharmony_ci 361962306a36Sopenharmony_ci found = find_desired_extent_in_hole(inode, whence, 362062306a36Sopenharmony_ci delalloc_cached_state, 362162306a36Sopenharmony_ci search_start, 362262306a36Sopenharmony_ci extent_end - 1, 362362306a36Sopenharmony_ci &found_start); 362462306a36Sopenharmony_ci if (found) { 362562306a36Sopenharmony_ci start = found_start; 362662306a36Sopenharmony_ci break; 362762306a36Sopenharmony_ci } 362862306a36Sopenharmony_ci /* 362962306a36Sopenharmony_ci * Didn't find data or a hole (due to delalloc) in the 363062306a36Sopenharmony_ci * implicit hole range, so need to analyze the next 363162306a36Sopenharmony_ci * extent item. 363262306a36Sopenharmony_ci */ 363362306a36Sopenharmony_ci } else { 363462306a36Sopenharmony_ci /* 363562306a36Sopenharmony_ci * Found a regular or inline extent. 363662306a36Sopenharmony_ci * If we are seeking for data, adjust the start offset 363762306a36Sopenharmony_ci * and stop, we're done. 363862306a36Sopenharmony_ci */ 363962306a36Sopenharmony_ci if (whence == SEEK_DATA) { 364062306a36Sopenharmony_ci start = max_t(u64, key.offset, offset); 364162306a36Sopenharmony_ci found = true; 364262306a36Sopenharmony_ci break; 364362306a36Sopenharmony_ci } 364462306a36Sopenharmony_ci /* 364562306a36Sopenharmony_ci * Else, we are seeking for a hole, check the next file 364662306a36Sopenharmony_ci * extent item. 
364762306a36Sopenharmony_ci */ 364862306a36Sopenharmony_ci } 364962306a36Sopenharmony_ci 365062306a36Sopenharmony_ci start = extent_end; 365162306a36Sopenharmony_ci last_extent_end = extent_end; 365262306a36Sopenharmony_ci path->slots[0]++; 365362306a36Sopenharmony_ci if (fatal_signal_pending(current)) { 365462306a36Sopenharmony_ci ret = -EINTR; 365562306a36Sopenharmony_ci goto out; 365662306a36Sopenharmony_ci } 365762306a36Sopenharmony_ci cond_resched(); 365862306a36Sopenharmony_ci } 365962306a36Sopenharmony_ci 366062306a36Sopenharmony_ci /* We have an implicit hole from the last extent found up to i_size. */ 366162306a36Sopenharmony_ci if (!found && start < i_size) { 366262306a36Sopenharmony_ci found = find_desired_extent_in_hole(inode, whence, 366362306a36Sopenharmony_ci delalloc_cached_state, start, 366462306a36Sopenharmony_ci i_size - 1, &start); 366562306a36Sopenharmony_ci if (!found) 366662306a36Sopenharmony_ci start = i_size; 366762306a36Sopenharmony_ci } 366862306a36Sopenharmony_ci 366962306a36Sopenharmony_ciout: 367062306a36Sopenharmony_ci unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); 367162306a36Sopenharmony_ci btrfs_free_path(path); 367262306a36Sopenharmony_ci 367362306a36Sopenharmony_ci if (ret < 0) 367462306a36Sopenharmony_ci return ret; 367562306a36Sopenharmony_ci 367662306a36Sopenharmony_ci if (whence == SEEK_DATA && start >= i_size) 367762306a36Sopenharmony_ci return -ENXIO; 367862306a36Sopenharmony_ci 367962306a36Sopenharmony_ci return min_t(loff_t, start, i_size); 368062306a36Sopenharmony_ci} 368162306a36Sopenharmony_ci 368262306a36Sopenharmony_cistatic loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) 368362306a36Sopenharmony_ci{ 368462306a36Sopenharmony_ci struct inode *inode = file->f_mapping->host; 368562306a36Sopenharmony_ci 368662306a36Sopenharmony_ci switch (whence) { 368762306a36Sopenharmony_ci default: 368862306a36Sopenharmony_ci return generic_file_llseek(file, offset, whence); 
368962306a36Sopenharmony_ci case SEEK_DATA: 369062306a36Sopenharmony_ci case SEEK_HOLE: 369162306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 369262306a36Sopenharmony_ci offset = find_desired_extent(file, offset, whence); 369362306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 369462306a36Sopenharmony_ci break; 369562306a36Sopenharmony_ci } 369662306a36Sopenharmony_ci 369762306a36Sopenharmony_ci if (offset < 0) 369862306a36Sopenharmony_ci return offset; 369962306a36Sopenharmony_ci 370062306a36Sopenharmony_ci return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); 370162306a36Sopenharmony_ci} 370262306a36Sopenharmony_ci 370362306a36Sopenharmony_cistatic int btrfs_file_open(struct inode *inode, struct file *filp) 370462306a36Sopenharmony_ci{ 370562306a36Sopenharmony_ci int ret; 370662306a36Sopenharmony_ci 370762306a36Sopenharmony_ci filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | 370862306a36Sopenharmony_ci FMODE_CAN_ODIRECT; 370962306a36Sopenharmony_ci 371062306a36Sopenharmony_ci ret = fsverity_file_open(inode, filp); 371162306a36Sopenharmony_ci if (ret) 371262306a36Sopenharmony_ci return ret; 371362306a36Sopenharmony_ci return generic_file_open(inode, filp); 371462306a36Sopenharmony_ci} 371562306a36Sopenharmony_ci 371662306a36Sopenharmony_cistatic int check_direct_read(struct btrfs_fs_info *fs_info, 371762306a36Sopenharmony_ci const struct iov_iter *iter, loff_t offset) 371862306a36Sopenharmony_ci{ 371962306a36Sopenharmony_ci int ret; 372062306a36Sopenharmony_ci int i, seg; 372162306a36Sopenharmony_ci 372262306a36Sopenharmony_ci ret = check_direct_IO(fs_info, iter, offset); 372362306a36Sopenharmony_ci if (ret < 0) 372462306a36Sopenharmony_ci return ret; 372562306a36Sopenharmony_ci 372662306a36Sopenharmony_ci if (!iter_is_iovec(iter)) 372762306a36Sopenharmony_ci return 0; 372862306a36Sopenharmony_ci 372962306a36Sopenharmony_ci for (seg = 0; seg < iter->nr_segs; seg++) { 
373062306a36Sopenharmony_ci for (i = seg + 1; i < iter->nr_segs; i++) { 373162306a36Sopenharmony_ci const struct iovec *iov1 = iter_iov(iter) + seg; 373262306a36Sopenharmony_ci const struct iovec *iov2 = iter_iov(iter) + i; 373362306a36Sopenharmony_ci 373462306a36Sopenharmony_ci if (iov1->iov_base == iov2->iov_base) 373562306a36Sopenharmony_ci return -EINVAL; 373662306a36Sopenharmony_ci } 373762306a36Sopenharmony_ci } 373862306a36Sopenharmony_ci return 0; 373962306a36Sopenharmony_ci} 374062306a36Sopenharmony_ci 374162306a36Sopenharmony_cistatic ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) 374262306a36Sopenharmony_ci{ 374362306a36Sopenharmony_ci struct inode *inode = file_inode(iocb->ki_filp); 374462306a36Sopenharmony_ci size_t prev_left = 0; 374562306a36Sopenharmony_ci ssize_t read = 0; 374662306a36Sopenharmony_ci ssize_t ret; 374762306a36Sopenharmony_ci 374862306a36Sopenharmony_ci if (fsverity_active(inode)) 374962306a36Sopenharmony_ci return 0; 375062306a36Sopenharmony_ci 375162306a36Sopenharmony_ci if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) 375262306a36Sopenharmony_ci return 0; 375362306a36Sopenharmony_ci 375462306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 375562306a36Sopenharmony_ciagain: 375662306a36Sopenharmony_ci /* 375762306a36Sopenharmony_ci * This is similar to what we do for direct IO writes, see the comment 375862306a36Sopenharmony_ci * at btrfs_direct_write(), but we also disable page faults in addition 375962306a36Sopenharmony_ci * to disabling them only at the iov_iter level. This is because when 376062306a36Sopenharmony_ci * reading from a hole or prealloc extent, iomap calls iov_iter_zero(), 376162306a36Sopenharmony_ci * which can still trigger page fault ins despite having set ->nofault 376262306a36Sopenharmony_ci * to true of our 'to' iov_iter. 
376362306a36Sopenharmony_ci * 376462306a36Sopenharmony_ci * The difference to direct IO writes is that we deadlock when trying 376562306a36Sopenharmony_ci * to lock the extent range in the inode's tree during he page reads 376662306a36Sopenharmony_ci * triggered by the fault in (while for writes it is due to waiting for 376762306a36Sopenharmony_ci * our own ordered extent). This is because for direct IO reads, 376862306a36Sopenharmony_ci * btrfs_dio_iomap_begin() returns with the extent range locked, which 376962306a36Sopenharmony_ci * is only unlocked in the endio callback (end_bio_extent_readpage()). 377062306a36Sopenharmony_ci */ 377162306a36Sopenharmony_ci pagefault_disable(); 377262306a36Sopenharmony_ci to->nofault = true; 377362306a36Sopenharmony_ci ret = btrfs_dio_read(iocb, to, read); 377462306a36Sopenharmony_ci to->nofault = false; 377562306a36Sopenharmony_ci pagefault_enable(); 377662306a36Sopenharmony_ci 377762306a36Sopenharmony_ci /* No increment (+=) because iomap returns a cumulative value. */ 377862306a36Sopenharmony_ci if (ret > 0) 377962306a36Sopenharmony_ci read = ret; 378062306a36Sopenharmony_ci 378162306a36Sopenharmony_ci if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) { 378262306a36Sopenharmony_ci const size_t left = iov_iter_count(to); 378362306a36Sopenharmony_ci 378462306a36Sopenharmony_ci if (left == prev_left) { 378562306a36Sopenharmony_ci /* 378662306a36Sopenharmony_ci * We didn't make any progress since the last attempt, 378762306a36Sopenharmony_ci * fallback to a buffered read for the remainder of the 378862306a36Sopenharmony_ci * range. This is just to avoid any possibility of looping 378962306a36Sopenharmony_ci * for too long. 379062306a36Sopenharmony_ci */ 379162306a36Sopenharmony_ci ret = read; 379262306a36Sopenharmony_ci } else { 379362306a36Sopenharmony_ci /* 379462306a36Sopenharmony_ci * We made some progress since the last retry or this is 379562306a36Sopenharmony_ci * the first time we are retrying. 
Fault in as many pages 379662306a36Sopenharmony_ci * as possible and retry. 379762306a36Sopenharmony_ci */ 379862306a36Sopenharmony_ci fault_in_iov_iter_writeable(to, left); 379962306a36Sopenharmony_ci prev_left = left; 380062306a36Sopenharmony_ci goto again; 380162306a36Sopenharmony_ci } 380262306a36Sopenharmony_ci } 380362306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); 380462306a36Sopenharmony_ci return ret < 0 ? ret : read; 380562306a36Sopenharmony_ci} 380662306a36Sopenharmony_ci 380762306a36Sopenharmony_cistatic ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 380862306a36Sopenharmony_ci{ 380962306a36Sopenharmony_ci ssize_t ret = 0; 381062306a36Sopenharmony_ci 381162306a36Sopenharmony_ci if (iocb->ki_flags & IOCB_DIRECT) { 381262306a36Sopenharmony_ci ret = btrfs_direct_read(iocb, to); 381362306a36Sopenharmony_ci if (ret < 0 || !iov_iter_count(to) || 381462306a36Sopenharmony_ci iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) 381562306a36Sopenharmony_ci return ret; 381662306a36Sopenharmony_ci } 381762306a36Sopenharmony_ci 381862306a36Sopenharmony_ci return filemap_read(iocb, to, ret); 381962306a36Sopenharmony_ci} 382062306a36Sopenharmony_ci 382162306a36Sopenharmony_ciconst struct file_operations btrfs_file_operations = { 382262306a36Sopenharmony_ci .llseek = btrfs_file_llseek, 382362306a36Sopenharmony_ci .read_iter = btrfs_file_read_iter, 382462306a36Sopenharmony_ci .splice_read = filemap_splice_read, 382562306a36Sopenharmony_ci .write_iter = btrfs_file_write_iter, 382662306a36Sopenharmony_ci .splice_write = iter_file_splice_write, 382762306a36Sopenharmony_ci .mmap = btrfs_file_mmap, 382862306a36Sopenharmony_ci .open = btrfs_file_open, 382962306a36Sopenharmony_ci .release = btrfs_release_file, 383062306a36Sopenharmony_ci .get_unmapped_area = thp_get_unmapped_area, 383162306a36Sopenharmony_ci .fsync = btrfs_sync_file, 383262306a36Sopenharmony_ci .fallocate = btrfs_fallocate, 383362306a36Sopenharmony_ci 
.unlocked_ioctl = btrfs_ioctl, 383462306a36Sopenharmony_ci#ifdef CONFIG_COMPAT 383562306a36Sopenharmony_ci .compat_ioctl = btrfs_compat_ioctl, 383662306a36Sopenharmony_ci#endif 383762306a36Sopenharmony_ci .remap_file_range = btrfs_remap_file_range, 383862306a36Sopenharmony_ci}; 383962306a36Sopenharmony_ci 384062306a36Sopenharmony_ciint btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) 384162306a36Sopenharmony_ci{ 384262306a36Sopenharmony_ci int ret; 384362306a36Sopenharmony_ci 384462306a36Sopenharmony_ci /* 384562306a36Sopenharmony_ci * So with compression we will find and lock a dirty page and clear the 384662306a36Sopenharmony_ci * first one as dirty, setup an async extent, and immediately return 384762306a36Sopenharmony_ci * with the entire range locked but with nobody actually marked with 384862306a36Sopenharmony_ci * writeback. So we can't just filemap_write_and_wait_range() and 384962306a36Sopenharmony_ci * expect it to work since it will just kick off a thread to do the 385062306a36Sopenharmony_ci * actual work. So we need to call filemap_fdatawrite_range _again_ 385162306a36Sopenharmony_ci * since it will wait on the page lock, which won't be unlocked until 385262306a36Sopenharmony_ci * after the pages have been marked as writeback and so we're good to go 385362306a36Sopenharmony_ci * from there. We have to do this otherwise we'll miss the ordered 385462306a36Sopenharmony_ci * extents and that results in badness. Please Josef, do not think you 385562306a36Sopenharmony_ci * know better and pull this out at some point in the future, it is 385662306a36Sopenharmony_ci * right and you are wrong. 
385762306a36Sopenharmony_ci */ 385862306a36Sopenharmony_ci ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 385962306a36Sopenharmony_ci if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 386062306a36Sopenharmony_ci &BTRFS_I(inode)->runtime_flags)) 386162306a36Sopenharmony_ci ret = filemap_fdatawrite_range(inode->i_mapping, start, end); 386262306a36Sopenharmony_ci 386362306a36Sopenharmony_ci return ret; 386462306a36Sopenharmony_ci} 3865