162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci 362306a36Sopenharmony_ci#include <linux/blkdev.h> 462306a36Sopenharmony_ci#include <linux/iversion.h> 562306a36Sopenharmony_ci#include "ctree.h" 662306a36Sopenharmony_ci#include "fs.h" 762306a36Sopenharmony_ci#include "messages.h" 862306a36Sopenharmony_ci#include "compression.h" 962306a36Sopenharmony_ci#include "delalloc-space.h" 1062306a36Sopenharmony_ci#include "disk-io.h" 1162306a36Sopenharmony_ci#include "reflink.h" 1262306a36Sopenharmony_ci#include "transaction.h" 1362306a36Sopenharmony_ci#include "subpage.h" 1462306a36Sopenharmony_ci#include "accessors.h" 1562306a36Sopenharmony_ci#include "file-item.h" 1662306a36Sopenharmony_ci#include "file.h" 1762306a36Sopenharmony_ci#include "super.h" 1862306a36Sopenharmony_ci 1962306a36Sopenharmony_ci#define BTRFS_MAX_DEDUPE_LEN SZ_16M 2062306a36Sopenharmony_ci 2162306a36Sopenharmony_cistatic int clone_finish_inode_update(struct btrfs_trans_handle *trans, 2262306a36Sopenharmony_ci struct inode *inode, 2362306a36Sopenharmony_ci u64 endoff, 2462306a36Sopenharmony_ci const u64 destoff, 2562306a36Sopenharmony_ci const u64 olen, 2662306a36Sopenharmony_ci int no_time_update) 2762306a36Sopenharmony_ci{ 2862306a36Sopenharmony_ci struct btrfs_root *root = BTRFS_I(inode)->root; 2962306a36Sopenharmony_ci int ret; 3062306a36Sopenharmony_ci 3162306a36Sopenharmony_ci inode_inc_iversion(inode); 3262306a36Sopenharmony_ci if (!no_time_update) { 3362306a36Sopenharmony_ci inode->i_mtime = inode_set_ctime_current(inode); 3462306a36Sopenharmony_ci } 3562306a36Sopenharmony_ci /* 3662306a36Sopenharmony_ci * We round up to the block size at eof when determining which 3762306a36Sopenharmony_ci * extents to clone above, but shouldn't round up the file size. 3862306a36Sopenharmony_ci */ 3962306a36Sopenharmony_ci if (endoff > destoff + olen) 4062306a36Sopenharmony_ci endoff = destoff + olen; 4162306a36Sopenharmony_ci if (endoff > inode->i_size) { 4262306a36Sopenharmony_ci i_size_write(inode, endoff); 4362306a36Sopenharmony_ci btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); 4462306a36Sopenharmony_ci } 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_ci ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); 4762306a36Sopenharmony_ci if (ret) { 4862306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 4962306a36Sopenharmony_ci btrfs_end_transaction(trans); 5062306a36Sopenharmony_ci goto out; 5162306a36Sopenharmony_ci } 5262306a36Sopenharmony_ci ret = btrfs_end_transaction(trans); 5362306a36Sopenharmony_ciout: 5462306a36Sopenharmony_ci return ret; 5562306a36Sopenharmony_ci} 5662306a36Sopenharmony_ci 5762306a36Sopenharmony_cistatic int copy_inline_to_page(struct btrfs_inode *inode, 5862306a36Sopenharmony_ci const u64 file_offset, 5962306a36Sopenharmony_ci char *inline_data, 6062306a36Sopenharmony_ci const u64 size, 6162306a36Sopenharmony_ci const u64 datal, 6262306a36Sopenharmony_ci const u8 comp_type) 6362306a36Sopenharmony_ci{ 6462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 6562306a36Sopenharmony_ci const u32 block_size = fs_info->sectorsize; 6662306a36Sopenharmony_ci const u64 range_end = file_offset + block_size - 1; 6762306a36Sopenharmony_ci const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); 6862306a36Sopenharmony_ci char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); 6962306a36Sopenharmony_ci struct extent_changeset *data_reserved = NULL; 7062306a36Sopenharmony_ci struct page *page = NULL; 7162306a36Sopenharmony_ci struct address_space *mapping = inode->vfs_inode.i_mapping; 7262306a36Sopenharmony_ci int ret; 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci ASSERT(IS_ALIGNED(file_offset, block_size)); 7562306a36Sopenharmony_ci 7662306a36Sopenharmony_ci /* 7762306a36Sopenharmony_ci * We have flushed and locked the ranges of the source and destination 7862306a36Sopenharmony_ci * inodes, we also have locked the inodes, so we are safe to do a 7962306a36Sopenharmony_ci * reservation here. Also we must not do the reservation while holding 8062306a36Sopenharmony_ci * a transaction open, otherwise we would deadlock. 8162306a36Sopenharmony_ci */ 8262306a36Sopenharmony_ci ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, 8362306a36Sopenharmony_ci block_size); 8462306a36Sopenharmony_ci if (ret) 8562306a36Sopenharmony_ci goto out; 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ci page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, 8862306a36Sopenharmony_ci btrfs_alloc_write_mask(mapping)); 8962306a36Sopenharmony_ci if (!page) { 9062306a36Sopenharmony_ci ret = -ENOMEM; 9162306a36Sopenharmony_ci goto out_unlock; 9262306a36Sopenharmony_ci } 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_ci ret = set_page_extent_mapped(page); 9562306a36Sopenharmony_ci if (ret < 0) 9662306a36Sopenharmony_ci goto out_unlock; 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci clear_extent_bit(&inode->io_tree, file_offset, range_end, 9962306a36Sopenharmony_ci EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 10062306a36Sopenharmony_ci NULL); 10162306a36Sopenharmony_ci ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); 10262306a36Sopenharmony_ci if (ret) 10362306a36Sopenharmony_ci goto out_unlock; 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_ci /* 10662306a36Sopenharmony_ci * After dirtying the page our caller will need to start a transaction, 10762306a36Sopenharmony_ci * and if we are low on metadata free space, that can cause flushing of 10862306a36Sopenharmony_ci * delalloc for all inodes in order to get metadata space released. 10962306a36Sopenharmony_ci * However we are holding the range locked for the whole duration of 11062306a36Sopenharmony_ci * the clone/dedupe operation, so we may deadlock if that happens and no 11162306a36Sopenharmony_ci * other task releases enough space. So mark this inode as not being 11262306a36Sopenharmony_ci * possible to flush to avoid such deadlock. We will clear that flag 11362306a36Sopenharmony_ci * when we finish cloning all extents, since a transaction is started 11462306a36Sopenharmony_ci * after finding each extent to clone. 11562306a36Sopenharmony_ci */ 11662306a36Sopenharmony_ci set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci if (comp_type == BTRFS_COMPRESS_NONE) { 11962306a36Sopenharmony_ci memcpy_to_page(page, offset_in_page(file_offset), data_start, 12062306a36Sopenharmony_ci datal); 12162306a36Sopenharmony_ci } else { 12262306a36Sopenharmony_ci ret = btrfs_decompress(comp_type, data_start, page, 12362306a36Sopenharmony_ci offset_in_page(file_offset), 12462306a36Sopenharmony_ci inline_size, datal); 12562306a36Sopenharmony_ci if (ret) 12662306a36Sopenharmony_ci goto out_unlock; 12762306a36Sopenharmony_ci flush_dcache_page(page); 12862306a36Sopenharmony_ci } 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci /* 13162306a36Sopenharmony_ci * If our inline data is smaller then the block/page size, then the 13262306a36Sopenharmony_ci * remaining of the block/page is equivalent to zeroes. We had something 13362306a36Sopenharmony_ci * like the following done: 13462306a36Sopenharmony_ci * 13562306a36Sopenharmony_ci * $ xfs_io -f -c "pwrite -S 0xab 0 500" file 13662306a36Sopenharmony_ci * $ sync # (or fsync) 13762306a36Sopenharmony_ci * $ xfs_io -c "falloc 0 4K" file 13862306a36Sopenharmony_ci * $ xfs_io -c "pwrite -S 0xcd 4K 4K" 13962306a36Sopenharmony_ci * 14062306a36Sopenharmony_ci * So what's in the range [500, 4095] corresponds to zeroes. 14162306a36Sopenharmony_ci */ 14262306a36Sopenharmony_ci if (datal < block_size) 14362306a36Sopenharmony_ci memzero_page(page, datal, block_size - datal); 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); 14662306a36Sopenharmony_ci btrfs_page_clear_checked(fs_info, page, file_offset, block_size); 14762306a36Sopenharmony_ci btrfs_page_set_dirty(fs_info, page, file_offset, block_size); 14862306a36Sopenharmony_ciout_unlock: 14962306a36Sopenharmony_ci if (page) { 15062306a36Sopenharmony_ci unlock_page(page); 15162306a36Sopenharmony_ci put_page(page); 15262306a36Sopenharmony_ci } 15362306a36Sopenharmony_ci if (ret) 15462306a36Sopenharmony_ci btrfs_delalloc_release_space(inode, data_reserved, file_offset, 15562306a36Sopenharmony_ci block_size, true); 15662306a36Sopenharmony_ci btrfs_delalloc_release_extents(inode, block_size); 15762306a36Sopenharmony_ciout: 15862306a36Sopenharmony_ci extent_changeset_free(data_reserved); 15962306a36Sopenharmony_ci 16062306a36Sopenharmony_ci return ret; 16162306a36Sopenharmony_ci} 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci/* 16462306a36Sopenharmony_ci * Deal with cloning of inline extents. We try to copy the inline extent from 16562306a36Sopenharmony_ci * the source inode to destination inode when possible. When not possible we 16662306a36Sopenharmony_ci * copy the inline extent's data into the respective page of the inode. 16762306a36Sopenharmony_ci */ 16862306a36Sopenharmony_cistatic int clone_copy_inline_extent(struct inode *dst, 16962306a36Sopenharmony_ci struct btrfs_path *path, 17062306a36Sopenharmony_ci struct btrfs_key *new_key, 17162306a36Sopenharmony_ci const u64 drop_start, 17262306a36Sopenharmony_ci const u64 datal, 17362306a36Sopenharmony_ci const u64 size, 17462306a36Sopenharmony_ci const u8 comp_type, 17562306a36Sopenharmony_ci char *inline_data, 17662306a36Sopenharmony_ci struct btrfs_trans_handle **trans_out) 17762306a36Sopenharmony_ci{ 17862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); 17962306a36Sopenharmony_ci struct btrfs_root *root = BTRFS_I(dst)->root; 18062306a36Sopenharmony_ci const u64 aligned_end = ALIGN(new_key->offset + datal, 18162306a36Sopenharmony_ci fs_info->sectorsize); 18262306a36Sopenharmony_ci struct btrfs_trans_handle *trans = NULL; 18362306a36Sopenharmony_ci struct btrfs_drop_extents_args drop_args = { 0 }; 18462306a36Sopenharmony_ci int ret; 18562306a36Sopenharmony_ci struct btrfs_key key; 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci if (new_key->offset > 0) { 18862306a36Sopenharmony_ci ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, 18962306a36Sopenharmony_ci inline_data, size, datal, comp_type); 19062306a36Sopenharmony_ci goto out; 19162306a36Sopenharmony_ci } 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci key.objectid = btrfs_ino(BTRFS_I(dst)); 19462306a36Sopenharmony_ci key.type = BTRFS_EXTENT_DATA_KEY; 19562306a36Sopenharmony_ci key.offset = 0; 19662306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 19762306a36Sopenharmony_ci if (ret < 0) { 19862306a36Sopenharmony_ci return ret; 19962306a36Sopenharmony_ci } else if (ret > 0) { 20062306a36Sopenharmony_ci if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 20162306a36Sopenharmony_ci ret = btrfs_next_leaf(root, path); 20262306a36Sopenharmony_ci if (ret < 0) 20362306a36Sopenharmony_ci return ret; 20462306a36Sopenharmony_ci else if (ret > 0) 20562306a36Sopenharmony_ci goto copy_inline_extent; 20662306a36Sopenharmony_ci } 20762306a36Sopenharmony_ci btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 20862306a36Sopenharmony_ci if (key.objectid == btrfs_ino(BTRFS_I(dst)) && 20962306a36Sopenharmony_ci key.type == BTRFS_EXTENT_DATA_KEY) { 21062306a36Sopenharmony_ci /* 21162306a36Sopenharmony_ci * There's an implicit hole at file offset 0, copy the 21262306a36Sopenharmony_ci * inline extent's data to the page. 21362306a36Sopenharmony_ci */ 21462306a36Sopenharmony_ci ASSERT(key.offset > 0); 21562306a36Sopenharmony_ci goto copy_to_page; 21662306a36Sopenharmony_ci } 21762306a36Sopenharmony_ci } else if (i_size_read(dst) <= datal) { 21862306a36Sopenharmony_ci struct btrfs_file_extent_item *ei; 21962306a36Sopenharmony_ci 22062306a36Sopenharmony_ci ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 22162306a36Sopenharmony_ci struct btrfs_file_extent_item); 22262306a36Sopenharmony_ci /* 22362306a36Sopenharmony_ci * If it's an inline extent replace it with the source inline 22462306a36Sopenharmony_ci * extent, otherwise copy the source inline extent data into 22562306a36Sopenharmony_ci * the respective page at the destination inode. 22662306a36Sopenharmony_ci */ 22762306a36Sopenharmony_ci if (btrfs_file_extent_type(path->nodes[0], ei) == 22862306a36Sopenharmony_ci BTRFS_FILE_EXTENT_INLINE) 22962306a36Sopenharmony_ci goto copy_inline_extent; 23062306a36Sopenharmony_ci 23162306a36Sopenharmony_ci goto copy_to_page; 23262306a36Sopenharmony_ci } 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_cicopy_inline_extent: 23562306a36Sopenharmony_ci /* 23662306a36Sopenharmony_ci * We have no extent items, or we have an extent at offset 0 which may 23762306a36Sopenharmony_ci * or may not be inlined. All these cases are dealt the same way. 23862306a36Sopenharmony_ci */ 23962306a36Sopenharmony_ci if (i_size_read(dst) > datal) { 24062306a36Sopenharmony_ci /* 24162306a36Sopenharmony_ci * At the destination offset 0 we have either a hole, a regular 24262306a36Sopenharmony_ci * extent or an inline extent larger then the one we want to 24362306a36Sopenharmony_ci * clone. Deal with all these cases by copying the inline extent 24462306a36Sopenharmony_ci * data into the respective page at the destination inode. 24562306a36Sopenharmony_ci */ 24662306a36Sopenharmony_ci goto copy_to_page; 24762306a36Sopenharmony_ci } 24862306a36Sopenharmony_ci 24962306a36Sopenharmony_ci /* 25062306a36Sopenharmony_ci * Release path before starting a new transaction so we don't hold locks 25162306a36Sopenharmony_ci * that would confuse lockdep. 25262306a36Sopenharmony_ci */ 25362306a36Sopenharmony_ci btrfs_release_path(path); 25462306a36Sopenharmony_ci /* 25562306a36Sopenharmony_ci * If we end up here it means were copy the inline extent into a leaf 25662306a36Sopenharmony_ci * of the destination inode. We know we will drop or adjust at most one 25762306a36Sopenharmony_ci * extent item in the destination root. 25862306a36Sopenharmony_ci * 25962306a36Sopenharmony_ci * 1 unit - adjusting old extent (we may have to split it) 26062306a36Sopenharmony_ci * 1 unit - add new extent 26162306a36Sopenharmony_ci * 1 unit - inode update 26262306a36Sopenharmony_ci */ 26362306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 3); 26462306a36Sopenharmony_ci if (IS_ERR(trans)) { 26562306a36Sopenharmony_ci ret = PTR_ERR(trans); 26662306a36Sopenharmony_ci trans = NULL; 26762306a36Sopenharmony_ci goto out; 26862306a36Sopenharmony_ci } 26962306a36Sopenharmony_ci drop_args.path = path; 27062306a36Sopenharmony_ci drop_args.start = drop_start; 27162306a36Sopenharmony_ci drop_args.end = aligned_end; 27262306a36Sopenharmony_ci drop_args.drop_cache = true; 27362306a36Sopenharmony_ci ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); 27462306a36Sopenharmony_ci if (ret) 27562306a36Sopenharmony_ci goto out; 27662306a36Sopenharmony_ci ret = btrfs_insert_empty_item(trans, root, path, new_key, size); 27762306a36Sopenharmony_ci if (ret) 27862306a36Sopenharmony_ci goto out; 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci write_extent_buffer(path->nodes[0], inline_data, 28162306a36Sopenharmony_ci btrfs_item_ptr_offset(path->nodes[0], 28262306a36Sopenharmony_ci path->slots[0]), 28362306a36Sopenharmony_ci size); 28462306a36Sopenharmony_ci btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); 28562306a36Sopenharmony_ci btrfs_set_inode_full_sync(BTRFS_I(dst)); 28662306a36Sopenharmony_ci ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); 28762306a36Sopenharmony_ciout: 28862306a36Sopenharmony_ci if (!ret && !trans) { 28962306a36Sopenharmony_ci /* 29062306a36Sopenharmony_ci * No transaction here means we copied the inline extent into a 29162306a36Sopenharmony_ci * page of the destination inode. 29262306a36Sopenharmony_ci * 29362306a36Sopenharmony_ci * 1 unit to update inode item 29462306a36Sopenharmony_ci */ 29562306a36Sopenharmony_ci trans = btrfs_start_transaction(root, 1); 29662306a36Sopenharmony_ci if (IS_ERR(trans)) { 29762306a36Sopenharmony_ci ret = PTR_ERR(trans); 29862306a36Sopenharmony_ci trans = NULL; 29962306a36Sopenharmony_ci } 30062306a36Sopenharmony_ci } 30162306a36Sopenharmony_ci if (ret && trans) { 30262306a36Sopenharmony_ci btrfs_abort_transaction(trans, ret); 30362306a36Sopenharmony_ci btrfs_end_transaction(trans); 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci if (!ret) 30662306a36Sopenharmony_ci *trans_out = trans; 30762306a36Sopenharmony_ci 30862306a36Sopenharmony_ci return ret; 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_cicopy_to_page: 31162306a36Sopenharmony_ci /* 31262306a36Sopenharmony_ci * Release our path because we don't need it anymore and also because 31362306a36Sopenharmony_ci * copy_inline_to_page() needs to reserve data and metadata, which may 31462306a36Sopenharmony_ci * need to flush delalloc when we are low on available space and 31562306a36Sopenharmony_ci * therefore cause a deadlock if writeback of an inline extent needs to 31662306a36Sopenharmony_ci * write to the same leaf or an ordered extent completion needs to write 31762306a36Sopenharmony_ci * to the same leaf. 31862306a36Sopenharmony_ci */ 31962306a36Sopenharmony_ci btrfs_release_path(path); 32062306a36Sopenharmony_ci 32162306a36Sopenharmony_ci ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, 32262306a36Sopenharmony_ci inline_data, size, datal, comp_type); 32362306a36Sopenharmony_ci goto out; 32462306a36Sopenharmony_ci} 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci/* 32762306a36Sopenharmony_ci * Clone a range from inode file to another. 32862306a36Sopenharmony_ci * 32962306a36Sopenharmony_ci * @src: Inode to clone from 33062306a36Sopenharmony_ci * @inode: Inode to clone to 33162306a36Sopenharmony_ci * @off: Offset within source to start clone from 33262306a36Sopenharmony_ci * @olen: Original length, passed by user, of range to clone 33362306a36Sopenharmony_ci * @olen_aligned: Block-aligned value of olen 33462306a36Sopenharmony_ci * @destoff: Offset within @inode to start clone 33562306a36Sopenharmony_ci * @no_time_update: Whether to update mtime/ctime on the target inode 33662306a36Sopenharmony_ci */ 33762306a36Sopenharmony_cistatic int btrfs_clone(struct inode *src, struct inode *inode, 33862306a36Sopenharmony_ci const u64 off, const u64 olen, const u64 olen_aligned, 33962306a36Sopenharmony_ci const u64 destoff, int no_time_update) 34062306a36Sopenharmony_ci{ 34162306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 34262306a36Sopenharmony_ci struct btrfs_path *path = NULL; 34362306a36Sopenharmony_ci struct extent_buffer *leaf; 34462306a36Sopenharmony_ci struct btrfs_trans_handle *trans; 34562306a36Sopenharmony_ci char *buf = NULL; 34662306a36Sopenharmony_ci struct btrfs_key key; 34762306a36Sopenharmony_ci u32 nritems; 34862306a36Sopenharmony_ci int slot; 34962306a36Sopenharmony_ci int ret; 35062306a36Sopenharmony_ci const u64 len = olen_aligned; 35162306a36Sopenharmony_ci u64 last_dest_end = destoff; 35262306a36Sopenharmony_ci u64 prev_extent_end = off; 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_ci ret = -ENOMEM; 35562306a36Sopenharmony_ci buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); 35662306a36Sopenharmony_ci if (!buf) 35762306a36Sopenharmony_ci return ret; 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci path = btrfs_alloc_path(); 36062306a36Sopenharmony_ci if (!path) { 36162306a36Sopenharmony_ci kvfree(buf); 36262306a36Sopenharmony_ci return ret; 36362306a36Sopenharmony_ci } 36462306a36Sopenharmony_ci 36562306a36Sopenharmony_ci path->reada = READA_FORWARD; 36662306a36Sopenharmony_ci /* Clone data */ 36762306a36Sopenharmony_ci key.objectid = btrfs_ino(BTRFS_I(src)); 36862306a36Sopenharmony_ci key.type = BTRFS_EXTENT_DATA_KEY; 36962306a36Sopenharmony_ci key.offset = off; 37062306a36Sopenharmony_ci 37162306a36Sopenharmony_ci while (1) { 37262306a36Sopenharmony_ci struct btrfs_file_extent_item *extent; 37362306a36Sopenharmony_ci u64 extent_gen; 37462306a36Sopenharmony_ci int type; 37562306a36Sopenharmony_ci u32 size; 37662306a36Sopenharmony_ci struct btrfs_key new_key; 37762306a36Sopenharmony_ci u64 disko = 0, diskl = 0; 37862306a36Sopenharmony_ci u64 datao = 0, datal = 0; 37962306a36Sopenharmony_ci u8 comp; 38062306a36Sopenharmony_ci u64 drop_start; 38162306a36Sopenharmony_ci 38262306a36Sopenharmony_ci /* Note the key will change type as we walk through the tree */ 38362306a36Sopenharmony_ci ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path, 38462306a36Sopenharmony_ci 0, 0); 38562306a36Sopenharmony_ci if (ret < 0) 38662306a36Sopenharmony_ci goto out; 38762306a36Sopenharmony_ci /* 38862306a36Sopenharmony_ci * First search, if no extent item that starts at offset off was 38962306a36Sopenharmony_ci * found but the previous item is an extent item, it's possible 39062306a36Sopenharmony_ci * it might overlap our target range, therefore process it. 39162306a36Sopenharmony_ci */ 39262306a36Sopenharmony_ci if (key.offset == off && ret > 0 && path->slots[0] > 0) { 39362306a36Sopenharmony_ci btrfs_item_key_to_cpu(path->nodes[0], &key, 39462306a36Sopenharmony_ci path->slots[0] - 1); 39562306a36Sopenharmony_ci if (key.type == BTRFS_EXTENT_DATA_KEY) 39662306a36Sopenharmony_ci path->slots[0]--; 39762306a36Sopenharmony_ci } 39862306a36Sopenharmony_ci 39962306a36Sopenharmony_ci nritems = btrfs_header_nritems(path->nodes[0]); 40062306a36Sopenharmony_ciprocess_slot: 40162306a36Sopenharmony_ci if (path->slots[0] >= nritems) { 40262306a36Sopenharmony_ci ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 40362306a36Sopenharmony_ci if (ret < 0) 40462306a36Sopenharmony_ci goto out; 40562306a36Sopenharmony_ci if (ret > 0) 40662306a36Sopenharmony_ci break; 40762306a36Sopenharmony_ci nritems = btrfs_header_nritems(path->nodes[0]); 40862306a36Sopenharmony_ci } 40962306a36Sopenharmony_ci leaf = path->nodes[0]; 41062306a36Sopenharmony_ci slot = path->slots[0]; 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_ci btrfs_item_key_to_cpu(leaf, &key, slot); 41362306a36Sopenharmony_ci if (key.type > BTRFS_EXTENT_DATA_KEY || 41462306a36Sopenharmony_ci key.objectid != btrfs_ino(BTRFS_I(src))) 41562306a36Sopenharmony_ci break; 41662306a36Sopenharmony_ci 41762306a36Sopenharmony_ci ASSERT(key.type == BTRFS_EXTENT_DATA_KEY); 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci extent = btrfs_item_ptr(leaf, slot, 42062306a36Sopenharmony_ci struct btrfs_file_extent_item); 42162306a36Sopenharmony_ci extent_gen = btrfs_file_extent_generation(leaf, extent); 42262306a36Sopenharmony_ci comp = btrfs_file_extent_compression(leaf, extent); 42362306a36Sopenharmony_ci type = btrfs_file_extent_type(leaf, extent); 42462306a36Sopenharmony_ci if (type == BTRFS_FILE_EXTENT_REG || 42562306a36Sopenharmony_ci type == BTRFS_FILE_EXTENT_PREALLOC) { 42662306a36Sopenharmony_ci disko = btrfs_file_extent_disk_bytenr(leaf, extent); 42762306a36Sopenharmony_ci diskl = btrfs_file_extent_disk_num_bytes(leaf, extent); 42862306a36Sopenharmony_ci datao = btrfs_file_extent_offset(leaf, extent); 42962306a36Sopenharmony_ci datal = btrfs_file_extent_num_bytes(leaf, extent); 43062306a36Sopenharmony_ci } else if (type == BTRFS_FILE_EXTENT_INLINE) { 43162306a36Sopenharmony_ci /* Take upper bound, may be compressed */ 43262306a36Sopenharmony_ci datal = btrfs_file_extent_ram_bytes(leaf, extent); 43362306a36Sopenharmony_ci } 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci /* 43662306a36Sopenharmony_ci * The first search might have left us at an extent item that 43762306a36Sopenharmony_ci * ends before our target range's start, can happen if we have 43862306a36Sopenharmony_ci * holes and NO_HOLES feature enabled. 43962306a36Sopenharmony_ci * 44062306a36Sopenharmony_ci * Subsequent searches may leave us on a file range we have 44162306a36Sopenharmony_ci * processed before - this happens due to a race with ordered 44262306a36Sopenharmony_ci * extent completion for a file range that is outside our source 44362306a36Sopenharmony_ci * range, but that range was part of a file extent item that 44462306a36Sopenharmony_ci * also covered a leading part of our source range. 44562306a36Sopenharmony_ci */ 44662306a36Sopenharmony_ci if (key.offset + datal <= prev_extent_end) { 44762306a36Sopenharmony_ci path->slots[0]++; 44862306a36Sopenharmony_ci goto process_slot; 44962306a36Sopenharmony_ci } else if (key.offset >= off + len) { 45062306a36Sopenharmony_ci break; 45162306a36Sopenharmony_ci } 45262306a36Sopenharmony_ci 45362306a36Sopenharmony_ci prev_extent_end = key.offset + datal; 45462306a36Sopenharmony_ci size = btrfs_item_size(leaf, slot); 45562306a36Sopenharmony_ci read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), 45662306a36Sopenharmony_ci size); 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci btrfs_release_path(path); 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_ci memcpy(&new_key, &key, sizeof(new_key)); 46162306a36Sopenharmony_ci new_key.objectid = btrfs_ino(BTRFS_I(inode)); 46262306a36Sopenharmony_ci if (off <= key.offset) 46362306a36Sopenharmony_ci new_key.offset = key.offset + destoff - off; 46462306a36Sopenharmony_ci else 46562306a36Sopenharmony_ci new_key.offset = destoff; 46662306a36Sopenharmony_ci 46762306a36Sopenharmony_ci /* 46862306a36Sopenharmony_ci * Deal with a hole that doesn't have an extent item that 46962306a36Sopenharmony_ci * represents it (NO_HOLES feature enabled). 47062306a36Sopenharmony_ci * This hole is either in the middle of the cloning range or at 47162306a36Sopenharmony_ci * the beginning (fully overlaps it or partially overlaps it). 47262306a36Sopenharmony_ci */ 47362306a36Sopenharmony_ci if (new_key.offset != last_dest_end) 47462306a36Sopenharmony_ci drop_start = last_dest_end; 47562306a36Sopenharmony_ci else 47662306a36Sopenharmony_ci drop_start = new_key.offset; 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci if (type == BTRFS_FILE_EXTENT_REG || 47962306a36Sopenharmony_ci type == BTRFS_FILE_EXTENT_PREALLOC) { 48062306a36Sopenharmony_ci struct btrfs_replace_extent_info clone_info; 48162306a36Sopenharmony_ci 48262306a36Sopenharmony_ci /* 48362306a36Sopenharmony_ci * a | --- range to clone ---| b 48462306a36Sopenharmony_ci * | ------------- extent ------------- | 48562306a36Sopenharmony_ci */ 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci /* Subtract range b */ 48862306a36Sopenharmony_ci if (key.offset + datal > off + len) 48962306a36Sopenharmony_ci datal = off + len - key.offset; 49062306a36Sopenharmony_ci 49162306a36Sopenharmony_ci /* Subtract range a */ 49262306a36Sopenharmony_ci if (off > key.offset) { 49362306a36Sopenharmony_ci datao += off - key.offset; 49462306a36Sopenharmony_ci datal -= off - key.offset; 49562306a36Sopenharmony_ci } 49662306a36Sopenharmony_ci 49762306a36Sopenharmony_ci clone_info.disk_offset = disko; 49862306a36Sopenharmony_ci clone_info.disk_len = diskl; 49962306a36Sopenharmony_ci clone_info.data_offset = datao; 50062306a36Sopenharmony_ci clone_info.data_len = datal; 50162306a36Sopenharmony_ci clone_info.file_offset = new_key.offset; 50262306a36Sopenharmony_ci clone_info.extent_buf = buf; 50362306a36Sopenharmony_ci clone_info.is_new_extent = false; 50462306a36Sopenharmony_ci clone_info.update_times = !no_time_update; 50562306a36Sopenharmony_ci ret = btrfs_replace_file_extents(BTRFS_I(inode), path, 50662306a36Sopenharmony_ci drop_start, new_key.offset + datal - 1, 50762306a36Sopenharmony_ci &clone_info, &trans); 50862306a36Sopenharmony_ci if (ret) 50962306a36Sopenharmony_ci goto out; 51062306a36Sopenharmony_ci } else { 51162306a36Sopenharmony_ci ASSERT(type == BTRFS_FILE_EXTENT_INLINE); 51262306a36Sopenharmony_ci /* 51362306a36Sopenharmony_ci * Inline extents always have to start at file offset 0 51462306a36Sopenharmony_ci * and can never be bigger then the sector size. We can 51562306a36Sopenharmony_ci * never clone only parts of an inline extent, since all 51662306a36Sopenharmony_ci * reflink operations must start at a sector size aligned 51762306a36Sopenharmony_ci * offset, and the length must be aligned too or end at 51862306a36Sopenharmony_ci * the i_size (which implies the whole inlined data). 51962306a36Sopenharmony_ci */ 52062306a36Sopenharmony_ci ASSERT(key.offset == 0); 52162306a36Sopenharmony_ci ASSERT(datal <= fs_info->sectorsize); 52262306a36Sopenharmony_ci if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) || 52362306a36Sopenharmony_ci WARN_ON(key.offset != 0) || 52462306a36Sopenharmony_ci WARN_ON(datal > fs_info->sectorsize)) { 52562306a36Sopenharmony_ci ret = -EUCLEAN; 52662306a36Sopenharmony_ci goto out; 52762306a36Sopenharmony_ci } 52862306a36Sopenharmony_ci 52962306a36Sopenharmony_ci ret = clone_copy_inline_extent(inode, path, &new_key, 53062306a36Sopenharmony_ci drop_start, datal, size, 53162306a36Sopenharmony_ci comp, buf, &trans); 53262306a36Sopenharmony_ci if (ret) 53362306a36Sopenharmony_ci goto out; 53462306a36Sopenharmony_ci } 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci btrfs_release_path(path); 53762306a36Sopenharmony_ci 53862306a36Sopenharmony_ci /* 53962306a36Sopenharmony_ci * Whenever we share an extent we update the last_reflink_trans 54062306a36Sopenharmony_ci * of each inode to the current transaction. This is needed to 54162306a36Sopenharmony_ci * make sure fsync does not log multiple checksum items with 54262306a36Sopenharmony_ci * overlapping ranges (because some extent items might refer 54362306a36Sopenharmony_ci * only to sections of the original extent). For the destination 54462306a36Sopenharmony_ci * inode we do this regardless of the generation of the extents 54562306a36Sopenharmony_ci * or even if they are inline extents or explicit holes, to make 54662306a36Sopenharmony_ci * sure a full fsync does not skip them. For the source inode, 54762306a36Sopenharmony_ci * we only need to update last_reflink_trans in case it's a new 54862306a36Sopenharmony_ci * extent that is not a hole or an inline extent, to deal with 54962306a36Sopenharmony_ci * the checksums problem on fsync. 55062306a36Sopenharmony_ci */ 55162306a36Sopenharmony_ci if (extent_gen == trans->transid && disko > 0) 55262306a36Sopenharmony_ci BTRFS_I(src)->last_reflink_trans = trans->transid; 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci BTRFS_I(inode)->last_reflink_trans = trans->transid; 55562306a36Sopenharmony_ci 55662306a36Sopenharmony_ci last_dest_end = ALIGN(new_key.offset + datal, 55762306a36Sopenharmony_ci fs_info->sectorsize); 55862306a36Sopenharmony_ci ret = clone_finish_inode_update(trans, inode, last_dest_end, 55962306a36Sopenharmony_ci destoff, olen, no_time_update); 56062306a36Sopenharmony_ci if (ret) 56162306a36Sopenharmony_ci goto out; 56262306a36Sopenharmony_ci if (new_key.offset + datal >= destoff + len) 56362306a36Sopenharmony_ci break; 56462306a36Sopenharmony_ci 56562306a36Sopenharmony_ci btrfs_release_path(path); 56662306a36Sopenharmony_ci key.offset = prev_extent_end; 56762306a36Sopenharmony_ci 56862306a36Sopenharmony_ci if (fatal_signal_pending(current)) { 56962306a36Sopenharmony_ci ret = -EINTR; 57062306a36Sopenharmony_ci goto out; 57162306a36Sopenharmony_ci } 57262306a36Sopenharmony_ci 57362306a36Sopenharmony_ci cond_resched(); 57462306a36Sopenharmony_ci } 57562306a36Sopenharmony_ci ret = 0; 57662306a36Sopenharmony_ci 57762306a36Sopenharmony_ci if (last_dest_end < destoff + len) { 57862306a36Sopenharmony_ci /* 57962306a36Sopenharmony_ci * We have an implicit hole that fully or partially overlaps our 58062306a36Sopenharmony_ci * cloning range at its end. This means that we either have the 58162306a36Sopenharmony_ci * NO_HOLES feature enabled or the implicit hole happened due to 58262306a36Sopenharmony_ci * mixing buffered and direct IO writes against this file. 58362306a36Sopenharmony_ci */ 58462306a36Sopenharmony_ci btrfs_release_path(path); 58562306a36Sopenharmony_ci 58662306a36Sopenharmony_ci /* 58762306a36Sopenharmony_ci * When using NO_HOLES and we are cloning a range that covers 58862306a36Sopenharmony_ci * only a hole (no extents) into a range beyond the current 58962306a36Sopenharmony_ci * i_size, punching a hole in the target range will not create 59062306a36Sopenharmony_ci * an extent map defining a hole, because the range starts at or 59162306a36Sopenharmony_ci * beyond current i_size. If the file previously had an i_size 59262306a36Sopenharmony_ci * greater than the new i_size set by this clone operation, we 59362306a36Sopenharmony_ci * need to make sure the next fsync is a full fsync, so that it 59462306a36Sopenharmony_ci * detects and logs a hole covering a range from the current 59562306a36Sopenharmony_ci * i_size to the new i_size. If the clone range covers extents, 59662306a36Sopenharmony_ci * besides a hole, then we know the full sync flag was already 59762306a36Sopenharmony_ci * set by previous calls to btrfs_replace_file_extents() that 59862306a36Sopenharmony_ci * replaced file extent items. 59962306a36Sopenharmony_ci */ 60062306a36Sopenharmony_ci if (last_dest_end >= i_size_read(inode)) 60162306a36Sopenharmony_ci btrfs_set_inode_full_sync(BTRFS_I(inode)); 60262306a36Sopenharmony_ci 60362306a36Sopenharmony_ci ret = btrfs_replace_file_extents(BTRFS_I(inode), path, 60462306a36Sopenharmony_ci last_dest_end, destoff + len - 1, NULL, &trans); 60562306a36Sopenharmony_ci if (ret) 60662306a36Sopenharmony_ci goto out; 60762306a36Sopenharmony_ci 60862306a36Sopenharmony_ci ret = clone_finish_inode_update(trans, inode, destoff + len, 60962306a36Sopenharmony_ci destoff, olen, no_time_update); 61062306a36Sopenharmony_ci } 61162306a36Sopenharmony_ci 61262306a36Sopenharmony_ciout: 61362306a36Sopenharmony_ci btrfs_free_path(path); 61462306a36Sopenharmony_ci kvfree(buf); 61562306a36Sopenharmony_ci clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags); 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci return ret; 61862306a36Sopenharmony_ci} 61962306a36Sopenharmony_ci 62062306a36Sopenharmony_cistatic void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, 62162306a36Sopenharmony_ci struct inode *inode2, u64 loff2, u64 len) 62262306a36Sopenharmony_ci{ 62362306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1, NULL); 62462306a36Sopenharmony_ci unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1, NULL); 62562306a36Sopenharmony_ci} 62662306a36Sopenharmony_ci 62762306a36Sopenharmony_cistatic void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, 62862306a36Sopenharmony_ci struct inode *inode2, u64 loff2, u64 len) 62962306a36Sopenharmony_ci{ 63062306a36Sopenharmony_ci u64 range1_end = loff1 + len - 1; 63162306a36Sopenharmony_ci u64 range2_end = loff2 + len - 1; 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci if (inode1 < inode2) { 63462306a36Sopenharmony_ci swap(inode1, inode2); 63562306a36Sopenharmony_ci swap(loff1, loff2); 63662306a36Sopenharmony_ci swap(range1_end, range2_end); 63762306a36Sopenharmony_ci } else if (inode1 == inode2 && loff2 < loff1) { 63862306a36Sopenharmony_ci swap(loff1, loff2); 63962306a36Sopenharmony_ci swap(range1_end, range2_end); 64062306a36Sopenharmony_ci } 64162306a36Sopenharmony_ci 64262306a36Sopenharmony_ci lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end, NULL); 64362306a36Sopenharmony_ci lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end, NULL); 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_ci btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); 64662306a36Sopenharmony_ci btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); 64762306a36Sopenharmony_ci} 64862306a36Sopenharmony_ci 64962306a36Sopenharmony_cistatic void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) 65062306a36Sopenharmony_ci{ 65162306a36Sopenharmony_ci if (inode1 < inode2) 65262306a36Sopenharmony_ci swap(inode1, inode2); 65362306a36Sopenharmony_ci down_write(&BTRFS_I(inode1)->i_mmap_lock); 65462306a36Sopenharmony_ci down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); 65562306a36Sopenharmony_ci} 65662306a36Sopenharmony_ci 65762306a36Sopenharmony_cistatic void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) 65862306a36Sopenharmony_ci{ 65962306a36Sopenharmony_ci up_write(&BTRFS_I(inode1)->i_mmap_lock); 66062306a36Sopenharmony_ci up_write(&BTRFS_I(inode2)->i_mmap_lock); 66162306a36Sopenharmony_ci} 66262306a36Sopenharmony_ci 66362306a36Sopenharmony_cistatic int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, 66462306a36Sopenharmony_ci struct inode *dst, u64 dst_loff) 66562306a36Sopenharmony_ci{ 66662306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; 66762306a36Sopenharmony_ci const u64 bs = fs_info->sb->s_blocksize; 66862306a36Sopenharmony_ci int ret; 66962306a36Sopenharmony_ci 67062306a36Sopenharmony_ci /* 67162306a36Sopenharmony_ci * Lock destination range to serialize with concurrent readahead() and 67262306a36Sopenharmony_ci * source range to serialize with relocation. 67362306a36Sopenharmony_ci */ 67462306a36Sopenharmony_ci btrfs_double_extent_lock(src, loff, dst, dst_loff, len); 67562306a36Sopenharmony_ci ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); 67662306a36Sopenharmony_ci btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); 67762306a36Sopenharmony_ci 67862306a36Sopenharmony_ci btrfs_btree_balance_dirty(fs_info); 67962306a36Sopenharmony_ci 68062306a36Sopenharmony_ci return ret; 68162306a36Sopenharmony_ci} 68262306a36Sopenharmony_ci 68362306a36Sopenharmony_cistatic int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, 68462306a36Sopenharmony_ci struct inode *dst, u64 dst_loff) 68562306a36Sopenharmony_ci{ 68662306a36Sopenharmony_ci int ret = 0; 68762306a36Sopenharmony_ci u64 i, tail_len, chunk_count; 68862306a36Sopenharmony_ci struct btrfs_root *root_dst = BTRFS_I(dst)->root; 68962306a36Sopenharmony_ci 69062306a36Sopenharmony_ci spin_lock(&root_dst->root_item_lock); 69162306a36Sopenharmony_ci if (root_dst->send_in_progress) { 69262306a36Sopenharmony_ci btrfs_warn_rl(root_dst->fs_info, 69362306a36Sopenharmony_ci"cannot deduplicate to root %llu while send operations are using it (%d in progress)", 69462306a36Sopenharmony_ci root_dst->root_key.objectid, 69562306a36Sopenharmony_ci root_dst->send_in_progress); 69662306a36Sopenharmony_ci spin_unlock(&root_dst->root_item_lock); 69762306a36Sopenharmony_ci return -EAGAIN; 69862306a36Sopenharmony_ci } 69962306a36Sopenharmony_ci root_dst->dedupe_in_progress++; 70062306a36Sopenharmony_ci spin_unlock(&root_dst->root_item_lock); 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_ci tail_len = olen % BTRFS_MAX_DEDUPE_LEN; 70362306a36Sopenharmony_ci chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); 70462306a36Sopenharmony_ci 70562306a36Sopenharmony_ci for (i = 0; i < chunk_count; i++) { 70662306a36Sopenharmony_ci ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, 70762306a36Sopenharmony_ci dst, dst_loff); 70862306a36Sopenharmony_ci if (ret) 70962306a36Sopenharmony_ci goto out; 71062306a36Sopenharmony_ci 71162306a36Sopenharmony_ci loff += BTRFS_MAX_DEDUPE_LEN; 71262306a36Sopenharmony_ci dst_loff += BTRFS_MAX_DEDUPE_LEN; 71362306a36Sopenharmony_ci } 71462306a36Sopenharmony_ci 71562306a36Sopenharmony_ci if (tail_len > 0) 71662306a36Sopenharmony_ci ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); 71762306a36Sopenharmony_ciout: 71862306a36Sopenharmony_ci spin_lock(&root_dst->root_item_lock); 71962306a36Sopenharmony_ci root_dst->dedupe_in_progress--; 72062306a36Sopenharmony_ci spin_unlock(&root_dst->root_item_lock); 72162306a36Sopenharmony_ci 72262306a36Sopenharmony_ci return ret; 72362306a36Sopenharmony_ci} 72462306a36Sopenharmony_ci 72562306a36Sopenharmony_cistatic noinline int btrfs_clone_files(struct file *file, struct file *file_src, 72662306a36Sopenharmony_ci u64 off, u64 olen, u64 destoff) 72762306a36Sopenharmony_ci{ 72862306a36Sopenharmony_ci struct inode *inode = file_inode(file); 72962306a36Sopenharmony_ci struct inode *src = file_inode(file_src); 73062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 73162306a36Sopenharmony_ci int ret; 73262306a36Sopenharmony_ci int wb_ret; 73362306a36Sopenharmony_ci u64 len = olen; 73462306a36Sopenharmony_ci u64 bs = fs_info->sb->s_blocksize; 73562306a36Sopenharmony_ci 73662306a36Sopenharmony_ci /* 73762306a36Sopenharmony_ci * VFS's generic_remap_file_range_prep() protects us from cloning the 73862306a36Sopenharmony_ci * eof block into the middle of a file, which would result in corruption 73962306a36Sopenharmony_ci * if the file size is not blocksize aligned. So we don't need to check 74062306a36Sopenharmony_ci * for that case here. 74162306a36Sopenharmony_ci */ 74262306a36Sopenharmony_ci if (off + len == src->i_size) 74362306a36Sopenharmony_ci len = ALIGN(src->i_size, bs) - off; 74462306a36Sopenharmony_ci 74562306a36Sopenharmony_ci if (destoff > inode->i_size) { 74662306a36Sopenharmony_ci const u64 wb_start = ALIGN_DOWN(inode->i_size, bs); 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci ret = btrfs_cont_expand(BTRFS_I(inode), inode->i_size, destoff); 74962306a36Sopenharmony_ci if (ret) 75062306a36Sopenharmony_ci return ret; 75162306a36Sopenharmony_ci /* 75262306a36Sopenharmony_ci * We may have truncated the last block if the inode's size is 75362306a36Sopenharmony_ci * not sector size aligned, so we need to wait for writeback to 75462306a36Sopenharmony_ci * complete before proceeding further, otherwise we can race 75562306a36Sopenharmony_ci * with cloning and attempt to increment a reference to an 75662306a36Sopenharmony_ci * extent that no longer exists (writeback completed right after 75762306a36Sopenharmony_ci * we found the previous extent covering eof and before we 75862306a36Sopenharmony_ci * attempted to increment its reference count). 75962306a36Sopenharmony_ci */ 76062306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode, wb_start, 76162306a36Sopenharmony_ci destoff - wb_start); 76262306a36Sopenharmony_ci if (ret) 76362306a36Sopenharmony_ci return ret; 76462306a36Sopenharmony_ci } 76562306a36Sopenharmony_ci 76662306a36Sopenharmony_ci /* 76762306a36Sopenharmony_ci * Lock destination range to serialize with concurrent readahead() and 76862306a36Sopenharmony_ci * source range to serialize with relocation. 76962306a36Sopenharmony_ci */ 77062306a36Sopenharmony_ci btrfs_double_extent_lock(src, off, inode, destoff, len); 77162306a36Sopenharmony_ci ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); 77262306a36Sopenharmony_ci btrfs_double_extent_unlock(src, off, inode, destoff, len); 77362306a36Sopenharmony_ci 77462306a36Sopenharmony_ci /* 77562306a36Sopenharmony_ci * We may have copied an inline extent into a page of the destination 77662306a36Sopenharmony_ci * range, so wait for writeback to complete before truncating pages 77762306a36Sopenharmony_ci * from the page cache. This is a rare case. 77862306a36Sopenharmony_ci */ 77962306a36Sopenharmony_ci wb_ret = btrfs_wait_ordered_range(inode, destoff, len); 78062306a36Sopenharmony_ci ret = ret ? ret : wb_ret; 78162306a36Sopenharmony_ci /* 78262306a36Sopenharmony_ci * Truncate page cache pages so that future reads will see the cloned 78362306a36Sopenharmony_ci * data immediately and not the previous data. 78462306a36Sopenharmony_ci */ 78562306a36Sopenharmony_ci truncate_inode_pages_range(&inode->i_data, 78662306a36Sopenharmony_ci round_down(destoff, PAGE_SIZE), 78762306a36Sopenharmony_ci round_up(destoff + len, PAGE_SIZE) - 1); 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_ci btrfs_btree_balance_dirty(fs_info); 79062306a36Sopenharmony_ci 79162306a36Sopenharmony_ci return ret; 79262306a36Sopenharmony_ci} 79362306a36Sopenharmony_ci 79462306a36Sopenharmony_cistatic int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, 79562306a36Sopenharmony_ci struct file *file_out, loff_t pos_out, 79662306a36Sopenharmony_ci loff_t *len, unsigned int remap_flags) 79762306a36Sopenharmony_ci{ 79862306a36Sopenharmony_ci struct inode *inode_in = file_inode(file_in); 79962306a36Sopenharmony_ci struct inode *inode_out = file_inode(file_out); 80062306a36Sopenharmony_ci u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; 80162306a36Sopenharmony_ci u64 wb_len; 80262306a36Sopenharmony_ci int ret; 80362306a36Sopenharmony_ci 80462306a36Sopenharmony_ci if (!(remap_flags & REMAP_FILE_DEDUP)) { 80562306a36Sopenharmony_ci struct btrfs_root *root_out = BTRFS_I(inode_out)->root; 80662306a36Sopenharmony_ci 80762306a36Sopenharmony_ci if (btrfs_root_readonly(root_out)) 80862306a36Sopenharmony_ci return -EROFS; 80962306a36Sopenharmony_ci 81062306a36Sopenharmony_ci ASSERT(inode_in->i_sb == inode_out->i_sb); 81162306a36Sopenharmony_ci } 81262306a36Sopenharmony_ci 81362306a36Sopenharmony_ci /* Don't make the dst file partly checksummed */ 81462306a36Sopenharmony_ci if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != 81562306a36Sopenharmony_ci (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { 81662306a36Sopenharmony_ci return -EINVAL; 81762306a36Sopenharmony_ci } 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci /* 82062306a36Sopenharmony_ci * Now that the inodes are locked, we need to start writeback ourselves 82162306a36Sopenharmony_ci * and can not rely on the writeback from the VFS's generic helper 82262306a36Sopenharmony_ci * generic_remap_file_range_prep() because: 82362306a36Sopenharmony_ci * 82462306a36Sopenharmony_ci * 1) For compression we must call filemap_fdatawrite_range() range 82562306a36Sopenharmony_ci * twice (btrfs_fdatawrite_range() does it for us), and the generic 82662306a36Sopenharmony_ci * helper only calls it once; 82762306a36Sopenharmony_ci * 82862306a36Sopenharmony_ci * 2) filemap_fdatawrite_range(), called by the generic helper only 82962306a36Sopenharmony_ci * waits for the writeback to complete, i.e. for IO to be done, and 83062306a36Sopenharmony_ci * not for the ordered extents to complete. We need to wait for them 83162306a36Sopenharmony_ci * to complete so that new file extent items are in the fs tree. 83262306a36Sopenharmony_ci */ 83362306a36Sopenharmony_ci if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) 83462306a36Sopenharmony_ci wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); 83562306a36Sopenharmony_ci else 83662306a36Sopenharmony_ci wb_len = ALIGN(*len, bs); 83762306a36Sopenharmony_ci 83862306a36Sopenharmony_ci /* 83962306a36Sopenharmony_ci * Workaround to make sure NOCOW buffered write reach disk as NOCOW. 84062306a36Sopenharmony_ci * 84162306a36Sopenharmony_ci * Btrfs' back references do not have a block level granularity, they 84262306a36Sopenharmony_ci * work at the whole extent level. 84362306a36Sopenharmony_ci * NOCOW buffered write without data space reserved may not be able 84462306a36Sopenharmony_ci * to fall back to CoW due to lack of data space, thus could cause 84562306a36Sopenharmony_ci * data loss. 84662306a36Sopenharmony_ci * 84762306a36Sopenharmony_ci * Here we take a shortcut by flushing the whole inode, so that all 84862306a36Sopenharmony_ci * nocow write should reach disk as nocow before we increase the 84962306a36Sopenharmony_ci * reference of the extent. We could do better by only flushing NOCOW 85062306a36Sopenharmony_ci * data, but that needs extra accounting. 85162306a36Sopenharmony_ci * 85262306a36Sopenharmony_ci * Also we don't need to check ASYNC_EXTENT, as async extent will be 85362306a36Sopenharmony_ci * CoWed anyway, not affecting nocow part. 85462306a36Sopenharmony_ci */ 85562306a36Sopenharmony_ci ret = filemap_flush(inode_in->i_mapping); 85662306a36Sopenharmony_ci if (ret < 0) 85762306a36Sopenharmony_ci return ret; 85862306a36Sopenharmony_ci 85962306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), 86062306a36Sopenharmony_ci wb_len); 86162306a36Sopenharmony_ci if (ret < 0) 86262306a36Sopenharmony_ci return ret; 86362306a36Sopenharmony_ci ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), 86462306a36Sopenharmony_ci wb_len); 86562306a36Sopenharmony_ci if (ret < 0) 86662306a36Sopenharmony_ci return ret; 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, 86962306a36Sopenharmony_ci len, remap_flags); 87062306a36Sopenharmony_ci} 87162306a36Sopenharmony_ci 87262306a36Sopenharmony_cistatic bool file_sync_write(const struct file *file) 87362306a36Sopenharmony_ci{ 87462306a36Sopenharmony_ci if (file->f_flags & (__O_SYNC | O_DSYNC)) 87562306a36Sopenharmony_ci return true; 87662306a36Sopenharmony_ci if (IS_SYNC(file_inode(file))) 87762306a36Sopenharmony_ci return true; 87862306a36Sopenharmony_ci 87962306a36Sopenharmony_ci return false; 88062306a36Sopenharmony_ci} 88162306a36Sopenharmony_ci 88262306a36Sopenharmony_ciloff_t btrfs_remap_file_range(struct file *src_file, loff_t off, 88362306a36Sopenharmony_ci struct file *dst_file, loff_t destoff, loff_t len, 88462306a36Sopenharmony_ci unsigned int remap_flags) 88562306a36Sopenharmony_ci{ 88662306a36Sopenharmony_ci struct inode *src_inode = file_inode(src_file); 88762306a36Sopenharmony_ci struct inode *dst_inode = file_inode(dst_file); 88862306a36Sopenharmony_ci bool same_inode = dst_inode == src_inode; 88962306a36Sopenharmony_ci int ret; 89062306a36Sopenharmony_ci 89162306a36Sopenharmony_ci if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) 89262306a36Sopenharmony_ci return -EINVAL; 89362306a36Sopenharmony_ci 89462306a36Sopenharmony_ci if (same_inode) { 89562306a36Sopenharmony_ci btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); 89662306a36Sopenharmony_ci } else { 89762306a36Sopenharmony_ci lock_two_nondirectories(src_inode, dst_inode); 89862306a36Sopenharmony_ci btrfs_double_mmap_lock(src_inode, dst_inode); 89962306a36Sopenharmony_ci } 90062306a36Sopenharmony_ci 90162306a36Sopenharmony_ci ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, 90262306a36Sopenharmony_ci &len, remap_flags); 90362306a36Sopenharmony_ci if (ret < 0 || len == 0) 90462306a36Sopenharmony_ci goto out_unlock; 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci if (remap_flags & REMAP_FILE_DEDUP) 90762306a36Sopenharmony_ci ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); 90862306a36Sopenharmony_ci else 90962306a36Sopenharmony_ci ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); 91062306a36Sopenharmony_ci 91162306a36Sopenharmony_ciout_unlock: 91262306a36Sopenharmony_ci if (same_inode) { 91362306a36Sopenharmony_ci btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); 91462306a36Sopenharmony_ci } else { 91562306a36Sopenharmony_ci btrfs_double_mmap_unlock(src_inode, dst_inode); 91662306a36Sopenharmony_ci unlock_two_nondirectories(src_inode, dst_inode); 91762306a36Sopenharmony_ci } 91862306a36Sopenharmony_ci 91962306a36Sopenharmony_ci /* 92062306a36Sopenharmony_ci * If either the source or the destination file was opened with O_SYNC, 92162306a36Sopenharmony_ci * O_DSYNC or has the S_SYNC attribute, fsync both the destination and 92262306a36Sopenharmony_ci * source files/ranges, so that after a successful return (0) followed 92362306a36Sopenharmony_ci * by a power failure results in the reflinked data to be readable from 92462306a36Sopenharmony_ci * both files/ranges. 92562306a36Sopenharmony_ci */ 92662306a36Sopenharmony_ci if (ret == 0 && len > 0 && 92762306a36Sopenharmony_ci (file_sync_write(src_file) || file_sync_write(dst_file))) { 92862306a36Sopenharmony_ci ret = btrfs_sync_file(src_file, off, off + len - 1, 0); 92962306a36Sopenharmony_ci if (ret == 0) 93062306a36Sopenharmony_ci ret = btrfs_sync_file(dst_file, destoff, 93162306a36Sopenharmony_ci destoff + len - 1, 0); 93262306a36Sopenharmony_ci } 93362306a36Sopenharmony_ci 93462306a36Sopenharmony_ci return ret < 0 ? ret : len; 93562306a36Sopenharmony_ci} 936