// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"

/*
 * Copy on Write of Shared Blocks
 *
 * XFS must preserve "the usual" file semantics even when two files share
 * the same physical blocks.  This means that a write to one file must not
 * alter the blocks in a different file; the way that we'll do that is
 * through the use of a copy-on-write mechanism.  At a high level, that
 * means that when we want to write to a shared block, we allocate a new
 * block, write the data to the new block, and if that succeeds we map the
 * new block into the file.
 *
 * XFS provides a "delayed allocation" mechanism that defers the allocation
 * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
 * possible.  This reduces fragmentation by enabling the filesystem to ask
 * for bigger chunks less often, which is exactly what we want for CoW.
 *
 * The delalloc mechanism begins when the kernel wants to make a block
 * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
 * create a delalloc mapping, which is a regular in-core extent, but without
 * a real startblock.  (For delalloc mappings, the startblock encodes both
 * a flag that this is a delalloc mapping, and a worst-case estimate of how
 * many blocks might be required to put the mapping into the BMBT.)  delalloc
 * mappings are a reservation against the free space in the filesystem;
 * adjacent mappings can also be combined into fewer larger mappings.
 *
 * As an optimization, the CoW extent size hint (cowextsz) creates
 * outsized aligned delalloc reservations in the hope of landing out of
 * order nearby CoW writes in a single extent on disk, thereby reducing
 * fragmentation and improving future performance.
 *
 * D: --RRRRRRSSSRRRRRRRR--- (data fork)
 * C: ------DDDDDDD--------- (CoW fork)
 *
 * When dirty pages are being written out (typically in writepage), the
 * delalloc reservations are converted into unwritten mappings by
 * allocating blocks and replacing the delalloc mapping with real ones.
 * A delalloc mapping can be replaced by several unwritten ones if the
 * free space is fragmented.
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUUUUUU---------
 *
 * We want to adapt the delalloc mechanism for copy-on-write, since the
 * write paths are similar.  The first two steps (creating the reservation
 * and allocating the blocks) are exactly the same as delalloc except that
 * the mappings must be stored in a separate CoW fork because we do not want
 * to disturb the mapping in the data fork until we're sure that the write
 * succeeded.  IO completion in this case is the process of removing the old
 * mapping from the data fork and moving the new mapping from the CoW fork to
 * the data fork.  This will be discussed shortly.
 *
 * For now, unaligned directio writes will be bounced back to the page cache.
 * Block-aligned directio writes will use the same mechanism as buffered
 * writes.
 *
 * Just prior to submitting the actual disk write requests, we convert
 * the extents representing the range of the file actually being written
 * (as opposed to extra pieces created for the cowextsize hint) to real
 * extents.  This will become important in the next step:
 *
 * D: --RRRRRRSSSRRRRRRRR---
 * C: ------UUrrUUU---------
 *
 * CoW remapping must be done after the data block write completes,
 * because we don't want to destroy the old data fork map until we're sure
 * the new block has been written.  Since the new mappings are kept in a
 * separate fork, we can simply iterate these mappings to find the ones
 * that cover the file blocks that we just CoW'd.  For each extent, simply
 * unmap the corresponding range in the data fork, map the new range into
 * the data fork, and remove the extent from the CoW fork.  Because of
 * the presence of the cowextsize hint, however, we must be careful
 * only to remap the blocks that we've actually written out --  we must
 * never remap delalloc reservations nor CoW staging blocks that have
 * yet to be written.  This corresponds exactly to the real extents in
 * the CoW fork:
 *
 * D: --RRRRRRrrSRRRRRRRR---
 * C: ------UU--UUU---------
 *
 * Since the remapping operation can be applied to an arbitrary file
 * range, we record the need for the remap step as a flag in the ioend
 * instead of declaring a new IO type.  This is required for direct io
 * because we only have ioend for the whole dio, and we have to be able to
 * remember the presence of unwritten blocks and CoW blocks with a single
 * ioend structure.  Better yet, the more ground we can cover with one
 * ioend, the better.
 */

/*
 * Given an AG extent, find the lowest-numbered run of shared blocks
 * within that range and return the range in fbno/flen.  If
 * find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks.  If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
12762306a36Sopenharmony_ci */ 12862306a36Sopenharmony_cistatic int 12962306a36Sopenharmony_cixfs_reflink_find_shared( 13062306a36Sopenharmony_ci struct xfs_perag *pag, 13162306a36Sopenharmony_ci struct xfs_trans *tp, 13262306a36Sopenharmony_ci xfs_agblock_t agbno, 13362306a36Sopenharmony_ci xfs_extlen_t aglen, 13462306a36Sopenharmony_ci xfs_agblock_t *fbno, 13562306a36Sopenharmony_ci xfs_extlen_t *flen, 13662306a36Sopenharmony_ci bool find_end_of_shared) 13762306a36Sopenharmony_ci{ 13862306a36Sopenharmony_ci struct xfs_buf *agbp; 13962306a36Sopenharmony_ci struct xfs_btree_cur *cur; 14062306a36Sopenharmony_ci int error; 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_ci error = xfs_alloc_read_agf(pag, tp, 0, &agbp); 14362306a36Sopenharmony_ci if (error) 14462306a36Sopenharmony_ci return error; 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag); 14762306a36Sopenharmony_ci 14862306a36Sopenharmony_ci error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, 14962306a36Sopenharmony_ci find_end_of_shared); 15062306a36Sopenharmony_ci 15162306a36Sopenharmony_ci xfs_btree_del_cursor(cur, error); 15262306a36Sopenharmony_ci 15362306a36Sopenharmony_ci xfs_trans_brelse(tp, agbp); 15462306a36Sopenharmony_ci return error; 15562306a36Sopenharmony_ci} 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_ci/* 15862306a36Sopenharmony_ci * Trim the mapping to the next block where there's a change in the 15962306a36Sopenharmony_ci * shared/unshared status. More specifically, this means that we 16062306a36Sopenharmony_ci * find the lowest-numbered extent of shared blocks that coincides with 16162306a36Sopenharmony_ci * the given block mapping. If the shared extent overlaps the start of 16262306a36Sopenharmony_ci * the mapping, trim the mapping to the end of the shared extent. 
If 16362306a36Sopenharmony_ci * the shared region intersects the mapping, trim the mapping to the 16462306a36Sopenharmony_ci * start of the shared extent. If there are no shared regions that 16562306a36Sopenharmony_ci * overlap, just return the original extent. 16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_ciint 16862306a36Sopenharmony_cixfs_reflink_trim_around_shared( 16962306a36Sopenharmony_ci struct xfs_inode *ip, 17062306a36Sopenharmony_ci struct xfs_bmbt_irec *irec, 17162306a36Sopenharmony_ci bool *shared) 17262306a36Sopenharmony_ci{ 17362306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 17462306a36Sopenharmony_ci struct xfs_perag *pag; 17562306a36Sopenharmony_ci xfs_agblock_t agbno; 17662306a36Sopenharmony_ci xfs_extlen_t aglen; 17762306a36Sopenharmony_ci xfs_agblock_t fbno; 17862306a36Sopenharmony_ci xfs_extlen_t flen; 17962306a36Sopenharmony_ci int error = 0; 18062306a36Sopenharmony_ci 18162306a36Sopenharmony_ci /* Holes, unwritten, and delalloc extents cannot be shared */ 18262306a36Sopenharmony_ci if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { 18362306a36Sopenharmony_ci *shared = false; 18462306a36Sopenharmony_ci return 0; 18562306a36Sopenharmony_ci } 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci trace_xfs_reflink_trim_around_shared(ip, irec); 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); 19062306a36Sopenharmony_ci agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); 19162306a36Sopenharmony_ci aglen = irec->br_blockcount; 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, 19462306a36Sopenharmony_ci true); 19562306a36Sopenharmony_ci xfs_perag_put(pag); 19662306a36Sopenharmony_ci if (error) 19762306a36Sopenharmony_ci return error; 19862306a36Sopenharmony_ci 19962306a36Sopenharmony_ci *shared = false; 20062306a36Sopenharmony_ci if (fbno == NULLAGBLOCK) { 
20162306a36Sopenharmony_ci /* No shared blocks at all. */ 20262306a36Sopenharmony_ci return 0; 20362306a36Sopenharmony_ci } 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ci if (fbno == agbno) { 20662306a36Sopenharmony_ci /* 20762306a36Sopenharmony_ci * The start of this extent is shared. Truncate the 20862306a36Sopenharmony_ci * mapping at the end of the shared region so that a 20962306a36Sopenharmony_ci * subsequent iteration starts at the start of the 21062306a36Sopenharmony_ci * unshared region. 21162306a36Sopenharmony_ci */ 21262306a36Sopenharmony_ci irec->br_blockcount = flen; 21362306a36Sopenharmony_ci *shared = true; 21462306a36Sopenharmony_ci return 0; 21562306a36Sopenharmony_ci } 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci /* 21862306a36Sopenharmony_ci * There's a shared extent midway through this extent. 21962306a36Sopenharmony_ci * Truncate the mapping at the start of the shared 22062306a36Sopenharmony_ci * extent so that a subsequent iteration starts at the 22162306a36Sopenharmony_ci * start of the shared region. 22262306a36Sopenharmony_ci */ 22362306a36Sopenharmony_ci irec->br_blockcount = fbno - agbno; 22462306a36Sopenharmony_ci return 0; 22562306a36Sopenharmony_ci} 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ciint 22862306a36Sopenharmony_cixfs_bmap_trim_cow( 22962306a36Sopenharmony_ci struct xfs_inode *ip, 23062306a36Sopenharmony_ci struct xfs_bmbt_irec *imap, 23162306a36Sopenharmony_ci bool *shared) 23262306a36Sopenharmony_ci{ 23362306a36Sopenharmony_ci /* We can't update any real extents in always COW mode. */ 23462306a36Sopenharmony_ci if (xfs_is_always_cow_inode(ip) && 23562306a36Sopenharmony_ci !isnullstartblock(imap->br_startblock)) { 23662306a36Sopenharmony_ci *shared = true; 23762306a36Sopenharmony_ci return 0; 23862306a36Sopenharmony_ci } 23962306a36Sopenharmony_ci 24062306a36Sopenharmony_ci /* Trim the mapping to the nearest shared extent boundary. 
*/ 24162306a36Sopenharmony_ci return xfs_reflink_trim_around_shared(ip, imap, shared); 24262306a36Sopenharmony_ci} 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_cistatic int 24562306a36Sopenharmony_cixfs_reflink_convert_cow_locked( 24662306a36Sopenharmony_ci struct xfs_inode *ip, 24762306a36Sopenharmony_ci xfs_fileoff_t offset_fsb, 24862306a36Sopenharmony_ci xfs_filblks_t count_fsb) 24962306a36Sopenharmony_ci{ 25062306a36Sopenharmony_ci struct xfs_iext_cursor icur; 25162306a36Sopenharmony_ci struct xfs_bmbt_irec got; 25262306a36Sopenharmony_ci struct xfs_btree_cur *dummy_cur = NULL; 25362306a36Sopenharmony_ci int dummy_logflags; 25462306a36Sopenharmony_ci int error = 0; 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) 25762306a36Sopenharmony_ci return 0; 25862306a36Sopenharmony_ci 25962306a36Sopenharmony_ci do { 26062306a36Sopenharmony_ci if (got.br_startoff >= offset_fsb + count_fsb) 26162306a36Sopenharmony_ci break; 26262306a36Sopenharmony_ci if (got.br_state == XFS_EXT_NORM) 26362306a36Sopenharmony_ci continue; 26462306a36Sopenharmony_ci if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) 26562306a36Sopenharmony_ci return -EIO; 26662306a36Sopenharmony_ci 26762306a36Sopenharmony_ci xfs_trim_extent(&got, offset_fsb, count_fsb); 26862306a36Sopenharmony_ci if (!got.br_blockcount) 26962306a36Sopenharmony_ci continue; 27062306a36Sopenharmony_ci 27162306a36Sopenharmony_ci got.br_state = XFS_EXT_NORM; 27262306a36Sopenharmony_ci error = xfs_bmap_add_extent_unwritten_real(NULL, ip, 27362306a36Sopenharmony_ci XFS_COW_FORK, &icur, &dummy_cur, &got, 27462306a36Sopenharmony_ci &dummy_logflags); 27562306a36Sopenharmony_ci if (error) 27662306a36Sopenharmony_ci return error; 27762306a36Sopenharmony_ci } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); 27862306a36Sopenharmony_ci 27962306a36Sopenharmony_ci return error; 28062306a36Sopenharmony_ci} 28162306a36Sopenharmony_ci 
28262306a36Sopenharmony_ci/* Convert all of the unwritten CoW extents in a file's range to real ones. */ 28362306a36Sopenharmony_ciint 28462306a36Sopenharmony_cixfs_reflink_convert_cow( 28562306a36Sopenharmony_ci struct xfs_inode *ip, 28662306a36Sopenharmony_ci xfs_off_t offset, 28762306a36Sopenharmony_ci xfs_off_t count) 28862306a36Sopenharmony_ci{ 28962306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 29062306a36Sopenharmony_ci xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); 29162306a36Sopenharmony_ci xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); 29262306a36Sopenharmony_ci xfs_filblks_t count_fsb = end_fsb - offset_fsb; 29362306a36Sopenharmony_ci int error; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci ASSERT(count != 0); 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_ci xfs_ilock(ip, XFS_ILOCK_EXCL); 29862306a36Sopenharmony_ci error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); 29962306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 30062306a36Sopenharmony_ci return error; 30162306a36Sopenharmony_ci} 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci/* 30462306a36Sopenharmony_ci * Find the extent that maps the given range in the COW fork. Even if the extent 30562306a36Sopenharmony_ci * is not shared we might have a preallocation for it in the COW fork. If so we 30662306a36Sopenharmony_ci * use it that rather than trigger a new allocation. 
30762306a36Sopenharmony_ci */ 30862306a36Sopenharmony_cistatic int 30962306a36Sopenharmony_cixfs_find_trim_cow_extent( 31062306a36Sopenharmony_ci struct xfs_inode *ip, 31162306a36Sopenharmony_ci struct xfs_bmbt_irec *imap, 31262306a36Sopenharmony_ci struct xfs_bmbt_irec *cmap, 31362306a36Sopenharmony_ci bool *shared, 31462306a36Sopenharmony_ci bool *found) 31562306a36Sopenharmony_ci{ 31662306a36Sopenharmony_ci xfs_fileoff_t offset_fsb = imap->br_startoff; 31762306a36Sopenharmony_ci xfs_filblks_t count_fsb = imap->br_blockcount; 31862306a36Sopenharmony_ci struct xfs_iext_cursor icur; 31962306a36Sopenharmony_ci 32062306a36Sopenharmony_ci *found = false; 32162306a36Sopenharmony_ci 32262306a36Sopenharmony_ci /* 32362306a36Sopenharmony_ci * If we don't find an overlapping extent, trim the range we need to 32462306a36Sopenharmony_ci * allocate to fit the hole we found. 32562306a36Sopenharmony_ci */ 32662306a36Sopenharmony_ci if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap)) 32762306a36Sopenharmony_ci cmap->br_startoff = offset_fsb + count_fsb; 32862306a36Sopenharmony_ci if (cmap->br_startoff > offset_fsb) { 32962306a36Sopenharmony_ci xfs_trim_extent(imap, imap->br_startoff, 33062306a36Sopenharmony_ci cmap->br_startoff - imap->br_startoff); 33162306a36Sopenharmony_ci return xfs_bmap_trim_cow(ip, imap, shared); 33262306a36Sopenharmony_ci } 33362306a36Sopenharmony_ci 33462306a36Sopenharmony_ci *shared = true; 33562306a36Sopenharmony_ci if (isnullstartblock(cmap->br_startblock)) { 33662306a36Sopenharmony_ci xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount); 33762306a36Sopenharmony_ci return 0; 33862306a36Sopenharmony_ci } 33962306a36Sopenharmony_ci 34062306a36Sopenharmony_ci /* real extent found - no need to allocate */ 34162306a36Sopenharmony_ci xfs_trim_extent(cmap, offset_fsb, count_fsb); 34262306a36Sopenharmony_ci *found = true; 34362306a36Sopenharmony_ci return 0; 34462306a36Sopenharmony_ci} 34562306a36Sopenharmony_ci 
34662306a36Sopenharmony_cistatic int 34762306a36Sopenharmony_cixfs_reflink_convert_unwritten( 34862306a36Sopenharmony_ci struct xfs_inode *ip, 34962306a36Sopenharmony_ci struct xfs_bmbt_irec *imap, 35062306a36Sopenharmony_ci struct xfs_bmbt_irec *cmap, 35162306a36Sopenharmony_ci bool convert_now) 35262306a36Sopenharmony_ci{ 35362306a36Sopenharmony_ci xfs_fileoff_t offset_fsb = imap->br_startoff; 35462306a36Sopenharmony_ci xfs_filblks_t count_fsb = imap->br_blockcount; 35562306a36Sopenharmony_ci int error; 35662306a36Sopenharmony_ci 35762306a36Sopenharmony_ci /* 35862306a36Sopenharmony_ci * cmap might larger than imap due to cowextsize hint. 35962306a36Sopenharmony_ci */ 36062306a36Sopenharmony_ci xfs_trim_extent(cmap, offset_fsb, count_fsb); 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_ci /* 36362306a36Sopenharmony_ci * COW fork extents are supposed to remain unwritten until we're ready 36462306a36Sopenharmony_ci * to initiate a disk write. For direct I/O we are going to write the 36562306a36Sopenharmony_ci * data and need the conversion, but for buffered writes we're done. 
36662306a36Sopenharmony_ci */ 36762306a36Sopenharmony_ci if (!convert_now || cmap->br_state == XFS_EXT_NORM) 36862306a36Sopenharmony_ci return 0; 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ci trace_xfs_reflink_convert_cow(ip, cmap); 37162306a36Sopenharmony_ci 37262306a36Sopenharmony_ci error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); 37362306a36Sopenharmony_ci if (!error) 37462306a36Sopenharmony_ci cmap->br_state = XFS_EXT_NORM; 37562306a36Sopenharmony_ci 37662306a36Sopenharmony_ci return error; 37762306a36Sopenharmony_ci} 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_cistatic int 38062306a36Sopenharmony_cixfs_reflink_fill_cow_hole( 38162306a36Sopenharmony_ci struct xfs_inode *ip, 38262306a36Sopenharmony_ci struct xfs_bmbt_irec *imap, 38362306a36Sopenharmony_ci struct xfs_bmbt_irec *cmap, 38462306a36Sopenharmony_ci bool *shared, 38562306a36Sopenharmony_ci uint *lockmode, 38662306a36Sopenharmony_ci bool convert_now) 38762306a36Sopenharmony_ci{ 38862306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 38962306a36Sopenharmony_ci struct xfs_trans *tp; 39062306a36Sopenharmony_ci xfs_filblks_t resaligned; 39162306a36Sopenharmony_ci xfs_extlen_t resblks; 39262306a36Sopenharmony_ci int nimaps; 39362306a36Sopenharmony_ci int error; 39462306a36Sopenharmony_ci bool found; 39562306a36Sopenharmony_ci 39662306a36Sopenharmony_ci resaligned = xfs_aligned_fsb_count(imap->br_startoff, 39762306a36Sopenharmony_ci imap->br_blockcount, xfs_get_cowextsz_hint(ip)); 39862306a36Sopenharmony_ci resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); 39962306a36Sopenharmony_ci 40062306a36Sopenharmony_ci xfs_iunlock(ip, *lockmode); 40162306a36Sopenharmony_ci *lockmode = 0; 40262306a36Sopenharmony_ci 40362306a36Sopenharmony_ci error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, 40462306a36Sopenharmony_ci false, &tp); 40562306a36Sopenharmony_ci if (error) 40662306a36Sopenharmony_ci return error; 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_ci 
*lockmode = XFS_ILOCK_EXCL; 40962306a36Sopenharmony_ci 41062306a36Sopenharmony_ci error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); 41162306a36Sopenharmony_ci if (error || !*shared) 41262306a36Sopenharmony_ci goto out_trans_cancel; 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci if (found) { 41562306a36Sopenharmony_ci xfs_trans_cancel(tp); 41662306a36Sopenharmony_ci goto convert; 41762306a36Sopenharmony_ci } 41862306a36Sopenharmony_ci 41962306a36Sopenharmony_ci /* Allocate the entire reservation as unwritten blocks. */ 42062306a36Sopenharmony_ci nimaps = 1; 42162306a36Sopenharmony_ci error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, 42262306a36Sopenharmony_ci XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap, 42362306a36Sopenharmony_ci &nimaps); 42462306a36Sopenharmony_ci if (error) 42562306a36Sopenharmony_ci goto out_trans_cancel; 42662306a36Sopenharmony_ci 42762306a36Sopenharmony_ci xfs_inode_set_cowblocks_tag(ip); 42862306a36Sopenharmony_ci error = xfs_trans_commit(tp); 42962306a36Sopenharmony_ci if (error) 43062306a36Sopenharmony_ci return error; 43162306a36Sopenharmony_ci 43262306a36Sopenharmony_ci /* 43362306a36Sopenharmony_ci * Allocation succeeded but the requested range was not even partially 43462306a36Sopenharmony_ci * satisfied? Bail out! 
43562306a36Sopenharmony_ci */ 43662306a36Sopenharmony_ci if (nimaps == 0) 43762306a36Sopenharmony_ci return -ENOSPC; 43862306a36Sopenharmony_ci 43962306a36Sopenharmony_ciconvert: 44062306a36Sopenharmony_ci return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); 44162306a36Sopenharmony_ci 44262306a36Sopenharmony_ciout_trans_cancel: 44362306a36Sopenharmony_ci xfs_trans_cancel(tp); 44462306a36Sopenharmony_ci return error; 44562306a36Sopenharmony_ci} 44662306a36Sopenharmony_ci 44762306a36Sopenharmony_cistatic int 44862306a36Sopenharmony_cixfs_reflink_fill_delalloc( 44962306a36Sopenharmony_ci struct xfs_inode *ip, 45062306a36Sopenharmony_ci struct xfs_bmbt_irec *imap, 45162306a36Sopenharmony_ci struct xfs_bmbt_irec *cmap, 45262306a36Sopenharmony_ci bool *shared, 45362306a36Sopenharmony_ci uint *lockmode, 45462306a36Sopenharmony_ci bool convert_now) 45562306a36Sopenharmony_ci{ 45662306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 45762306a36Sopenharmony_ci struct xfs_trans *tp; 45862306a36Sopenharmony_ci int nimaps; 45962306a36Sopenharmony_ci int error; 46062306a36Sopenharmony_ci bool found; 46162306a36Sopenharmony_ci 46262306a36Sopenharmony_ci do { 46362306a36Sopenharmony_ci xfs_iunlock(ip, *lockmode); 46462306a36Sopenharmony_ci *lockmode = 0; 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_ci error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0, 46762306a36Sopenharmony_ci false, &tp); 46862306a36Sopenharmony_ci if (error) 46962306a36Sopenharmony_ci return error; 47062306a36Sopenharmony_ci 47162306a36Sopenharmony_ci *lockmode = XFS_ILOCK_EXCL; 47262306a36Sopenharmony_ci 47362306a36Sopenharmony_ci error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, 47462306a36Sopenharmony_ci &found); 47562306a36Sopenharmony_ci if (error || !*shared) 47662306a36Sopenharmony_ci goto out_trans_cancel; 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci if (found) { 47962306a36Sopenharmony_ci xfs_trans_cancel(tp); 48062306a36Sopenharmony_ci break; 
48162306a36Sopenharmony_ci } 48262306a36Sopenharmony_ci 48362306a36Sopenharmony_ci ASSERT(isnullstartblock(cmap->br_startblock) || 48462306a36Sopenharmony_ci cmap->br_startblock == DELAYSTARTBLOCK); 48562306a36Sopenharmony_ci 48662306a36Sopenharmony_ci /* 48762306a36Sopenharmony_ci * Replace delalloc reservation with an unwritten extent. 48862306a36Sopenharmony_ci */ 48962306a36Sopenharmony_ci nimaps = 1; 49062306a36Sopenharmony_ci error = xfs_bmapi_write(tp, ip, cmap->br_startoff, 49162306a36Sopenharmony_ci cmap->br_blockcount, 49262306a36Sopenharmony_ci XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, 49362306a36Sopenharmony_ci cmap, &nimaps); 49462306a36Sopenharmony_ci if (error) 49562306a36Sopenharmony_ci goto out_trans_cancel; 49662306a36Sopenharmony_ci 49762306a36Sopenharmony_ci xfs_inode_set_cowblocks_tag(ip); 49862306a36Sopenharmony_ci error = xfs_trans_commit(tp); 49962306a36Sopenharmony_ci if (error) 50062306a36Sopenharmony_ci return error; 50162306a36Sopenharmony_ci 50262306a36Sopenharmony_ci /* 50362306a36Sopenharmony_ci * Allocation succeeded but the requested range was not even 50462306a36Sopenharmony_ci * partially satisfied? Bail out! 50562306a36Sopenharmony_ci */ 50662306a36Sopenharmony_ci if (nimaps == 0) 50762306a36Sopenharmony_ci return -ENOSPC; 50862306a36Sopenharmony_ci } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); 50962306a36Sopenharmony_ci 51062306a36Sopenharmony_ci return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); 51162306a36Sopenharmony_ci 51262306a36Sopenharmony_ciout_trans_cancel: 51362306a36Sopenharmony_ci xfs_trans_cancel(tp); 51462306a36Sopenharmony_ci return error; 51562306a36Sopenharmony_ci} 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci/* Allocate all CoW reservations covering a range of blocks in a file. 
*/ 51862306a36Sopenharmony_ciint 51962306a36Sopenharmony_cixfs_reflink_allocate_cow( 52062306a36Sopenharmony_ci struct xfs_inode *ip, 52162306a36Sopenharmony_ci struct xfs_bmbt_irec *imap, 52262306a36Sopenharmony_ci struct xfs_bmbt_irec *cmap, 52362306a36Sopenharmony_ci bool *shared, 52462306a36Sopenharmony_ci uint *lockmode, 52562306a36Sopenharmony_ci bool convert_now) 52662306a36Sopenharmony_ci{ 52762306a36Sopenharmony_ci int error; 52862306a36Sopenharmony_ci bool found; 52962306a36Sopenharmony_ci 53062306a36Sopenharmony_ci ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 53162306a36Sopenharmony_ci if (!ip->i_cowfp) { 53262306a36Sopenharmony_ci ASSERT(!xfs_is_reflink_inode(ip)); 53362306a36Sopenharmony_ci xfs_ifork_init_cow(ip); 53462306a36Sopenharmony_ci } 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ci error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found); 53762306a36Sopenharmony_ci if (error || !*shared) 53862306a36Sopenharmony_ci return error; 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci /* CoW fork has a real extent */ 54162306a36Sopenharmony_ci if (found) 54262306a36Sopenharmony_ci return xfs_reflink_convert_unwritten(ip, imap, cmap, 54362306a36Sopenharmony_ci convert_now); 54462306a36Sopenharmony_ci 54562306a36Sopenharmony_ci /* 54662306a36Sopenharmony_ci * CoW fork does not have an extent and data extent is shared. 54762306a36Sopenharmony_ci * Allocate a real extent in the CoW fork. 54862306a36Sopenharmony_ci */ 54962306a36Sopenharmony_ci if (cmap->br_startoff > imap->br_startoff) 55062306a36Sopenharmony_ci return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared, 55162306a36Sopenharmony_ci lockmode, convert_now); 55262306a36Sopenharmony_ci 55362306a36Sopenharmony_ci /* 55462306a36Sopenharmony_ci * CoW fork has a delalloc reservation. Replace it with a real extent. 55562306a36Sopenharmony_ci * There may or may not be a data fork mapping. 
55662306a36Sopenharmony_ci */ 55762306a36Sopenharmony_ci if (isnullstartblock(cmap->br_startblock) || 55862306a36Sopenharmony_ci cmap->br_startblock == DELAYSTARTBLOCK) 55962306a36Sopenharmony_ci return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, 56062306a36Sopenharmony_ci lockmode, convert_now); 56162306a36Sopenharmony_ci 56262306a36Sopenharmony_ci /* Shouldn't get here. */ 56362306a36Sopenharmony_ci ASSERT(0); 56462306a36Sopenharmony_ci return -EFSCORRUPTED; 56562306a36Sopenharmony_ci} 56662306a36Sopenharmony_ci 56762306a36Sopenharmony_ci/* 56862306a36Sopenharmony_ci * Cancel CoW reservations for some block range of an inode. 56962306a36Sopenharmony_ci * 57062306a36Sopenharmony_ci * If cancel_real is true this function cancels all COW fork extents for the 57162306a36Sopenharmony_ci * inode; if cancel_real is false, real extents are not cleared. 57262306a36Sopenharmony_ci * 57362306a36Sopenharmony_ci * Caller must have already joined the inode to the current transaction. The 57462306a36Sopenharmony_ci * inode will be joined to the transaction returned to the caller. 
57562306a36Sopenharmony_ci */ 57662306a36Sopenharmony_ciint 57762306a36Sopenharmony_cixfs_reflink_cancel_cow_blocks( 57862306a36Sopenharmony_ci struct xfs_inode *ip, 57962306a36Sopenharmony_ci struct xfs_trans **tpp, 58062306a36Sopenharmony_ci xfs_fileoff_t offset_fsb, 58162306a36Sopenharmony_ci xfs_fileoff_t end_fsb, 58262306a36Sopenharmony_ci bool cancel_real) 58362306a36Sopenharmony_ci{ 58462306a36Sopenharmony_ci struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); 58562306a36Sopenharmony_ci struct xfs_bmbt_irec got, del; 58662306a36Sopenharmony_ci struct xfs_iext_cursor icur; 58762306a36Sopenharmony_ci int error = 0; 58862306a36Sopenharmony_ci 58962306a36Sopenharmony_ci if (!xfs_inode_has_cow_data(ip)) 59062306a36Sopenharmony_ci return 0; 59162306a36Sopenharmony_ci if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) 59262306a36Sopenharmony_ci return 0; 59362306a36Sopenharmony_ci 59462306a36Sopenharmony_ci /* Walk backwards until we're out of the I/O range... */ 59562306a36Sopenharmony_ci while (got.br_startoff + got.br_blockcount > offset_fsb) { 59662306a36Sopenharmony_ci del = got; 59762306a36Sopenharmony_ci xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); 59862306a36Sopenharmony_ci 59962306a36Sopenharmony_ci /* Extent delete may have bumped ext forward */ 60062306a36Sopenharmony_ci if (!del.br_blockcount) { 60162306a36Sopenharmony_ci xfs_iext_prev(ifp, &icur); 60262306a36Sopenharmony_ci goto next_extent; 60362306a36Sopenharmony_ci } 60462306a36Sopenharmony_ci 60562306a36Sopenharmony_ci trace_xfs_reflink_cancel_cow(ip, &del); 60662306a36Sopenharmony_ci 60762306a36Sopenharmony_ci if (isnullstartblock(del.br_startblock)) { 60862306a36Sopenharmony_ci error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, 60962306a36Sopenharmony_ci &icur, &got, &del); 61062306a36Sopenharmony_ci if (error) 61162306a36Sopenharmony_ci break; 61262306a36Sopenharmony_ci } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { 61362306a36Sopenharmony_ci 
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); 61462306a36Sopenharmony_ci 61562306a36Sopenharmony_ci /* Free the CoW orphan record. */ 61662306a36Sopenharmony_ci xfs_refcount_free_cow_extent(*tpp, del.br_startblock, 61762306a36Sopenharmony_ci del.br_blockcount); 61862306a36Sopenharmony_ci 61962306a36Sopenharmony_ci error = xfs_free_extent_later(*tpp, del.br_startblock, 62062306a36Sopenharmony_ci del.br_blockcount, NULL, 62162306a36Sopenharmony_ci XFS_AG_RESV_NONE); 62262306a36Sopenharmony_ci if (error) 62362306a36Sopenharmony_ci break; 62462306a36Sopenharmony_ci 62562306a36Sopenharmony_ci /* Roll the transaction */ 62662306a36Sopenharmony_ci error = xfs_defer_finish(tpp); 62762306a36Sopenharmony_ci if (error) 62862306a36Sopenharmony_ci break; 62962306a36Sopenharmony_ci 63062306a36Sopenharmony_ci /* Remove the mapping from the CoW fork. */ 63162306a36Sopenharmony_ci xfs_bmap_del_extent_cow(ip, &icur, &got, &del); 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci /* Remove the quota reservation */ 63462306a36Sopenharmony_ci error = xfs_quota_unreserve_blkres(ip, 63562306a36Sopenharmony_ci del.br_blockcount); 63662306a36Sopenharmony_ci if (error) 63762306a36Sopenharmony_ci break; 63862306a36Sopenharmony_ci } else { 63962306a36Sopenharmony_ci /* Didn't do anything, push cursor back. */ 64062306a36Sopenharmony_ci xfs_iext_prev(ifp, &icur); 64162306a36Sopenharmony_ci } 64262306a36Sopenharmony_cinext_extent: 64362306a36Sopenharmony_ci if (!xfs_iext_get_extent(ifp, &icur, &got)) 64462306a36Sopenharmony_ci break; 64562306a36Sopenharmony_ci } 64662306a36Sopenharmony_ci 64762306a36Sopenharmony_ci /* clear tag if cow fork is emptied */ 64862306a36Sopenharmony_ci if (!ifp->if_bytes) 64962306a36Sopenharmony_ci xfs_inode_clear_cowblocks_tag(ip); 65062306a36Sopenharmony_ci return error; 65162306a36Sopenharmony_ci} 65262306a36Sopenharmony_ci 65362306a36Sopenharmony_ci/* 65462306a36Sopenharmony_ci * Cancel CoW reservations for some byte range of an inode. 
65562306a36Sopenharmony_ci * 65662306a36Sopenharmony_ci * If cancel_real is true this function cancels all COW fork extents for the 65762306a36Sopenharmony_ci * inode; if cancel_real is false, real extents are not cleared. 65862306a36Sopenharmony_ci */ 65962306a36Sopenharmony_ciint 66062306a36Sopenharmony_cixfs_reflink_cancel_cow_range( 66162306a36Sopenharmony_ci struct xfs_inode *ip, 66262306a36Sopenharmony_ci xfs_off_t offset, 66362306a36Sopenharmony_ci xfs_off_t count, 66462306a36Sopenharmony_ci bool cancel_real) 66562306a36Sopenharmony_ci{ 66662306a36Sopenharmony_ci struct xfs_trans *tp; 66762306a36Sopenharmony_ci xfs_fileoff_t offset_fsb; 66862306a36Sopenharmony_ci xfs_fileoff_t end_fsb; 66962306a36Sopenharmony_ci int error; 67062306a36Sopenharmony_ci 67162306a36Sopenharmony_ci trace_xfs_reflink_cancel_cow_range(ip, offset, count); 67262306a36Sopenharmony_ci ASSERT(ip->i_cowfp); 67362306a36Sopenharmony_ci 67462306a36Sopenharmony_ci offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 67562306a36Sopenharmony_ci if (count == NULLFILEOFF) 67662306a36Sopenharmony_ci end_fsb = NULLFILEOFF; 67762306a36Sopenharmony_ci else 67862306a36Sopenharmony_ci end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 67962306a36Sopenharmony_ci 68062306a36Sopenharmony_ci /* Start a rolling transaction to remove the mappings */ 68162306a36Sopenharmony_ci error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, 68262306a36Sopenharmony_ci 0, 0, 0, &tp); 68362306a36Sopenharmony_ci if (error) 68462306a36Sopenharmony_ci goto out; 68562306a36Sopenharmony_ci 68662306a36Sopenharmony_ci xfs_ilock(ip, XFS_ILOCK_EXCL); 68762306a36Sopenharmony_ci xfs_trans_ijoin(tp, ip, 0); 68862306a36Sopenharmony_ci 68962306a36Sopenharmony_ci /* Scrape out the old CoW reservations */ 69062306a36Sopenharmony_ci error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, 69162306a36Sopenharmony_ci cancel_real); 69262306a36Sopenharmony_ci if (error) 69362306a36Sopenharmony_ci goto out_cancel; 
69462306a36Sopenharmony_ci 69562306a36Sopenharmony_ci error = xfs_trans_commit(tp); 69662306a36Sopenharmony_ci 69762306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 69862306a36Sopenharmony_ci return error; 69962306a36Sopenharmony_ci 70062306a36Sopenharmony_ciout_cancel: 70162306a36Sopenharmony_ci xfs_trans_cancel(tp); 70262306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 70362306a36Sopenharmony_ciout: 70462306a36Sopenharmony_ci trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); 70562306a36Sopenharmony_ci return error; 70662306a36Sopenharmony_ci} 70762306a36Sopenharmony_ci 70862306a36Sopenharmony_ci/* 70962306a36Sopenharmony_ci * Remap part of the CoW fork into the data fork. 71062306a36Sopenharmony_ci * 71162306a36Sopenharmony_ci * We aim to remap the range starting at @offset_fsb and ending at @end_fsb 71262306a36Sopenharmony_ci * into the data fork; this function will remap what it can (at the end of the 71362306a36Sopenharmony_ci * range) and update @end_fsb appropriately. Each remap gets its own 71462306a36Sopenharmony_ci * transaction because we can end up merging and splitting bmbt blocks for 71562306a36Sopenharmony_ci * every remap operation and we'd like to keep the block reservation 71662306a36Sopenharmony_ci * requirements as low as possible. 
71762306a36Sopenharmony_ci */ 71862306a36Sopenharmony_ciSTATIC int 71962306a36Sopenharmony_cixfs_reflink_end_cow_extent( 72062306a36Sopenharmony_ci struct xfs_inode *ip, 72162306a36Sopenharmony_ci xfs_fileoff_t *offset_fsb, 72262306a36Sopenharmony_ci xfs_fileoff_t end_fsb) 72362306a36Sopenharmony_ci{ 72462306a36Sopenharmony_ci struct xfs_iext_cursor icur; 72562306a36Sopenharmony_ci struct xfs_bmbt_irec got, del, data; 72662306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 72762306a36Sopenharmony_ci struct xfs_trans *tp; 72862306a36Sopenharmony_ci struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); 72962306a36Sopenharmony_ci unsigned int resblks; 73062306a36Sopenharmony_ci int nmaps; 73162306a36Sopenharmony_ci int error; 73262306a36Sopenharmony_ci 73362306a36Sopenharmony_ci /* No COW extents? That's easy! */ 73462306a36Sopenharmony_ci if (ifp->if_bytes == 0) { 73562306a36Sopenharmony_ci *offset_fsb = end_fsb; 73662306a36Sopenharmony_ci return 0; 73762306a36Sopenharmony_ci } 73862306a36Sopenharmony_ci 73962306a36Sopenharmony_ci resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 74062306a36Sopenharmony_ci error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 74162306a36Sopenharmony_ci XFS_TRANS_RESERVE, &tp); 74262306a36Sopenharmony_ci if (error) 74362306a36Sopenharmony_ci return error; 74462306a36Sopenharmony_ci 74562306a36Sopenharmony_ci /* 74662306a36Sopenharmony_ci * Lock the inode. We have to ijoin without automatic unlock because 74762306a36Sopenharmony_ci * the lead transaction is the refcountbt record deletion; the data 74862306a36Sopenharmony_ci * fork update follows as a deferred log item. 
74962306a36Sopenharmony_ci */ 75062306a36Sopenharmony_ci xfs_ilock(ip, XFS_ILOCK_EXCL); 75162306a36Sopenharmony_ci xfs_trans_ijoin(tp, ip, 0); 75262306a36Sopenharmony_ci 75362306a36Sopenharmony_ci error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, 75462306a36Sopenharmony_ci XFS_IEXT_REFLINK_END_COW_CNT); 75562306a36Sopenharmony_ci if (error == -EFBIG) 75662306a36Sopenharmony_ci error = xfs_iext_count_upgrade(tp, ip, 75762306a36Sopenharmony_ci XFS_IEXT_REFLINK_END_COW_CNT); 75862306a36Sopenharmony_ci if (error) 75962306a36Sopenharmony_ci goto out_cancel; 76062306a36Sopenharmony_ci 76162306a36Sopenharmony_ci /* 76262306a36Sopenharmony_ci * In case of racing, overlapping AIO writes no COW extents might be 76362306a36Sopenharmony_ci * left by the time I/O completes for the loser of the race. In that 76462306a36Sopenharmony_ci * case we are done. 76562306a36Sopenharmony_ci */ 76662306a36Sopenharmony_ci if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || 76762306a36Sopenharmony_ci got.br_startoff >= end_fsb) { 76862306a36Sopenharmony_ci *offset_fsb = end_fsb; 76962306a36Sopenharmony_ci goto out_cancel; 77062306a36Sopenharmony_ci } 77162306a36Sopenharmony_ci 77262306a36Sopenharmony_ci /* 77362306a36Sopenharmony_ci * Only remap real extents that contain data. With AIO, speculative 77462306a36Sopenharmony_ci * preallocations can leak into the range we are called upon, and we 77562306a36Sopenharmony_ci * need to skip them. Preserve @got for the eventual CoW fork 77662306a36Sopenharmony_ci * deletion; from now on @del represents the mapping that we're 77762306a36Sopenharmony_ci * actually remapping. 
77862306a36Sopenharmony_ci */ 77962306a36Sopenharmony_ci while (!xfs_bmap_is_written_extent(&got)) { 78062306a36Sopenharmony_ci if (!xfs_iext_next_extent(ifp, &icur, &got) || 78162306a36Sopenharmony_ci got.br_startoff >= end_fsb) { 78262306a36Sopenharmony_ci *offset_fsb = end_fsb; 78362306a36Sopenharmony_ci goto out_cancel; 78462306a36Sopenharmony_ci } 78562306a36Sopenharmony_ci } 78662306a36Sopenharmony_ci del = got; 78762306a36Sopenharmony_ci xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); 78862306a36Sopenharmony_ci 78962306a36Sopenharmony_ci /* Grab the corresponding mapping in the data fork. */ 79062306a36Sopenharmony_ci nmaps = 1; 79162306a36Sopenharmony_ci error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, 79262306a36Sopenharmony_ci &nmaps, 0); 79362306a36Sopenharmony_ci if (error) 79462306a36Sopenharmony_ci goto out_cancel; 79562306a36Sopenharmony_ci 79662306a36Sopenharmony_ci /* We can only remap the smaller of the two extent sizes. */ 79762306a36Sopenharmony_ci data.br_blockcount = min(data.br_blockcount, del.br_blockcount); 79862306a36Sopenharmony_ci del.br_blockcount = data.br_blockcount; 79962306a36Sopenharmony_ci 80062306a36Sopenharmony_ci trace_xfs_reflink_cow_remap_from(ip, &del); 80162306a36Sopenharmony_ci trace_xfs_reflink_cow_remap_to(ip, &data); 80262306a36Sopenharmony_ci 80362306a36Sopenharmony_ci if (xfs_bmap_is_real_extent(&data)) { 80462306a36Sopenharmony_ci /* 80562306a36Sopenharmony_ci * If the extent we're remapping is backed by storage (written 80662306a36Sopenharmony_ci * or not), unmap the extent and drop its refcount. 
80762306a36Sopenharmony_ci */ 80862306a36Sopenharmony_ci xfs_bmap_unmap_extent(tp, ip, &data); 80962306a36Sopenharmony_ci xfs_refcount_decrease_extent(tp, &data); 81062306a36Sopenharmony_ci xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 81162306a36Sopenharmony_ci -data.br_blockcount); 81262306a36Sopenharmony_ci } else if (data.br_startblock == DELAYSTARTBLOCK) { 81362306a36Sopenharmony_ci int done; 81462306a36Sopenharmony_ci 81562306a36Sopenharmony_ci /* 81662306a36Sopenharmony_ci * If the extent we're remapping is a delalloc reservation, 81762306a36Sopenharmony_ci * we can use the regular bunmapi function to release the 81862306a36Sopenharmony_ci * incore state. Dropping the delalloc reservation takes care 81962306a36Sopenharmony_ci * of the quota reservation for us. 82062306a36Sopenharmony_ci */ 82162306a36Sopenharmony_ci error = xfs_bunmapi(NULL, ip, data.br_startoff, 82262306a36Sopenharmony_ci data.br_blockcount, 0, 1, &done); 82362306a36Sopenharmony_ci if (error) 82462306a36Sopenharmony_ci goto out_cancel; 82562306a36Sopenharmony_ci ASSERT(done); 82662306a36Sopenharmony_ci } 82762306a36Sopenharmony_ci 82862306a36Sopenharmony_ci /* Free the CoW orphan record. */ 82962306a36Sopenharmony_ci xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); 83062306a36Sopenharmony_ci 83162306a36Sopenharmony_ci /* Map the new blocks into the data fork. */ 83262306a36Sopenharmony_ci xfs_bmap_map_extent(tp, ip, &del); 83362306a36Sopenharmony_ci 83462306a36Sopenharmony_ci /* Charge this new data fork mapping to the on-disk quota. */ 83562306a36Sopenharmony_ci xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, 83662306a36Sopenharmony_ci (long)del.br_blockcount); 83762306a36Sopenharmony_ci 83862306a36Sopenharmony_ci /* Remove the mapping from the CoW fork. 
*/ 83962306a36Sopenharmony_ci xfs_bmap_del_extent_cow(ip, &icur, &got, &del); 84062306a36Sopenharmony_ci 84162306a36Sopenharmony_ci error = xfs_trans_commit(tp); 84262306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 84362306a36Sopenharmony_ci if (error) 84462306a36Sopenharmony_ci return error; 84562306a36Sopenharmony_ci 84662306a36Sopenharmony_ci /* Update the caller about how much progress we made. */ 84762306a36Sopenharmony_ci *offset_fsb = del.br_startoff + del.br_blockcount; 84862306a36Sopenharmony_ci return 0; 84962306a36Sopenharmony_ci 85062306a36Sopenharmony_ciout_cancel: 85162306a36Sopenharmony_ci xfs_trans_cancel(tp); 85262306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 85362306a36Sopenharmony_ci return error; 85462306a36Sopenharmony_ci} 85562306a36Sopenharmony_ci 85662306a36Sopenharmony_ci/* 85762306a36Sopenharmony_ci * Remap parts of a file's data fork after a successful CoW. 85862306a36Sopenharmony_ci */ 85962306a36Sopenharmony_ciint 86062306a36Sopenharmony_cixfs_reflink_end_cow( 86162306a36Sopenharmony_ci struct xfs_inode *ip, 86262306a36Sopenharmony_ci xfs_off_t offset, 86362306a36Sopenharmony_ci xfs_off_t count) 86462306a36Sopenharmony_ci{ 86562306a36Sopenharmony_ci xfs_fileoff_t offset_fsb; 86662306a36Sopenharmony_ci xfs_fileoff_t end_fsb; 86762306a36Sopenharmony_ci int error = 0; 86862306a36Sopenharmony_ci 86962306a36Sopenharmony_ci trace_xfs_reflink_end_cow(ip, offset, count); 87062306a36Sopenharmony_ci 87162306a36Sopenharmony_ci offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); 87262306a36Sopenharmony_ci end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); 87362306a36Sopenharmony_ci 87462306a36Sopenharmony_ci /* 87562306a36Sopenharmony_ci * Walk forwards until we've remapped the I/O range. The loop function 87662306a36Sopenharmony_ci * repeatedly cycles the ILOCK to allocate one transaction per remapped 87762306a36Sopenharmony_ci * extent. 
87862306a36Sopenharmony_ci * 87962306a36Sopenharmony_ci * If we're being called by writeback then the pages will still 88062306a36Sopenharmony_ci * have PageWriteback set, which prevents races with reflink remapping 88162306a36Sopenharmony_ci * and truncate. Reflink remapping prevents races with writeback by 88262306a36Sopenharmony_ci * taking the iolock and mmaplock before flushing the pages and 88362306a36Sopenharmony_ci * remapping, which means there won't be any further writeback or page 88462306a36Sopenharmony_ci * cache dirtying until the reflink completes. 88562306a36Sopenharmony_ci * 88662306a36Sopenharmony_ci * We should never have two threads issuing writeback for the same file 88762306a36Sopenharmony_ci * region. There are also have post-eof checks in the writeback 88862306a36Sopenharmony_ci * preparation code so that we don't bother writing out pages that are 88962306a36Sopenharmony_ci * about to be truncated. 89062306a36Sopenharmony_ci * 89162306a36Sopenharmony_ci * If we're being called as part of directio write completion, the dio 89262306a36Sopenharmony_ci * count is still elevated, which reflink and truncate will wait for. 89362306a36Sopenharmony_ci * Reflink remapping takes the iolock and mmaplock and waits for 89462306a36Sopenharmony_ci * pending dio to finish, which should prevent any directio until the 89562306a36Sopenharmony_ci * remap completes. Multiple concurrent directio writes to the same 89662306a36Sopenharmony_ci * region are handled by end_cow processing only occurring for the 89762306a36Sopenharmony_ci * threads which succeed; the outcome of multiple overlapping direct 89862306a36Sopenharmony_ci * writes is not well defined anyway. 
89962306a36Sopenharmony_ci * 90062306a36Sopenharmony_ci * It's possible that a buffered write and a direct write could collide 90162306a36Sopenharmony_ci * here (the buffered write stumbles in after the dio flushes and 90262306a36Sopenharmony_ci * invalidates the page cache and immediately queues writeback), but we 90362306a36Sopenharmony_ci * have never supported this 100%. If either disk write succeeds the 90462306a36Sopenharmony_ci * blocks will be remapped. 90562306a36Sopenharmony_ci */ 90662306a36Sopenharmony_ci while (end_fsb > offset_fsb && !error) 90762306a36Sopenharmony_ci error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb); 90862306a36Sopenharmony_ci 90962306a36Sopenharmony_ci if (error) 91062306a36Sopenharmony_ci trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); 91162306a36Sopenharmony_ci return error; 91262306a36Sopenharmony_ci} 91362306a36Sopenharmony_ci 91462306a36Sopenharmony_ci/* 91562306a36Sopenharmony_ci * Free all CoW staging blocks that are still referenced by the ondisk refcount 91662306a36Sopenharmony_ci * metadata. The ondisk metadata does not track which inode created the 91762306a36Sopenharmony_ci * staging extent, so callers must ensure that there are no cached inodes with 91862306a36Sopenharmony_ci * live CoW staging extents. 
91962306a36Sopenharmony_ci */ 92062306a36Sopenharmony_ciint 92162306a36Sopenharmony_cixfs_reflink_recover_cow( 92262306a36Sopenharmony_ci struct xfs_mount *mp) 92362306a36Sopenharmony_ci{ 92462306a36Sopenharmony_ci struct xfs_perag *pag; 92562306a36Sopenharmony_ci xfs_agnumber_t agno; 92662306a36Sopenharmony_ci int error = 0; 92762306a36Sopenharmony_ci 92862306a36Sopenharmony_ci if (!xfs_has_reflink(mp)) 92962306a36Sopenharmony_ci return 0; 93062306a36Sopenharmony_ci 93162306a36Sopenharmony_ci for_each_perag(mp, agno, pag) { 93262306a36Sopenharmony_ci error = xfs_refcount_recover_cow_leftovers(mp, pag); 93362306a36Sopenharmony_ci if (error) { 93462306a36Sopenharmony_ci xfs_perag_rele(pag); 93562306a36Sopenharmony_ci break; 93662306a36Sopenharmony_ci } 93762306a36Sopenharmony_ci } 93862306a36Sopenharmony_ci 93962306a36Sopenharmony_ci return error; 94062306a36Sopenharmony_ci} 94162306a36Sopenharmony_ci 94262306a36Sopenharmony_ci/* 94362306a36Sopenharmony_ci * Reflinking (Block) Ranges of Two Files Together 94462306a36Sopenharmony_ci * 94562306a36Sopenharmony_ci * First, ensure that the reflink flag is set on both inodes. The flag is an 94662306a36Sopenharmony_ci * optimization to avoid unnecessary refcount btree lookups in the write path. 94762306a36Sopenharmony_ci * 94862306a36Sopenharmony_ci * Now we can iteratively remap the range of extents (and holes) in src to the 94962306a36Sopenharmony_ci * corresponding ranges in dest. Let drange and srange denote the ranges of 95062306a36Sopenharmony_ci * logical blocks in dest and src touched by the reflink operation. 95162306a36Sopenharmony_ci * 95262306a36Sopenharmony_ci * While the length of drange is greater than zero, 95362306a36Sopenharmony_ci * - Read src's bmbt at the start of srange ("imap") 95462306a36Sopenharmony_ci * - If imap doesn't exist, make imap appear to start at the end of srange 95562306a36Sopenharmony_ci * with zero length. 
95662306a36Sopenharmony_ci * - If imap starts before srange, advance imap to start at srange. 95762306a36Sopenharmony_ci * - If imap goes beyond srange, truncate imap to end at the end of srange. 95862306a36Sopenharmony_ci * - Punch (imap start - srange start + imap len) blocks from dest at 95962306a36Sopenharmony_ci * offset (drange start). 96062306a36Sopenharmony_ci * - If imap points to a real range of pblks, 96162306a36Sopenharmony_ci * > Increase the refcount of the imap's pblks 96262306a36Sopenharmony_ci * > Map imap's pblks into dest at the offset 96362306a36Sopenharmony_ci * (drange start + imap start - srange start) 96462306a36Sopenharmony_ci * - Advance drange and srange by (imap start - srange start + imap len) 96562306a36Sopenharmony_ci * 96662306a36Sopenharmony_ci * Finally, if the reflink made dest longer, update both the in-core and 96762306a36Sopenharmony_ci * on-disk file sizes. 96862306a36Sopenharmony_ci * 96962306a36Sopenharmony_ci * ASCII Art Demonstration: 97062306a36Sopenharmony_ci * 97162306a36Sopenharmony_ci * Let's say we want to reflink this source file: 97262306a36Sopenharmony_ci * 97362306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS (src file) 97462306a36Sopenharmony_ci * <--------------------> 97562306a36Sopenharmony_ci * 97662306a36Sopenharmony_ci * into this destination file: 97762306a36Sopenharmony_ci * 97862306a36Sopenharmony_ci * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) 97962306a36Sopenharmony_ci * <--------------------> 98062306a36Sopenharmony_ci * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. 98162306a36Sopenharmony_ci * Observe that the range has different logical offsets in either file. 98262306a36Sopenharmony_ci * 98362306a36Sopenharmony_ci * Consider that the first extent in the source file doesn't line up with our 98462306a36Sopenharmony_ci * reflink range. 
Unmapping and remapping are separate operations, so we can 98562306a36Sopenharmony_ci * unmap more blocks from the destination file than we remap. 98662306a36Sopenharmony_ci * 98762306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS 98862306a36Sopenharmony_ci * <-------> 98962306a36Sopenharmony_ci * --DDDDD---------DDDDD--DDD 99062306a36Sopenharmony_ci * <-------> 99162306a36Sopenharmony_ci * 99262306a36Sopenharmony_ci * Now remap the source extent into the destination file: 99362306a36Sopenharmony_ci * 99462306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS 99562306a36Sopenharmony_ci * <-------> 99662306a36Sopenharmony_ci * --DDDDD--SSSSSSSDDDDD--DDD 99762306a36Sopenharmony_ci * <-------> 99862306a36Sopenharmony_ci * 99962306a36Sopenharmony_ci * Do likewise with the second hole and extent in our range. Holes in the 100062306a36Sopenharmony_ci * unmap range don't affect our operation. 100162306a36Sopenharmony_ci * 100262306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS 100362306a36Sopenharmony_ci * <----> 100462306a36Sopenharmony_ci * --DDDDD--SSSSSSS-SSSSS-DDD 100562306a36Sopenharmony_ci * <----> 100662306a36Sopenharmony_ci * 100762306a36Sopenharmony_ci * Finally, unmap and remap part of the third extent. This will increase the 100862306a36Sopenharmony_ci * size of the destination file. 100962306a36Sopenharmony_ci * 101062306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS 101162306a36Sopenharmony_ci * <-----> 101262306a36Sopenharmony_ci * --DDDDD--SSSSSSS-SSSSS----SSS 101362306a36Sopenharmony_ci * <-----> 101462306a36Sopenharmony_ci * 101562306a36Sopenharmony_ci * Once we update the destination file's i_size, we're done. 101662306a36Sopenharmony_ci */ 101762306a36Sopenharmony_ci 101862306a36Sopenharmony_ci/* 101962306a36Sopenharmony_ci * Ensure the reflink bit is set in both inodes. 
102062306a36Sopenharmony_ci */ 102162306a36Sopenharmony_ciSTATIC int 102262306a36Sopenharmony_cixfs_reflink_set_inode_flag( 102362306a36Sopenharmony_ci struct xfs_inode *src, 102462306a36Sopenharmony_ci struct xfs_inode *dest) 102562306a36Sopenharmony_ci{ 102662306a36Sopenharmony_ci struct xfs_mount *mp = src->i_mount; 102762306a36Sopenharmony_ci int error; 102862306a36Sopenharmony_ci struct xfs_trans *tp; 102962306a36Sopenharmony_ci 103062306a36Sopenharmony_ci if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) 103162306a36Sopenharmony_ci return 0; 103262306a36Sopenharmony_ci 103362306a36Sopenharmony_ci error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 103462306a36Sopenharmony_ci if (error) 103562306a36Sopenharmony_ci goto out_error; 103662306a36Sopenharmony_ci 103762306a36Sopenharmony_ci /* Lock both files against IO */ 103862306a36Sopenharmony_ci if (src->i_ino == dest->i_ino) 103962306a36Sopenharmony_ci xfs_ilock(src, XFS_ILOCK_EXCL); 104062306a36Sopenharmony_ci else 104162306a36Sopenharmony_ci xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL); 104262306a36Sopenharmony_ci 104362306a36Sopenharmony_ci if (!xfs_is_reflink_inode(src)) { 104462306a36Sopenharmony_ci trace_xfs_reflink_set_inode_flag(src); 104562306a36Sopenharmony_ci xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); 104662306a36Sopenharmony_ci src->i_diflags2 |= XFS_DIFLAG2_REFLINK; 104762306a36Sopenharmony_ci xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); 104862306a36Sopenharmony_ci xfs_ifork_init_cow(src); 104962306a36Sopenharmony_ci } else 105062306a36Sopenharmony_ci xfs_iunlock(src, XFS_ILOCK_EXCL); 105162306a36Sopenharmony_ci 105262306a36Sopenharmony_ci if (src->i_ino == dest->i_ino) 105362306a36Sopenharmony_ci goto commit_flags; 105462306a36Sopenharmony_ci 105562306a36Sopenharmony_ci if (!xfs_is_reflink_inode(dest)) { 105662306a36Sopenharmony_ci trace_xfs_reflink_set_inode_flag(dest); 105762306a36Sopenharmony_ci xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 
105862306a36Sopenharmony_ci dest->i_diflags2 |= XFS_DIFLAG2_REFLINK; 105962306a36Sopenharmony_ci xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 106062306a36Sopenharmony_ci xfs_ifork_init_cow(dest); 106162306a36Sopenharmony_ci } else 106262306a36Sopenharmony_ci xfs_iunlock(dest, XFS_ILOCK_EXCL); 106362306a36Sopenharmony_ci 106462306a36Sopenharmony_cicommit_flags: 106562306a36Sopenharmony_ci error = xfs_trans_commit(tp); 106662306a36Sopenharmony_ci if (error) 106762306a36Sopenharmony_ci goto out_error; 106862306a36Sopenharmony_ci return error; 106962306a36Sopenharmony_ci 107062306a36Sopenharmony_ciout_error: 107162306a36Sopenharmony_ci trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); 107262306a36Sopenharmony_ci return error; 107362306a36Sopenharmony_ci} 107462306a36Sopenharmony_ci 107562306a36Sopenharmony_ci/* 107662306a36Sopenharmony_ci * Update destination inode size & cowextsize hint, if necessary. 107762306a36Sopenharmony_ci */ 107862306a36Sopenharmony_ciint 107962306a36Sopenharmony_cixfs_reflink_update_dest( 108062306a36Sopenharmony_ci struct xfs_inode *dest, 108162306a36Sopenharmony_ci xfs_off_t newlen, 108262306a36Sopenharmony_ci xfs_extlen_t cowextsize, 108362306a36Sopenharmony_ci unsigned int remap_flags) 108462306a36Sopenharmony_ci{ 108562306a36Sopenharmony_ci struct xfs_mount *mp = dest->i_mount; 108662306a36Sopenharmony_ci struct xfs_trans *tp; 108762306a36Sopenharmony_ci int error; 108862306a36Sopenharmony_ci 108962306a36Sopenharmony_ci if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) 109062306a36Sopenharmony_ci return 0; 109162306a36Sopenharmony_ci 109262306a36Sopenharmony_ci error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); 109362306a36Sopenharmony_ci if (error) 109462306a36Sopenharmony_ci goto out_error; 109562306a36Sopenharmony_ci 109662306a36Sopenharmony_ci xfs_ilock(dest, XFS_ILOCK_EXCL); 109762306a36Sopenharmony_ci xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); 109862306a36Sopenharmony_ci 
109962306a36Sopenharmony_ci if (newlen > i_size_read(VFS_I(dest))) { 110062306a36Sopenharmony_ci trace_xfs_reflink_update_inode_size(dest, newlen); 110162306a36Sopenharmony_ci i_size_write(VFS_I(dest), newlen); 110262306a36Sopenharmony_ci dest->i_disk_size = newlen; 110362306a36Sopenharmony_ci } 110462306a36Sopenharmony_ci 110562306a36Sopenharmony_ci if (cowextsize) { 110662306a36Sopenharmony_ci dest->i_cowextsize = cowextsize; 110762306a36Sopenharmony_ci dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE; 110862306a36Sopenharmony_ci } 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); 111162306a36Sopenharmony_ci 111262306a36Sopenharmony_ci error = xfs_trans_commit(tp); 111362306a36Sopenharmony_ci if (error) 111462306a36Sopenharmony_ci goto out_error; 111562306a36Sopenharmony_ci return error; 111662306a36Sopenharmony_ci 111762306a36Sopenharmony_ciout_error: 111862306a36Sopenharmony_ci trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); 111962306a36Sopenharmony_ci return error; 112062306a36Sopenharmony_ci} 112162306a36Sopenharmony_ci 112262306a36Sopenharmony_ci/* 112362306a36Sopenharmony_ci * Do we have enough reserve in this AG to handle a reflink? The refcount 112462306a36Sopenharmony_ci * btree already reserved all the space it needs, but the rmap btree can grow 112562306a36Sopenharmony_ci * infinitely, so we won't allow more reflinks when the AG is down to the 112662306a36Sopenharmony_ci * btree reserves. 
112762306a36Sopenharmony_ci */ 112862306a36Sopenharmony_cistatic int 112962306a36Sopenharmony_cixfs_reflink_ag_has_free_space( 113062306a36Sopenharmony_ci struct xfs_mount *mp, 113162306a36Sopenharmony_ci xfs_agnumber_t agno) 113262306a36Sopenharmony_ci{ 113362306a36Sopenharmony_ci struct xfs_perag *pag; 113462306a36Sopenharmony_ci int error = 0; 113562306a36Sopenharmony_ci 113662306a36Sopenharmony_ci if (!xfs_has_rmapbt(mp)) 113762306a36Sopenharmony_ci return 0; 113862306a36Sopenharmony_ci 113962306a36Sopenharmony_ci pag = xfs_perag_get(mp, agno); 114062306a36Sopenharmony_ci if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || 114162306a36Sopenharmony_ci xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) 114262306a36Sopenharmony_ci error = -ENOSPC; 114362306a36Sopenharmony_ci xfs_perag_put(pag); 114462306a36Sopenharmony_ci return error; 114562306a36Sopenharmony_ci} 114662306a36Sopenharmony_ci 114762306a36Sopenharmony_ci/* 114862306a36Sopenharmony_ci * Remap the given extent into the file. The dmap blockcount will be set to 114962306a36Sopenharmony_ci * the number of blocks that were actually remapped. 
115062306a36Sopenharmony_ci */ 115162306a36Sopenharmony_ciSTATIC int 115262306a36Sopenharmony_cixfs_reflink_remap_extent( 115362306a36Sopenharmony_ci struct xfs_inode *ip, 115462306a36Sopenharmony_ci struct xfs_bmbt_irec *dmap, 115562306a36Sopenharmony_ci xfs_off_t new_isize) 115662306a36Sopenharmony_ci{ 115762306a36Sopenharmony_ci struct xfs_bmbt_irec smap; 115862306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 115962306a36Sopenharmony_ci struct xfs_trans *tp; 116062306a36Sopenharmony_ci xfs_off_t newlen; 116162306a36Sopenharmony_ci int64_t qdelta = 0; 116262306a36Sopenharmony_ci unsigned int resblks; 116362306a36Sopenharmony_ci bool quota_reserved = true; 116462306a36Sopenharmony_ci bool smap_real; 116562306a36Sopenharmony_ci bool dmap_written = xfs_bmap_is_written_extent(dmap); 116662306a36Sopenharmony_ci int iext_delta = 0; 116762306a36Sopenharmony_ci int nimaps; 116862306a36Sopenharmony_ci int error; 116962306a36Sopenharmony_ci 117062306a36Sopenharmony_ci /* 117162306a36Sopenharmony_ci * Start a rolling transaction to switch the mappings. 117262306a36Sopenharmony_ci * 117362306a36Sopenharmony_ci * Adding a written extent to the extent map can cause a bmbt split, 117462306a36Sopenharmony_ci * and removing a mapped extent from the extent can cause a bmbt split. 117562306a36Sopenharmony_ci * The two operations cannot both cause a split since they operate on 117662306a36Sopenharmony_ci * the same index in the bmap btree, so we only need a reservation for 117762306a36Sopenharmony_ci * one bmbt split if either thing is happening. However, we haven't 117862306a36Sopenharmony_ci * locked the inode yet, so we reserve assuming this is the case. 117962306a36Sopenharmony_ci * 118062306a36Sopenharmony_ci * The first allocation call tries to reserve enough space to handle 118162306a36Sopenharmony_ci * mapping dmap into a sparse part of the file plus the bmbt split. 
We 118262306a36Sopenharmony_ci * haven't locked the inode or read the existing mapping yet, so we do 118362306a36Sopenharmony_ci * not know for sure that we need the space. This should succeed most 118462306a36Sopenharmony_ci * of the time. 118562306a36Sopenharmony_ci * 118662306a36Sopenharmony_ci * If the first attempt fails, try again but reserving only enough 118762306a36Sopenharmony_ci * space to handle a bmbt split. This is the hard minimum requirement, 118862306a36Sopenharmony_ci * and we revisit quota reservations later when we know more about what 118962306a36Sopenharmony_ci * we're remapping. 119062306a36Sopenharmony_ci */ 119162306a36Sopenharmony_ci resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 119262306a36Sopenharmony_ci error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 119362306a36Sopenharmony_ci resblks + dmap->br_blockcount, 0, false, &tp); 119462306a36Sopenharmony_ci if (error == -EDQUOT || error == -ENOSPC) { 119562306a36Sopenharmony_ci quota_reserved = false; 119662306a36Sopenharmony_ci error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 119762306a36Sopenharmony_ci resblks, 0, false, &tp); 119862306a36Sopenharmony_ci } 119962306a36Sopenharmony_ci if (error) 120062306a36Sopenharmony_ci goto out; 120162306a36Sopenharmony_ci 120262306a36Sopenharmony_ci /* 120362306a36Sopenharmony_ci * Read what's currently mapped in the destination file into smap. 120462306a36Sopenharmony_ci * If smap isn't a hole, we will have to remove it before we can add 120562306a36Sopenharmony_ci * dmap to the destination file. 
120662306a36Sopenharmony_ci */ 120762306a36Sopenharmony_ci nimaps = 1; 120862306a36Sopenharmony_ci error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount, 120962306a36Sopenharmony_ci &smap, &nimaps, 0); 121062306a36Sopenharmony_ci if (error) 121162306a36Sopenharmony_ci goto out_cancel; 121262306a36Sopenharmony_ci ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff); 121362306a36Sopenharmony_ci smap_real = xfs_bmap_is_real_extent(&smap); 121462306a36Sopenharmony_ci 121562306a36Sopenharmony_ci /* 121662306a36Sopenharmony_ci * We can only remap as many blocks as the smaller of the two extent 121762306a36Sopenharmony_ci * maps, because we can only remap one extent at a time. 121862306a36Sopenharmony_ci */ 121962306a36Sopenharmony_ci dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount); 122062306a36Sopenharmony_ci ASSERT(dmap->br_blockcount == smap.br_blockcount); 122162306a36Sopenharmony_ci 122262306a36Sopenharmony_ci trace_xfs_reflink_remap_extent_dest(ip, &smap); 122362306a36Sopenharmony_ci 122462306a36Sopenharmony_ci /* 122562306a36Sopenharmony_ci * Two extents mapped to the same physical block must not have 122662306a36Sopenharmony_ci * different states; that's filesystem corruption. Move on to the next 122762306a36Sopenharmony_ci * extent if they're both holes or both the same physical extent. 122862306a36Sopenharmony_ci */ 122962306a36Sopenharmony_ci if (dmap->br_startblock == smap.br_startblock) { 123062306a36Sopenharmony_ci if (dmap->br_state != smap.br_state) 123162306a36Sopenharmony_ci error = -EFSCORRUPTED; 123262306a36Sopenharmony_ci goto out_cancel; 123362306a36Sopenharmony_ci } 123462306a36Sopenharmony_ci 123562306a36Sopenharmony_ci /* If both extents are unwritten, leave them alone. 
*/ 123662306a36Sopenharmony_ci if (dmap->br_state == XFS_EXT_UNWRITTEN && 123762306a36Sopenharmony_ci smap.br_state == XFS_EXT_UNWRITTEN) 123862306a36Sopenharmony_ci goto out_cancel; 123962306a36Sopenharmony_ci 124062306a36Sopenharmony_ci /* No reflinking if the AG of the dest mapping is low on space. */ 124162306a36Sopenharmony_ci if (dmap_written) { 124262306a36Sopenharmony_ci error = xfs_reflink_ag_has_free_space(mp, 124362306a36Sopenharmony_ci XFS_FSB_TO_AGNO(mp, dmap->br_startblock)); 124462306a36Sopenharmony_ci if (error) 124562306a36Sopenharmony_ci goto out_cancel; 124662306a36Sopenharmony_ci } 124762306a36Sopenharmony_ci 124862306a36Sopenharmony_ci /* 124962306a36Sopenharmony_ci * Increase quota reservation if we think the quota block counter for 125062306a36Sopenharmony_ci * this file could increase. 125162306a36Sopenharmony_ci * 125262306a36Sopenharmony_ci * If we are mapping a written extent into the file, we need to have 125362306a36Sopenharmony_ci * enough quota block count reservation to handle the blocks in that 125462306a36Sopenharmony_ci * extent. We log only the delta to the quota block counts, so if the 125562306a36Sopenharmony_ci * extent we're unmapping also has blocks allocated to it, we don't 125662306a36Sopenharmony_ci * need a quota reservation for the extent itself. 125762306a36Sopenharmony_ci * 125862306a36Sopenharmony_ci * Note that if we're replacing a delalloc reservation with a written 125962306a36Sopenharmony_ci * extent, we have to take the full quota reservation because removing 126062306a36Sopenharmony_ci * the delalloc reservation gives the block count back to the quota 126162306a36Sopenharmony_ci * count. This is suboptimal, but the VFS flushed the dest range 126262306a36Sopenharmony_ci * before we started. That should have removed all the delalloc 126362306a36Sopenharmony_ci * reservations, but we code defensively. 
126462306a36Sopenharmony_ci * 126562306a36Sopenharmony_ci * xfs_trans_alloc_inode above already tried to grab an even larger 126662306a36Sopenharmony_ci * quota reservation, and kicked off a blockgc scan if it couldn't. 126762306a36Sopenharmony_ci * If we can't get a potentially smaller quota reservation now, we're 126862306a36Sopenharmony_ci * done. 126962306a36Sopenharmony_ci */ 127062306a36Sopenharmony_ci if (!quota_reserved && !smap_real && dmap_written) { 127162306a36Sopenharmony_ci error = xfs_trans_reserve_quota_nblks(tp, ip, 127262306a36Sopenharmony_ci dmap->br_blockcount, 0, false); 127362306a36Sopenharmony_ci if (error) 127462306a36Sopenharmony_ci goto out_cancel; 127562306a36Sopenharmony_ci } 127662306a36Sopenharmony_ci 127762306a36Sopenharmony_ci if (smap_real) 127862306a36Sopenharmony_ci ++iext_delta; 127962306a36Sopenharmony_ci 128062306a36Sopenharmony_ci if (dmap_written) 128162306a36Sopenharmony_ci ++iext_delta; 128262306a36Sopenharmony_ci 128362306a36Sopenharmony_ci error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta); 128462306a36Sopenharmony_ci if (error == -EFBIG) 128562306a36Sopenharmony_ci error = xfs_iext_count_upgrade(tp, ip, iext_delta); 128662306a36Sopenharmony_ci if (error) 128762306a36Sopenharmony_ci goto out_cancel; 128862306a36Sopenharmony_ci 128962306a36Sopenharmony_ci if (smap_real) { 129062306a36Sopenharmony_ci /* 129162306a36Sopenharmony_ci * If the extent we're unmapping is backed by storage (written 129262306a36Sopenharmony_ci * or not), unmap the extent and drop its refcount. 
129362306a36Sopenharmony_ci */ 129462306a36Sopenharmony_ci xfs_bmap_unmap_extent(tp, ip, &smap); 129562306a36Sopenharmony_ci xfs_refcount_decrease_extent(tp, &smap); 129662306a36Sopenharmony_ci qdelta -= smap.br_blockcount; 129762306a36Sopenharmony_ci } else if (smap.br_startblock == DELAYSTARTBLOCK) { 129862306a36Sopenharmony_ci int done; 129962306a36Sopenharmony_ci 130062306a36Sopenharmony_ci /* 130162306a36Sopenharmony_ci * If the extent we're unmapping is a delalloc reservation, 130262306a36Sopenharmony_ci * we can use the regular bunmapi function to release the 130362306a36Sopenharmony_ci * incore state. Dropping the delalloc reservation takes care 130462306a36Sopenharmony_ci * of the quota reservation for us. 130562306a36Sopenharmony_ci */ 130662306a36Sopenharmony_ci error = xfs_bunmapi(NULL, ip, smap.br_startoff, 130762306a36Sopenharmony_ci smap.br_blockcount, 0, 1, &done); 130862306a36Sopenharmony_ci if (error) 130962306a36Sopenharmony_ci goto out_cancel; 131062306a36Sopenharmony_ci ASSERT(done); 131162306a36Sopenharmony_ci } 131262306a36Sopenharmony_ci 131362306a36Sopenharmony_ci /* 131462306a36Sopenharmony_ci * If the extent we're sharing is backed by written storage, increase 131562306a36Sopenharmony_ci * its refcount and map it into the file. 131662306a36Sopenharmony_ci */ 131762306a36Sopenharmony_ci if (dmap_written) { 131862306a36Sopenharmony_ci xfs_refcount_increase_extent(tp, dmap); 131962306a36Sopenharmony_ci xfs_bmap_map_extent(tp, ip, dmap); 132062306a36Sopenharmony_ci qdelta += dmap->br_blockcount; 132162306a36Sopenharmony_ci } 132262306a36Sopenharmony_ci 132362306a36Sopenharmony_ci xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta); 132462306a36Sopenharmony_ci 132562306a36Sopenharmony_ci /* Update dest isize if needed. 
*/ 132662306a36Sopenharmony_ci newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount); 132762306a36Sopenharmony_ci newlen = min_t(xfs_off_t, newlen, new_isize); 132862306a36Sopenharmony_ci if (newlen > i_size_read(VFS_I(ip))) { 132962306a36Sopenharmony_ci trace_xfs_reflink_update_inode_size(ip, newlen); 133062306a36Sopenharmony_ci i_size_write(VFS_I(ip), newlen); 133162306a36Sopenharmony_ci ip->i_disk_size = newlen; 133262306a36Sopenharmony_ci xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 133362306a36Sopenharmony_ci } 133462306a36Sopenharmony_ci 133562306a36Sopenharmony_ci /* Commit everything and unlock. */ 133662306a36Sopenharmony_ci error = xfs_trans_commit(tp); 133762306a36Sopenharmony_ci goto out_unlock; 133862306a36Sopenharmony_ci 133962306a36Sopenharmony_ciout_cancel: 134062306a36Sopenharmony_ci xfs_trans_cancel(tp); 134162306a36Sopenharmony_ciout_unlock: 134262306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 134362306a36Sopenharmony_ciout: 134462306a36Sopenharmony_ci if (error) 134562306a36Sopenharmony_ci trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); 134662306a36Sopenharmony_ci return error; 134762306a36Sopenharmony_ci} 134862306a36Sopenharmony_ci 134962306a36Sopenharmony_ci/* Remap a range of one file to the other. 
*/ 135062306a36Sopenharmony_ciint 135162306a36Sopenharmony_cixfs_reflink_remap_blocks( 135262306a36Sopenharmony_ci struct xfs_inode *src, 135362306a36Sopenharmony_ci loff_t pos_in, 135462306a36Sopenharmony_ci struct xfs_inode *dest, 135562306a36Sopenharmony_ci loff_t pos_out, 135662306a36Sopenharmony_ci loff_t remap_len, 135762306a36Sopenharmony_ci loff_t *remapped) 135862306a36Sopenharmony_ci{ 135962306a36Sopenharmony_ci struct xfs_bmbt_irec imap; 136062306a36Sopenharmony_ci struct xfs_mount *mp = src->i_mount; 136162306a36Sopenharmony_ci xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in); 136262306a36Sopenharmony_ci xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out); 136362306a36Sopenharmony_ci xfs_filblks_t len; 136462306a36Sopenharmony_ci xfs_filblks_t remapped_len = 0; 136562306a36Sopenharmony_ci xfs_off_t new_isize = pos_out + remap_len; 136662306a36Sopenharmony_ci int nimaps; 136762306a36Sopenharmony_ci int error = 0; 136862306a36Sopenharmony_ci 136962306a36Sopenharmony_ci len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len), 137062306a36Sopenharmony_ci XFS_MAX_FILEOFF); 137162306a36Sopenharmony_ci 137262306a36Sopenharmony_ci trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff); 137362306a36Sopenharmony_ci 137462306a36Sopenharmony_ci while (len > 0) { 137562306a36Sopenharmony_ci unsigned int lock_mode; 137662306a36Sopenharmony_ci 137762306a36Sopenharmony_ci /* Read extent from the source file */ 137862306a36Sopenharmony_ci nimaps = 1; 137962306a36Sopenharmony_ci lock_mode = xfs_ilock_data_map_shared(src); 138062306a36Sopenharmony_ci error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); 138162306a36Sopenharmony_ci xfs_iunlock(src, lock_mode); 138262306a36Sopenharmony_ci if (error) 138362306a36Sopenharmony_ci break; 138462306a36Sopenharmony_ci /* 138562306a36Sopenharmony_ci * The caller supposedly flushed all dirty pages in the source 138662306a36Sopenharmony_ci * file range, which means that writeback should have allocated 
138762306a36Sopenharmony_ci * or deleted all delalloc reservations in that range. If we 138862306a36Sopenharmony_ci * find one, that's a good sign that something is seriously 138962306a36Sopenharmony_ci * wrong here. 139062306a36Sopenharmony_ci */ 139162306a36Sopenharmony_ci ASSERT(nimaps == 1 && imap.br_startoff == srcoff); 139262306a36Sopenharmony_ci if (imap.br_startblock == DELAYSTARTBLOCK) { 139362306a36Sopenharmony_ci ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 139462306a36Sopenharmony_ci error = -EFSCORRUPTED; 139562306a36Sopenharmony_ci break; 139662306a36Sopenharmony_ci } 139762306a36Sopenharmony_ci 139862306a36Sopenharmony_ci trace_xfs_reflink_remap_extent_src(src, &imap); 139962306a36Sopenharmony_ci 140062306a36Sopenharmony_ci /* Remap into the destination file at the given offset. */ 140162306a36Sopenharmony_ci imap.br_startoff = destoff; 140262306a36Sopenharmony_ci error = xfs_reflink_remap_extent(dest, &imap, new_isize); 140362306a36Sopenharmony_ci if (error) 140462306a36Sopenharmony_ci break; 140562306a36Sopenharmony_ci 140662306a36Sopenharmony_ci if (fatal_signal_pending(current)) { 140762306a36Sopenharmony_ci error = -EINTR; 140862306a36Sopenharmony_ci break; 140962306a36Sopenharmony_ci } 141062306a36Sopenharmony_ci 141162306a36Sopenharmony_ci /* Advance drange/srange */ 141262306a36Sopenharmony_ci srcoff += imap.br_blockcount; 141362306a36Sopenharmony_ci destoff += imap.br_blockcount; 141462306a36Sopenharmony_ci len -= imap.br_blockcount; 141562306a36Sopenharmony_ci remapped_len += imap.br_blockcount; 141662306a36Sopenharmony_ci } 141762306a36Sopenharmony_ci 141862306a36Sopenharmony_ci if (error) 141962306a36Sopenharmony_ci trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); 142062306a36Sopenharmony_ci *remapped = min_t(loff_t, remap_len, 142162306a36Sopenharmony_ci XFS_FSB_TO_B(src->i_mount, remapped_len)); 142262306a36Sopenharmony_ci return error; 142362306a36Sopenharmony_ci} 142462306a36Sopenharmony_ci 142562306a36Sopenharmony_ci/* 
142662306a36Sopenharmony_ci * If we're reflinking to a point past the destination file's EOF, we must 142762306a36Sopenharmony_ci * zero any speculative post-EOF preallocations that sit between the old EOF 142862306a36Sopenharmony_ci * and the destination file offset. 142962306a36Sopenharmony_ci */ 143062306a36Sopenharmony_cistatic int 143162306a36Sopenharmony_cixfs_reflink_zero_posteof( 143262306a36Sopenharmony_ci struct xfs_inode *ip, 143362306a36Sopenharmony_ci loff_t pos) 143462306a36Sopenharmony_ci{ 143562306a36Sopenharmony_ci loff_t isize = i_size_read(VFS_I(ip)); 143662306a36Sopenharmony_ci 143762306a36Sopenharmony_ci if (pos <= isize) 143862306a36Sopenharmony_ci return 0; 143962306a36Sopenharmony_ci 144062306a36Sopenharmony_ci trace_xfs_zero_eof(ip, isize, pos - isize); 144162306a36Sopenharmony_ci return xfs_zero_range(ip, isize, pos - isize, NULL); 144262306a36Sopenharmony_ci} 144362306a36Sopenharmony_ci 144462306a36Sopenharmony_ci/* 144562306a36Sopenharmony_ci * Prepare two files for range cloning. Upon a successful return both inodes 144662306a36Sopenharmony_ci * will have the iolock and mmaplock held, the page cache of the out file will 144762306a36Sopenharmony_ci * be truncated, and any leases on the out file will have been broken. This 144862306a36Sopenharmony_ci * function borrows heavily from xfs_file_aio_write_checks. 144962306a36Sopenharmony_ci * 145062306a36Sopenharmony_ci * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't 145162306a36Sopenharmony_ci * checked that the bytes beyond EOF physically match. Hence we cannot use the 145262306a36Sopenharmony_ci * EOF block in the source dedupe range because it's not a complete block match, 145362306a36Sopenharmony_ci * hence can introduce a corruption into the file that has it's block replaced. 
145462306a36Sopenharmony_ci * 145562306a36Sopenharmony_ci * In similar fashion, the VFS file cloning also allows partial EOF blocks to be 145662306a36Sopenharmony_ci * "block aligned" for the purposes of cloning entire files. However, if the 145762306a36Sopenharmony_ci * source file range includes the EOF block and it lands within the existing EOF 145862306a36Sopenharmony_ci * of the destination file, then we can expose stale data from beyond the source 145962306a36Sopenharmony_ci * file EOF in the destination file. 146062306a36Sopenharmony_ci * 146162306a36Sopenharmony_ci * XFS doesn't support partial block sharing, so in both cases we have check 146262306a36Sopenharmony_ci * these cases ourselves. For dedupe, we can simply round the length to dedupe 146362306a36Sopenharmony_ci * down to the previous whole block and ignore the partial EOF block. While this 146462306a36Sopenharmony_ci * means we can't dedupe the last block of a file, this is an acceptible 146562306a36Sopenharmony_ci * tradeoff for simplicity on implementation. 146662306a36Sopenharmony_ci * 146762306a36Sopenharmony_ci * For cloning, we want to share the partial EOF block if it is also the new EOF 146862306a36Sopenharmony_ci * block of the destination file. If the partial EOF block lies inside the 146962306a36Sopenharmony_ci * existing destination EOF, then we have to abort the clone to avoid exposing 147062306a36Sopenharmony_ci * stale data in the destination file. Hence we reject these clone attempts with 147162306a36Sopenharmony_ci * -EINVAL in this case. 
147262306a36Sopenharmony_ci */ 147362306a36Sopenharmony_ciint 147462306a36Sopenharmony_cixfs_reflink_remap_prep( 147562306a36Sopenharmony_ci struct file *file_in, 147662306a36Sopenharmony_ci loff_t pos_in, 147762306a36Sopenharmony_ci struct file *file_out, 147862306a36Sopenharmony_ci loff_t pos_out, 147962306a36Sopenharmony_ci loff_t *len, 148062306a36Sopenharmony_ci unsigned int remap_flags) 148162306a36Sopenharmony_ci{ 148262306a36Sopenharmony_ci struct inode *inode_in = file_inode(file_in); 148362306a36Sopenharmony_ci struct xfs_inode *src = XFS_I(inode_in); 148462306a36Sopenharmony_ci struct inode *inode_out = file_inode(file_out); 148562306a36Sopenharmony_ci struct xfs_inode *dest = XFS_I(inode_out); 148662306a36Sopenharmony_ci int ret; 148762306a36Sopenharmony_ci 148862306a36Sopenharmony_ci /* Lock both files against IO */ 148962306a36Sopenharmony_ci ret = xfs_ilock2_io_mmap(src, dest); 149062306a36Sopenharmony_ci if (ret) 149162306a36Sopenharmony_ci return ret; 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci /* Check file eligibility and prepare for block sharing. */ 149462306a36Sopenharmony_ci ret = -EINVAL; 149562306a36Sopenharmony_ci /* Don't reflink realtime inodes */ 149662306a36Sopenharmony_ci if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) 149762306a36Sopenharmony_ci goto out_unlock; 149862306a36Sopenharmony_ci 149962306a36Sopenharmony_ci /* Don't share DAX file data with non-DAX file. 
*/ 150062306a36Sopenharmony_ci if (IS_DAX(inode_in) != IS_DAX(inode_out)) 150162306a36Sopenharmony_ci goto out_unlock; 150262306a36Sopenharmony_ci 150362306a36Sopenharmony_ci if (!IS_DAX(inode_in)) 150462306a36Sopenharmony_ci ret = generic_remap_file_range_prep(file_in, pos_in, file_out, 150562306a36Sopenharmony_ci pos_out, len, remap_flags); 150662306a36Sopenharmony_ci else 150762306a36Sopenharmony_ci ret = dax_remap_file_range_prep(file_in, pos_in, file_out, 150862306a36Sopenharmony_ci pos_out, len, remap_flags, &xfs_read_iomap_ops); 150962306a36Sopenharmony_ci if (ret || *len == 0) 151062306a36Sopenharmony_ci goto out_unlock; 151162306a36Sopenharmony_ci 151262306a36Sopenharmony_ci /* Attach dquots to dest inode before changing block map */ 151362306a36Sopenharmony_ci ret = xfs_qm_dqattach(dest); 151462306a36Sopenharmony_ci if (ret) 151562306a36Sopenharmony_ci goto out_unlock; 151662306a36Sopenharmony_ci 151762306a36Sopenharmony_ci /* 151862306a36Sopenharmony_ci * Zero existing post-eof speculative preallocations in the destination 151962306a36Sopenharmony_ci * file. 152062306a36Sopenharmony_ci */ 152162306a36Sopenharmony_ci ret = xfs_reflink_zero_posteof(dest, pos_out); 152262306a36Sopenharmony_ci if (ret) 152362306a36Sopenharmony_ci goto out_unlock; 152462306a36Sopenharmony_ci 152562306a36Sopenharmony_ci /* Set flags and remap blocks. */ 152662306a36Sopenharmony_ci ret = xfs_reflink_set_inode_flag(src, dest); 152762306a36Sopenharmony_ci if (ret) 152862306a36Sopenharmony_ci goto out_unlock; 152962306a36Sopenharmony_ci 153062306a36Sopenharmony_ci /* 153162306a36Sopenharmony_ci * If pos_out > EOF, we may have dirtied blocks between EOF and 153262306a36Sopenharmony_ci * pos_out. In that case, we need to extend the flush and unmap to cover 153362306a36Sopenharmony_ci * from EOF to the end of the copy length. 
153462306a36Sopenharmony_ci */ 153562306a36Sopenharmony_ci if (pos_out > XFS_ISIZE(dest)) { 153662306a36Sopenharmony_ci loff_t flen = *len + (pos_out - XFS_ISIZE(dest)); 153762306a36Sopenharmony_ci ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen); 153862306a36Sopenharmony_ci } else { 153962306a36Sopenharmony_ci ret = xfs_flush_unmap_range(dest, pos_out, *len); 154062306a36Sopenharmony_ci } 154162306a36Sopenharmony_ci if (ret) 154262306a36Sopenharmony_ci goto out_unlock; 154362306a36Sopenharmony_ci 154462306a36Sopenharmony_ci xfs_iflags_set(src, XFS_IREMAPPING); 154562306a36Sopenharmony_ci if (inode_in != inode_out) 154662306a36Sopenharmony_ci xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); 154762306a36Sopenharmony_ci 154862306a36Sopenharmony_ci return 0; 154962306a36Sopenharmony_ciout_unlock: 155062306a36Sopenharmony_ci xfs_iunlock2_io_mmap(src, dest); 155162306a36Sopenharmony_ci return ret; 155262306a36Sopenharmony_ci} 155362306a36Sopenharmony_ci 155462306a36Sopenharmony_ci/* Does this inode need the reflink flag? 
*/ 155562306a36Sopenharmony_ciint 155662306a36Sopenharmony_cixfs_reflink_inode_has_shared_extents( 155762306a36Sopenharmony_ci struct xfs_trans *tp, 155862306a36Sopenharmony_ci struct xfs_inode *ip, 155962306a36Sopenharmony_ci bool *has_shared) 156062306a36Sopenharmony_ci{ 156162306a36Sopenharmony_ci struct xfs_bmbt_irec got; 156262306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 156362306a36Sopenharmony_ci struct xfs_ifork *ifp; 156462306a36Sopenharmony_ci struct xfs_iext_cursor icur; 156562306a36Sopenharmony_ci bool found; 156662306a36Sopenharmony_ci int error; 156762306a36Sopenharmony_ci 156862306a36Sopenharmony_ci ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); 156962306a36Sopenharmony_ci error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); 157062306a36Sopenharmony_ci if (error) 157162306a36Sopenharmony_ci return error; 157262306a36Sopenharmony_ci 157362306a36Sopenharmony_ci *has_shared = false; 157462306a36Sopenharmony_ci found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); 157562306a36Sopenharmony_ci while (found) { 157662306a36Sopenharmony_ci struct xfs_perag *pag; 157762306a36Sopenharmony_ci xfs_agblock_t agbno; 157862306a36Sopenharmony_ci xfs_extlen_t aglen; 157962306a36Sopenharmony_ci xfs_agblock_t rbno; 158062306a36Sopenharmony_ci xfs_extlen_t rlen; 158162306a36Sopenharmony_ci 158262306a36Sopenharmony_ci if (isnullstartblock(got.br_startblock) || 158362306a36Sopenharmony_ci got.br_state != XFS_EXT_NORM) 158462306a36Sopenharmony_ci goto next; 158562306a36Sopenharmony_ci 158662306a36Sopenharmony_ci pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); 158762306a36Sopenharmony_ci agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); 158862306a36Sopenharmony_ci aglen = got.br_blockcount; 158962306a36Sopenharmony_ci error = xfs_reflink_find_shared(pag, tp, agbno, aglen, 159062306a36Sopenharmony_ci &rbno, &rlen, false); 159162306a36Sopenharmony_ci xfs_perag_put(pag); 159262306a36Sopenharmony_ci if (error) 159362306a36Sopenharmony_ci return error; 
159462306a36Sopenharmony_ci 159562306a36Sopenharmony_ci /* Is there still a shared block here? */ 159662306a36Sopenharmony_ci if (rbno != NULLAGBLOCK) { 159762306a36Sopenharmony_ci *has_shared = true; 159862306a36Sopenharmony_ci return 0; 159962306a36Sopenharmony_ci } 160062306a36Sopenharmony_cinext: 160162306a36Sopenharmony_ci found = xfs_iext_next_extent(ifp, &icur, &got); 160262306a36Sopenharmony_ci } 160362306a36Sopenharmony_ci 160462306a36Sopenharmony_ci return 0; 160562306a36Sopenharmony_ci} 160662306a36Sopenharmony_ci 160762306a36Sopenharmony_ci/* 160862306a36Sopenharmony_ci * Clear the inode reflink flag if there are no shared extents. 160962306a36Sopenharmony_ci * 161062306a36Sopenharmony_ci * The caller is responsible for joining the inode to the transaction passed in. 161162306a36Sopenharmony_ci * The inode will be joined to the transaction that is returned to the caller. 161262306a36Sopenharmony_ci */ 161362306a36Sopenharmony_ciint 161462306a36Sopenharmony_cixfs_reflink_clear_inode_flag( 161562306a36Sopenharmony_ci struct xfs_inode *ip, 161662306a36Sopenharmony_ci struct xfs_trans **tpp) 161762306a36Sopenharmony_ci{ 161862306a36Sopenharmony_ci bool needs_flag; 161962306a36Sopenharmony_ci int error = 0; 162062306a36Sopenharmony_ci 162162306a36Sopenharmony_ci ASSERT(xfs_is_reflink_inode(ip)); 162262306a36Sopenharmony_ci 162362306a36Sopenharmony_ci error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); 162462306a36Sopenharmony_ci if (error || needs_flag) 162562306a36Sopenharmony_ci return error; 162662306a36Sopenharmony_ci 162762306a36Sopenharmony_ci /* 162862306a36Sopenharmony_ci * We didn't find any shared blocks so turn off the reflink flag. 162962306a36Sopenharmony_ci * First, get rid of any leftover CoW mappings. 
163062306a36Sopenharmony_ci */ 163162306a36Sopenharmony_ci error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF, 163262306a36Sopenharmony_ci true); 163362306a36Sopenharmony_ci if (error) 163462306a36Sopenharmony_ci return error; 163562306a36Sopenharmony_ci 163662306a36Sopenharmony_ci /* Clear the inode flag. */ 163762306a36Sopenharmony_ci trace_xfs_reflink_unset_inode_flag(ip); 163862306a36Sopenharmony_ci ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 163962306a36Sopenharmony_ci xfs_inode_clear_cowblocks_tag(ip); 164062306a36Sopenharmony_ci xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); 164162306a36Sopenharmony_ci 164262306a36Sopenharmony_ci return error; 164362306a36Sopenharmony_ci} 164462306a36Sopenharmony_ci 164562306a36Sopenharmony_ci/* 164662306a36Sopenharmony_ci * Clear the inode reflink flag if there are no shared extents and the size 164762306a36Sopenharmony_ci * hasn't changed. 164862306a36Sopenharmony_ci */ 164962306a36Sopenharmony_ciSTATIC int 165062306a36Sopenharmony_cixfs_reflink_try_clear_inode_flag( 165162306a36Sopenharmony_ci struct xfs_inode *ip) 165262306a36Sopenharmony_ci{ 165362306a36Sopenharmony_ci struct xfs_mount *mp = ip->i_mount; 165462306a36Sopenharmony_ci struct xfs_trans *tp; 165562306a36Sopenharmony_ci int error = 0; 165662306a36Sopenharmony_ci 165762306a36Sopenharmony_ci /* Start a rolling transaction to remove the mappings */ 165862306a36Sopenharmony_ci error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); 165962306a36Sopenharmony_ci if (error) 166062306a36Sopenharmony_ci return error; 166162306a36Sopenharmony_ci 166262306a36Sopenharmony_ci xfs_ilock(ip, XFS_ILOCK_EXCL); 166362306a36Sopenharmony_ci xfs_trans_ijoin(tp, ip, 0); 166462306a36Sopenharmony_ci 166562306a36Sopenharmony_ci error = xfs_reflink_clear_inode_flag(ip, &tp); 166662306a36Sopenharmony_ci if (error) 166762306a36Sopenharmony_ci goto cancel; 166862306a36Sopenharmony_ci 166962306a36Sopenharmony_ci error = xfs_trans_commit(tp); 167062306a36Sopenharmony_ci 
if (error) 167162306a36Sopenharmony_ci goto out; 167262306a36Sopenharmony_ci 167362306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 167462306a36Sopenharmony_ci return 0; 167562306a36Sopenharmony_cicancel: 167662306a36Sopenharmony_ci xfs_trans_cancel(tp); 167762306a36Sopenharmony_ciout: 167862306a36Sopenharmony_ci xfs_iunlock(ip, XFS_ILOCK_EXCL); 167962306a36Sopenharmony_ci return error; 168062306a36Sopenharmony_ci} 168162306a36Sopenharmony_ci 168262306a36Sopenharmony_ci/* 168362306a36Sopenharmony_ci * Pre-COW all shared blocks within a given byte range of a file and turn off 168462306a36Sopenharmony_ci * the reflink flag if we unshare all of the file's blocks. 168562306a36Sopenharmony_ci */ 168662306a36Sopenharmony_ciint 168762306a36Sopenharmony_cixfs_reflink_unshare( 168862306a36Sopenharmony_ci struct xfs_inode *ip, 168962306a36Sopenharmony_ci xfs_off_t offset, 169062306a36Sopenharmony_ci xfs_off_t len) 169162306a36Sopenharmony_ci{ 169262306a36Sopenharmony_ci struct inode *inode = VFS_I(ip); 169362306a36Sopenharmony_ci int error; 169462306a36Sopenharmony_ci 169562306a36Sopenharmony_ci if (!xfs_is_reflink_inode(ip)) 169662306a36Sopenharmony_ci return 0; 169762306a36Sopenharmony_ci 169862306a36Sopenharmony_ci trace_xfs_reflink_unshare(ip, offset, len); 169962306a36Sopenharmony_ci 170062306a36Sopenharmony_ci inode_dio_wait(inode); 170162306a36Sopenharmony_ci 170262306a36Sopenharmony_ci if (IS_DAX(inode)) 170362306a36Sopenharmony_ci error = dax_file_unshare(inode, offset, len, 170462306a36Sopenharmony_ci &xfs_dax_write_iomap_ops); 170562306a36Sopenharmony_ci else 170662306a36Sopenharmony_ci error = iomap_file_unshare(inode, offset, len, 170762306a36Sopenharmony_ci &xfs_buffered_write_iomap_ops); 170862306a36Sopenharmony_ci if (error) 170962306a36Sopenharmony_ci goto out; 171062306a36Sopenharmony_ci 171162306a36Sopenharmony_ci error = filemap_write_and_wait_range(inode->i_mapping, offset, 171262306a36Sopenharmony_ci offset + len - 1); 171362306a36Sopenharmony_ci 
if (error) 171462306a36Sopenharmony_ci goto out; 171562306a36Sopenharmony_ci 171662306a36Sopenharmony_ci /* Turn off the reflink flag if possible. */ 171762306a36Sopenharmony_ci error = xfs_reflink_try_clear_inode_flag(ip); 171862306a36Sopenharmony_ci if (error) 171962306a36Sopenharmony_ci goto out; 172062306a36Sopenharmony_ci return 0; 172162306a36Sopenharmony_ci 172262306a36Sopenharmony_ciout: 172362306a36Sopenharmony_ci trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); 172462306a36Sopenharmony_ci return error; 172562306a36Sopenharmony_ci} 1726