162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0+
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2016 Oracle.  All Rights Reserved.
462306a36Sopenharmony_ci * Author: Darrick J. Wong <darrick.wong@oracle.com>
562306a36Sopenharmony_ci */
662306a36Sopenharmony_ci#include "xfs.h"
762306a36Sopenharmony_ci#include "xfs_fs.h"
862306a36Sopenharmony_ci#include "xfs_shared.h"
962306a36Sopenharmony_ci#include "xfs_format.h"
1062306a36Sopenharmony_ci#include "xfs_log_format.h"
1162306a36Sopenharmony_ci#include "xfs_trans_resv.h"
1262306a36Sopenharmony_ci#include "xfs_mount.h"
1362306a36Sopenharmony_ci#include "xfs_defer.h"
1462306a36Sopenharmony_ci#include "xfs_inode.h"
1562306a36Sopenharmony_ci#include "xfs_trans.h"
1662306a36Sopenharmony_ci#include "xfs_bmap.h"
1762306a36Sopenharmony_ci#include "xfs_bmap_util.h"
1862306a36Sopenharmony_ci#include "xfs_trace.h"
1962306a36Sopenharmony_ci#include "xfs_icache.h"
2062306a36Sopenharmony_ci#include "xfs_btree.h"
2162306a36Sopenharmony_ci#include "xfs_refcount_btree.h"
2262306a36Sopenharmony_ci#include "xfs_refcount.h"
2362306a36Sopenharmony_ci#include "xfs_bmap_btree.h"
2462306a36Sopenharmony_ci#include "xfs_trans_space.h"
2562306a36Sopenharmony_ci#include "xfs_bit.h"
2662306a36Sopenharmony_ci#include "xfs_alloc.h"
2762306a36Sopenharmony_ci#include "xfs_quota.h"
2862306a36Sopenharmony_ci#include "xfs_reflink.h"
2962306a36Sopenharmony_ci#include "xfs_iomap.h"
3062306a36Sopenharmony_ci#include "xfs_ag.h"
3162306a36Sopenharmony_ci#include "xfs_ag_resv.h"
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_ci/*
3462306a36Sopenharmony_ci * Copy on Write of Shared Blocks
3562306a36Sopenharmony_ci *
3662306a36Sopenharmony_ci * XFS must preserve "the usual" file semantics even when two files share
3762306a36Sopenharmony_ci * the same physical blocks.  This means that a write to one file must not
3862306a36Sopenharmony_ci * alter the blocks in a different file; the way that we'll do that is
3962306a36Sopenharmony_ci * through the use of a copy-on-write mechanism.  At a high level, that
4062306a36Sopenharmony_ci * means that when we want to write to a shared block, we allocate a new
4162306a36Sopenharmony_ci * block, write the data to the new block, and if that succeeds we map the
4262306a36Sopenharmony_ci * new block into the file.
4362306a36Sopenharmony_ci *
4462306a36Sopenharmony_ci * XFS provides a "delayed allocation" mechanism that defers the allocation
4562306a36Sopenharmony_ci * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
4662306a36Sopenharmony_ci * possible.  This reduces fragmentation by enabling the filesystem to ask
4762306a36Sopenharmony_ci * for bigger chunks less often, which is exactly what we want for CoW.
4862306a36Sopenharmony_ci *
4962306a36Sopenharmony_ci * The delalloc mechanism begins when the kernel wants to make a block
5062306a36Sopenharmony_ci * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
5162306a36Sopenharmony_ci * create a delalloc mapping, which is a regular in-core extent, but without
5262306a36Sopenharmony_ci * a real startblock.  (For delalloc mappings, the startblock encodes both
5362306a36Sopenharmony_ci * a flag that this is a delalloc mapping, and a worst-case estimate of how
5462306a36Sopenharmony_ci * many blocks might be required to put the mapping into the BMBT.)  delalloc
5562306a36Sopenharmony_ci * mappings are a reservation against the free space in the filesystem;
5662306a36Sopenharmony_ci * adjacent mappings can also be combined into fewer larger mappings.
5762306a36Sopenharmony_ci *
5862306a36Sopenharmony_ci * As an optimization, the CoW extent size hint (cowextsz) creates
5962306a36Sopenharmony_ci * outsized aligned delalloc reservations in the hope of landing out of
6062306a36Sopenharmony_ci * order nearby CoW writes in a single extent on disk, thereby reducing
6162306a36Sopenharmony_ci * fragmentation and improving future performance.
6262306a36Sopenharmony_ci *
6362306a36Sopenharmony_ci * D: --RRRRRRSSSRRRRRRRR--- (data fork)
6462306a36Sopenharmony_ci * C: ------DDDDDDD--------- (CoW fork)
6562306a36Sopenharmony_ci *
6662306a36Sopenharmony_ci * When dirty pages are being written out (typically in writepage), the
6762306a36Sopenharmony_ci * delalloc reservations are converted into unwritten mappings by
6862306a36Sopenharmony_ci * allocating blocks and replacing the delalloc mapping with real ones.
6962306a36Sopenharmony_ci * A delalloc mapping can be replaced by several unwritten ones if the
7062306a36Sopenharmony_ci * free space is fragmented.
7162306a36Sopenharmony_ci *
7262306a36Sopenharmony_ci * D: --RRRRRRSSSRRRRRRRR---
7362306a36Sopenharmony_ci * C: ------UUUUUUU---------
7462306a36Sopenharmony_ci *
7562306a36Sopenharmony_ci * We want to adapt the delalloc mechanism for copy-on-write, since the
7662306a36Sopenharmony_ci * write paths are similar.  The first two steps (creating the reservation
7762306a36Sopenharmony_ci * and allocating the blocks) are exactly the same as delalloc except that
7862306a36Sopenharmony_ci * the mappings must be stored in a separate CoW fork because we do not want
7962306a36Sopenharmony_ci * to disturb the mapping in the data fork until we're sure that the write
8062306a36Sopenharmony_ci * succeeded.  IO completion in this case is the process of removing the old
8162306a36Sopenharmony_ci * mapping from the data fork and moving the new mapping from the CoW fork to
8262306a36Sopenharmony_ci * the data fork.  This will be discussed shortly.
8362306a36Sopenharmony_ci *
8462306a36Sopenharmony_ci * For now, unaligned directio writes will be bounced back to the page cache.
8562306a36Sopenharmony_ci * Block-aligned directio writes will use the same mechanism as buffered
8662306a36Sopenharmony_ci * writes.
8762306a36Sopenharmony_ci *
8862306a36Sopenharmony_ci * Just prior to submitting the actual disk write requests, we convert
8962306a36Sopenharmony_ci * the extents representing the range of the file actually being written
9062306a36Sopenharmony_ci * (as opposed to extra pieces created for the cowextsize hint) to real
9162306a36Sopenharmony_ci * extents.  This will become important in the next step:
9262306a36Sopenharmony_ci *
9362306a36Sopenharmony_ci * D: --RRRRRRSSSRRRRRRRR---
9462306a36Sopenharmony_ci * C: ------UUrrUUU---------
9562306a36Sopenharmony_ci *
9662306a36Sopenharmony_ci * CoW remapping must be done after the data block write completes,
9762306a36Sopenharmony_ci * because we don't want to destroy the old data fork map until we're sure
9862306a36Sopenharmony_ci * the new block has been written.  Since the new mappings are kept in a
9962306a36Sopenharmony_ci * separate fork, we can simply iterate these mappings to find the ones
10062306a36Sopenharmony_ci * that cover the file blocks that we just CoW'd.  For each extent, simply
10162306a36Sopenharmony_ci * unmap the corresponding range in the data fork, map the new range into
10262306a36Sopenharmony_ci * the data fork, and remove the extent from the CoW fork.  Because of
10362306a36Sopenharmony_ci * the presence of the cowextsize hint, however, we must be careful
10462306a36Sopenharmony_ci * only to remap the blocks that we've actually written out --  we must
10562306a36Sopenharmony_ci * never remap delalloc reservations nor CoW staging blocks that have
10662306a36Sopenharmony_ci * yet to be written.  This corresponds exactly to the real extents in
10762306a36Sopenharmony_ci * the CoW fork:
10862306a36Sopenharmony_ci *
10962306a36Sopenharmony_ci * D: --RRRRRRrrSRRRRRRRR---
11062306a36Sopenharmony_ci * C: ------UU--UUU---------
11162306a36Sopenharmony_ci *
11262306a36Sopenharmony_ci * Since the remapping operation can be applied to an arbitrary file
11362306a36Sopenharmony_ci * range, we record the need for the remap step as a flag in the ioend
11462306a36Sopenharmony_ci * instead of declaring a new IO type.  This is required for direct io
11562306a36Sopenharmony_ci * because we only have ioend for the whole dio, and we have to be able to
11662306a36Sopenharmony_ci * remember the presence of unwritten blocks and CoW blocks with a single
11762306a36Sopenharmony_ci * ioend structure.  Better yet, the more ground we can cover with one
11862306a36Sopenharmony_ci * ioend, the better.
11962306a36Sopenharmony_ci */
12062306a36Sopenharmony_ci
12162306a36Sopenharmony_ci/*
12262306a36Sopenharmony_ci * Given an AG extent, find the lowest-numbered run of shared blocks
12362306a36Sopenharmony_ci * within that range and return the range in fbno/flen.  If
12462306a36Sopenharmony_ci * find_end_of_shared is true, return the longest contiguous extent of
12562306a36Sopenharmony_ci * shared blocks.  If there are no shared extents, fbno and flen will
12662306a36Sopenharmony_ci * be set to NULLAGBLOCK and 0, respectively.
12762306a36Sopenharmony_ci */
12862306a36Sopenharmony_cistatic int
12962306a36Sopenharmony_cixfs_reflink_find_shared(
13062306a36Sopenharmony_ci	struct xfs_perag	*pag,
13162306a36Sopenharmony_ci	struct xfs_trans	*tp,
13262306a36Sopenharmony_ci	xfs_agblock_t		agbno,
13362306a36Sopenharmony_ci	xfs_extlen_t		aglen,
13462306a36Sopenharmony_ci	xfs_agblock_t		*fbno,
13562306a36Sopenharmony_ci	xfs_extlen_t		*flen,
13662306a36Sopenharmony_ci	bool			find_end_of_shared)
13762306a36Sopenharmony_ci{
13862306a36Sopenharmony_ci	struct xfs_buf		*agbp;
13962306a36Sopenharmony_ci	struct xfs_btree_cur	*cur;
14062306a36Sopenharmony_ci	int			error;
14162306a36Sopenharmony_ci
14262306a36Sopenharmony_ci	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
14362306a36Sopenharmony_ci	if (error)
14462306a36Sopenharmony_ci		return error;
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci	cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);
14762306a36Sopenharmony_ci
14862306a36Sopenharmony_ci	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
14962306a36Sopenharmony_ci			find_end_of_shared);
15062306a36Sopenharmony_ci
15162306a36Sopenharmony_ci	xfs_btree_del_cursor(cur, error);
15262306a36Sopenharmony_ci
15362306a36Sopenharmony_ci	xfs_trans_brelse(tp, agbp);
15462306a36Sopenharmony_ci	return error;
15562306a36Sopenharmony_ci}
15662306a36Sopenharmony_ci
15762306a36Sopenharmony_ci/*
15862306a36Sopenharmony_ci * Trim the mapping to the next block where there's a change in the
15962306a36Sopenharmony_ci * shared/unshared status.  More specifically, this means that we
16062306a36Sopenharmony_ci * find the lowest-numbered extent of shared blocks that coincides with
16162306a36Sopenharmony_ci * the given block mapping.  If the shared extent overlaps the start of
16262306a36Sopenharmony_ci * the mapping, trim the mapping to the end of the shared extent.  If
16362306a36Sopenharmony_ci * the shared region intersects the mapping, trim the mapping to the
16462306a36Sopenharmony_ci * start of the shared extent.  If there are no shared regions that
16562306a36Sopenharmony_ci * overlap, just return the original extent.
16662306a36Sopenharmony_ci */
16762306a36Sopenharmony_ciint
16862306a36Sopenharmony_cixfs_reflink_trim_around_shared(
16962306a36Sopenharmony_ci	struct xfs_inode	*ip,
17062306a36Sopenharmony_ci	struct xfs_bmbt_irec	*irec,
17162306a36Sopenharmony_ci	bool			*shared)
17262306a36Sopenharmony_ci{
17362306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
17462306a36Sopenharmony_ci	struct xfs_perag	*pag;
17562306a36Sopenharmony_ci	xfs_agblock_t		agbno;
17662306a36Sopenharmony_ci	xfs_extlen_t		aglen;
17762306a36Sopenharmony_ci	xfs_agblock_t		fbno;
17862306a36Sopenharmony_ci	xfs_extlen_t		flen;
17962306a36Sopenharmony_ci	int			error = 0;
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci	/* Holes, unwritten, and delalloc extents cannot be shared */
18262306a36Sopenharmony_ci	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
18362306a36Sopenharmony_ci		*shared = false;
18462306a36Sopenharmony_ci		return 0;
18562306a36Sopenharmony_ci	}
18662306a36Sopenharmony_ci
18762306a36Sopenharmony_ci	trace_xfs_reflink_trim_around_shared(ip, irec);
18862306a36Sopenharmony_ci
18962306a36Sopenharmony_ci	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
19062306a36Sopenharmony_ci	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
19162306a36Sopenharmony_ci	aglen = irec->br_blockcount;
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci	error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
19462306a36Sopenharmony_ci			true);
19562306a36Sopenharmony_ci	xfs_perag_put(pag);
19662306a36Sopenharmony_ci	if (error)
19762306a36Sopenharmony_ci		return error;
19862306a36Sopenharmony_ci
19962306a36Sopenharmony_ci	*shared = false;
20062306a36Sopenharmony_ci	if (fbno == NULLAGBLOCK) {
20162306a36Sopenharmony_ci		/* No shared blocks at all. */
20262306a36Sopenharmony_ci		return 0;
20362306a36Sopenharmony_ci	}
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci	if (fbno == agbno) {
20662306a36Sopenharmony_ci		/*
20762306a36Sopenharmony_ci		 * The start of this extent is shared.  Truncate the
20862306a36Sopenharmony_ci		 * mapping at the end of the shared region so that a
20962306a36Sopenharmony_ci		 * subsequent iteration starts at the start of the
21062306a36Sopenharmony_ci		 * unshared region.
21162306a36Sopenharmony_ci		 */
21262306a36Sopenharmony_ci		irec->br_blockcount = flen;
21362306a36Sopenharmony_ci		*shared = true;
21462306a36Sopenharmony_ci		return 0;
21562306a36Sopenharmony_ci	}
21662306a36Sopenharmony_ci
21762306a36Sopenharmony_ci	/*
21862306a36Sopenharmony_ci	 * There's a shared extent midway through this extent.
21962306a36Sopenharmony_ci	 * Truncate the mapping at the start of the shared
22062306a36Sopenharmony_ci	 * extent so that a subsequent iteration starts at the
22162306a36Sopenharmony_ci	 * start of the shared region.
22262306a36Sopenharmony_ci	 */
22362306a36Sopenharmony_ci	irec->br_blockcount = fbno - agbno;
22462306a36Sopenharmony_ci	return 0;
22562306a36Sopenharmony_ci}
22662306a36Sopenharmony_ci
22762306a36Sopenharmony_ciint
22862306a36Sopenharmony_cixfs_bmap_trim_cow(
22962306a36Sopenharmony_ci	struct xfs_inode	*ip,
23062306a36Sopenharmony_ci	struct xfs_bmbt_irec	*imap,
23162306a36Sopenharmony_ci	bool			*shared)
23262306a36Sopenharmony_ci{
23362306a36Sopenharmony_ci	/* We can't update any real extents in always COW mode. */
23462306a36Sopenharmony_ci	if (xfs_is_always_cow_inode(ip) &&
23562306a36Sopenharmony_ci	    !isnullstartblock(imap->br_startblock)) {
23662306a36Sopenharmony_ci		*shared = true;
23762306a36Sopenharmony_ci		return 0;
23862306a36Sopenharmony_ci	}
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci	/* Trim the mapping to the nearest shared extent boundary. */
24162306a36Sopenharmony_ci	return xfs_reflink_trim_around_shared(ip, imap, shared);
24262306a36Sopenharmony_ci}
24362306a36Sopenharmony_ci
24462306a36Sopenharmony_cistatic int
24562306a36Sopenharmony_cixfs_reflink_convert_cow_locked(
24662306a36Sopenharmony_ci	struct xfs_inode	*ip,
24762306a36Sopenharmony_ci	xfs_fileoff_t		offset_fsb,
24862306a36Sopenharmony_ci	xfs_filblks_t		count_fsb)
24962306a36Sopenharmony_ci{
25062306a36Sopenharmony_ci	struct xfs_iext_cursor	icur;
25162306a36Sopenharmony_ci	struct xfs_bmbt_irec	got;
25262306a36Sopenharmony_ci	struct xfs_btree_cur	*dummy_cur = NULL;
25362306a36Sopenharmony_ci	int			dummy_logflags;
25462306a36Sopenharmony_ci	int			error = 0;
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_ci	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
25762306a36Sopenharmony_ci		return 0;
25862306a36Sopenharmony_ci
25962306a36Sopenharmony_ci	do {
26062306a36Sopenharmony_ci		if (got.br_startoff >= offset_fsb + count_fsb)
26162306a36Sopenharmony_ci			break;
26262306a36Sopenharmony_ci		if (got.br_state == XFS_EXT_NORM)
26362306a36Sopenharmony_ci			continue;
26462306a36Sopenharmony_ci		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
26562306a36Sopenharmony_ci			return -EIO;
26662306a36Sopenharmony_ci
26762306a36Sopenharmony_ci		xfs_trim_extent(&got, offset_fsb, count_fsb);
26862306a36Sopenharmony_ci		if (!got.br_blockcount)
26962306a36Sopenharmony_ci			continue;
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci		got.br_state = XFS_EXT_NORM;
27262306a36Sopenharmony_ci		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
27362306a36Sopenharmony_ci				XFS_COW_FORK, &icur, &dummy_cur, &got,
27462306a36Sopenharmony_ci				&dummy_logflags);
27562306a36Sopenharmony_ci		if (error)
27662306a36Sopenharmony_ci			return error;
27762306a36Sopenharmony_ci	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	return error;
28062306a36Sopenharmony_ci}
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci/* Convert all of the unwritten CoW extents in a file's range to real ones. */
28362306a36Sopenharmony_ciint
28462306a36Sopenharmony_cixfs_reflink_convert_cow(
28562306a36Sopenharmony_ci	struct xfs_inode	*ip,
28662306a36Sopenharmony_ci	xfs_off_t		offset,
28762306a36Sopenharmony_ci	xfs_off_t		count)
28862306a36Sopenharmony_ci{
28962306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
29062306a36Sopenharmony_ci	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
29162306a36Sopenharmony_ci	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
29262306a36Sopenharmony_ci	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
29362306a36Sopenharmony_ci	int			error;
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci	ASSERT(count != 0);
29662306a36Sopenharmony_ci
29762306a36Sopenharmony_ci	xfs_ilock(ip, XFS_ILOCK_EXCL);
29862306a36Sopenharmony_ci	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
29962306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
30062306a36Sopenharmony_ci	return error;
30162306a36Sopenharmony_ci}
30262306a36Sopenharmony_ci
30362306a36Sopenharmony_ci/*
30462306a36Sopenharmony_ci * Find the extent that maps the given range in the COW fork. Even if the extent
30562306a36Sopenharmony_ci * is not shared we might have a preallocation for it in the COW fork. If so we
30662306a36Sopenharmony_ci * use it that rather than trigger a new allocation.
30762306a36Sopenharmony_ci */
30862306a36Sopenharmony_cistatic int
30962306a36Sopenharmony_cixfs_find_trim_cow_extent(
31062306a36Sopenharmony_ci	struct xfs_inode	*ip,
31162306a36Sopenharmony_ci	struct xfs_bmbt_irec	*imap,
31262306a36Sopenharmony_ci	struct xfs_bmbt_irec	*cmap,
31362306a36Sopenharmony_ci	bool			*shared,
31462306a36Sopenharmony_ci	bool			*found)
31562306a36Sopenharmony_ci{
31662306a36Sopenharmony_ci	xfs_fileoff_t		offset_fsb = imap->br_startoff;
31762306a36Sopenharmony_ci	xfs_filblks_t		count_fsb = imap->br_blockcount;
31862306a36Sopenharmony_ci	struct xfs_iext_cursor	icur;
31962306a36Sopenharmony_ci
32062306a36Sopenharmony_ci	*found = false;
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci	/*
32362306a36Sopenharmony_ci	 * If we don't find an overlapping extent, trim the range we need to
32462306a36Sopenharmony_ci	 * allocate to fit the hole we found.
32562306a36Sopenharmony_ci	 */
32662306a36Sopenharmony_ci	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
32762306a36Sopenharmony_ci		cmap->br_startoff = offset_fsb + count_fsb;
32862306a36Sopenharmony_ci	if (cmap->br_startoff > offset_fsb) {
32962306a36Sopenharmony_ci		xfs_trim_extent(imap, imap->br_startoff,
33062306a36Sopenharmony_ci				cmap->br_startoff - imap->br_startoff);
33162306a36Sopenharmony_ci		return xfs_bmap_trim_cow(ip, imap, shared);
33262306a36Sopenharmony_ci	}
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci	*shared = true;
33562306a36Sopenharmony_ci	if (isnullstartblock(cmap->br_startblock)) {
33662306a36Sopenharmony_ci		xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
33762306a36Sopenharmony_ci		return 0;
33862306a36Sopenharmony_ci	}
33962306a36Sopenharmony_ci
34062306a36Sopenharmony_ci	/* real extent found - no need to allocate */
34162306a36Sopenharmony_ci	xfs_trim_extent(cmap, offset_fsb, count_fsb);
34262306a36Sopenharmony_ci	*found = true;
34362306a36Sopenharmony_ci	return 0;
34462306a36Sopenharmony_ci}
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_cistatic int
34762306a36Sopenharmony_cixfs_reflink_convert_unwritten(
34862306a36Sopenharmony_ci	struct xfs_inode	*ip,
34962306a36Sopenharmony_ci	struct xfs_bmbt_irec	*imap,
35062306a36Sopenharmony_ci	struct xfs_bmbt_irec	*cmap,
35162306a36Sopenharmony_ci	bool			convert_now)
35262306a36Sopenharmony_ci{
35362306a36Sopenharmony_ci	xfs_fileoff_t		offset_fsb = imap->br_startoff;
35462306a36Sopenharmony_ci	xfs_filblks_t		count_fsb = imap->br_blockcount;
35562306a36Sopenharmony_ci	int			error;
35662306a36Sopenharmony_ci
35762306a36Sopenharmony_ci	/*
35862306a36Sopenharmony_ci	 * cmap might larger than imap due to cowextsize hint.
35962306a36Sopenharmony_ci	 */
36062306a36Sopenharmony_ci	xfs_trim_extent(cmap, offset_fsb, count_fsb);
36162306a36Sopenharmony_ci
36262306a36Sopenharmony_ci	/*
36362306a36Sopenharmony_ci	 * COW fork extents are supposed to remain unwritten until we're ready
36462306a36Sopenharmony_ci	 * to initiate a disk write.  For direct I/O we are going to write the
36562306a36Sopenharmony_ci	 * data and need the conversion, but for buffered writes we're done.
36662306a36Sopenharmony_ci	 */
36762306a36Sopenharmony_ci	if (!convert_now || cmap->br_state == XFS_EXT_NORM)
36862306a36Sopenharmony_ci		return 0;
36962306a36Sopenharmony_ci
37062306a36Sopenharmony_ci	trace_xfs_reflink_convert_cow(ip, cmap);
37162306a36Sopenharmony_ci
37262306a36Sopenharmony_ci	error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
37362306a36Sopenharmony_ci	if (!error)
37462306a36Sopenharmony_ci		cmap->br_state = XFS_EXT_NORM;
37562306a36Sopenharmony_ci
37662306a36Sopenharmony_ci	return error;
37762306a36Sopenharmony_ci}
37862306a36Sopenharmony_ci
37962306a36Sopenharmony_cistatic int
38062306a36Sopenharmony_cixfs_reflink_fill_cow_hole(
38162306a36Sopenharmony_ci	struct xfs_inode	*ip,
38262306a36Sopenharmony_ci	struct xfs_bmbt_irec	*imap,
38362306a36Sopenharmony_ci	struct xfs_bmbt_irec	*cmap,
38462306a36Sopenharmony_ci	bool			*shared,
38562306a36Sopenharmony_ci	uint			*lockmode,
38662306a36Sopenharmony_ci	bool			convert_now)
38762306a36Sopenharmony_ci{
38862306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
38962306a36Sopenharmony_ci	struct xfs_trans	*tp;
39062306a36Sopenharmony_ci	xfs_filblks_t		resaligned;
39162306a36Sopenharmony_ci	xfs_extlen_t		resblks;
39262306a36Sopenharmony_ci	int			nimaps;
39362306a36Sopenharmony_ci	int			error;
39462306a36Sopenharmony_ci	bool			found;
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_ci	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
39762306a36Sopenharmony_ci		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
39862306a36Sopenharmony_ci	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
39962306a36Sopenharmony_ci
40062306a36Sopenharmony_ci	xfs_iunlock(ip, *lockmode);
40162306a36Sopenharmony_ci	*lockmode = 0;
40262306a36Sopenharmony_ci
40362306a36Sopenharmony_ci	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
40462306a36Sopenharmony_ci			false, &tp);
40562306a36Sopenharmony_ci	if (error)
40662306a36Sopenharmony_ci		return error;
40762306a36Sopenharmony_ci
40862306a36Sopenharmony_ci	*lockmode = XFS_ILOCK_EXCL;
40962306a36Sopenharmony_ci
41062306a36Sopenharmony_ci	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
41162306a36Sopenharmony_ci	if (error || !*shared)
41262306a36Sopenharmony_ci		goto out_trans_cancel;
41362306a36Sopenharmony_ci
41462306a36Sopenharmony_ci	if (found) {
41562306a36Sopenharmony_ci		xfs_trans_cancel(tp);
41662306a36Sopenharmony_ci		goto convert;
41762306a36Sopenharmony_ci	}
41862306a36Sopenharmony_ci
41962306a36Sopenharmony_ci	/* Allocate the entire reservation as unwritten blocks. */
42062306a36Sopenharmony_ci	nimaps = 1;
42162306a36Sopenharmony_ci	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
42262306a36Sopenharmony_ci			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
42362306a36Sopenharmony_ci			&nimaps);
42462306a36Sopenharmony_ci	if (error)
42562306a36Sopenharmony_ci		goto out_trans_cancel;
42662306a36Sopenharmony_ci
42762306a36Sopenharmony_ci	xfs_inode_set_cowblocks_tag(ip);
42862306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
42962306a36Sopenharmony_ci	if (error)
43062306a36Sopenharmony_ci		return error;
43162306a36Sopenharmony_ci
43262306a36Sopenharmony_ci	/*
43362306a36Sopenharmony_ci	 * Allocation succeeded but the requested range was not even partially
43462306a36Sopenharmony_ci	 * satisfied?  Bail out!
43562306a36Sopenharmony_ci	 */
43662306a36Sopenharmony_ci	if (nimaps == 0)
43762306a36Sopenharmony_ci		return -ENOSPC;
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ciconvert:
44062306a36Sopenharmony_ci	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
44162306a36Sopenharmony_ci
44262306a36Sopenharmony_ciout_trans_cancel:
44362306a36Sopenharmony_ci	xfs_trans_cancel(tp);
44462306a36Sopenharmony_ci	return error;
44562306a36Sopenharmony_ci}
44662306a36Sopenharmony_ci
44762306a36Sopenharmony_cistatic int
44862306a36Sopenharmony_cixfs_reflink_fill_delalloc(
44962306a36Sopenharmony_ci	struct xfs_inode	*ip,
45062306a36Sopenharmony_ci	struct xfs_bmbt_irec	*imap,
45162306a36Sopenharmony_ci	struct xfs_bmbt_irec	*cmap,
45262306a36Sopenharmony_ci	bool			*shared,
45362306a36Sopenharmony_ci	uint			*lockmode,
45462306a36Sopenharmony_ci	bool			convert_now)
45562306a36Sopenharmony_ci{
45662306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
45762306a36Sopenharmony_ci	struct xfs_trans	*tp;
45862306a36Sopenharmony_ci	int			nimaps;
45962306a36Sopenharmony_ci	int			error;
46062306a36Sopenharmony_ci	bool			found;
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci	do {
46362306a36Sopenharmony_ci		xfs_iunlock(ip, *lockmode);
46462306a36Sopenharmony_ci		*lockmode = 0;
46562306a36Sopenharmony_ci
46662306a36Sopenharmony_ci		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
46762306a36Sopenharmony_ci				false, &tp);
46862306a36Sopenharmony_ci		if (error)
46962306a36Sopenharmony_ci			return error;
47062306a36Sopenharmony_ci
47162306a36Sopenharmony_ci		*lockmode = XFS_ILOCK_EXCL;
47262306a36Sopenharmony_ci
47362306a36Sopenharmony_ci		error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
47462306a36Sopenharmony_ci				&found);
47562306a36Sopenharmony_ci		if (error || !*shared)
47662306a36Sopenharmony_ci			goto out_trans_cancel;
47762306a36Sopenharmony_ci
47862306a36Sopenharmony_ci		if (found) {
47962306a36Sopenharmony_ci			xfs_trans_cancel(tp);
48062306a36Sopenharmony_ci			break;
48162306a36Sopenharmony_ci		}
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ci		ASSERT(isnullstartblock(cmap->br_startblock) ||
48462306a36Sopenharmony_ci		       cmap->br_startblock == DELAYSTARTBLOCK);
48562306a36Sopenharmony_ci
48662306a36Sopenharmony_ci		/*
48762306a36Sopenharmony_ci		 * Replace delalloc reservation with an unwritten extent.
48862306a36Sopenharmony_ci		 */
48962306a36Sopenharmony_ci		nimaps = 1;
49062306a36Sopenharmony_ci		error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
49162306a36Sopenharmony_ci				cmap->br_blockcount,
49262306a36Sopenharmony_ci				XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
49362306a36Sopenharmony_ci				cmap, &nimaps);
49462306a36Sopenharmony_ci		if (error)
49562306a36Sopenharmony_ci			goto out_trans_cancel;
49662306a36Sopenharmony_ci
49762306a36Sopenharmony_ci		xfs_inode_set_cowblocks_tag(ip);
49862306a36Sopenharmony_ci		error = xfs_trans_commit(tp);
49962306a36Sopenharmony_ci		if (error)
50062306a36Sopenharmony_ci			return error;
50162306a36Sopenharmony_ci
50262306a36Sopenharmony_ci		/*
50362306a36Sopenharmony_ci		 * Allocation succeeded but the requested range was not even
50462306a36Sopenharmony_ci		 * partially satisfied?  Bail out!
50562306a36Sopenharmony_ci		 */
50662306a36Sopenharmony_ci		if (nimaps == 0)
50762306a36Sopenharmony_ci			return -ENOSPC;
50862306a36Sopenharmony_ci	} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
50962306a36Sopenharmony_ci
51062306a36Sopenharmony_ci	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
51162306a36Sopenharmony_ci
51262306a36Sopenharmony_ciout_trans_cancel:
51362306a36Sopenharmony_ci	xfs_trans_cancel(tp);
51462306a36Sopenharmony_ci	return error;
51562306a36Sopenharmony_ci}
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci/* Allocate all CoW reservations covering a range of blocks in a file. */
51862306a36Sopenharmony_ciint
51962306a36Sopenharmony_cixfs_reflink_allocate_cow(
52062306a36Sopenharmony_ci	struct xfs_inode	*ip,
52162306a36Sopenharmony_ci	struct xfs_bmbt_irec	*imap,
52262306a36Sopenharmony_ci	struct xfs_bmbt_irec	*cmap,
52362306a36Sopenharmony_ci	bool			*shared,
52462306a36Sopenharmony_ci	uint			*lockmode,
52562306a36Sopenharmony_ci	bool			convert_now)
52662306a36Sopenharmony_ci{
52762306a36Sopenharmony_ci	int			error;
52862306a36Sopenharmony_ci	bool			found;
52962306a36Sopenharmony_ci
53062306a36Sopenharmony_ci	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
53162306a36Sopenharmony_ci	if (!ip->i_cowfp) {
53262306a36Sopenharmony_ci		ASSERT(!xfs_is_reflink_inode(ip));
53362306a36Sopenharmony_ci		xfs_ifork_init_cow(ip);
53462306a36Sopenharmony_ci	}
53562306a36Sopenharmony_ci
53662306a36Sopenharmony_ci	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
53762306a36Sopenharmony_ci	if (error || !*shared)
53862306a36Sopenharmony_ci		return error;
53962306a36Sopenharmony_ci
54062306a36Sopenharmony_ci	/* CoW fork has a real extent */
54162306a36Sopenharmony_ci	if (found)
54262306a36Sopenharmony_ci		return xfs_reflink_convert_unwritten(ip, imap, cmap,
54362306a36Sopenharmony_ci				convert_now);
54462306a36Sopenharmony_ci
54562306a36Sopenharmony_ci	/*
54662306a36Sopenharmony_ci	 * CoW fork does not have an extent and data extent is shared.
54762306a36Sopenharmony_ci	 * Allocate a real extent in the CoW fork.
54862306a36Sopenharmony_ci	 */
54962306a36Sopenharmony_ci	if (cmap->br_startoff > imap->br_startoff)
55062306a36Sopenharmony_ci		return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
55162306a36Sopenharmony_ci				lockmode, convert_now);
55262306a36Sopenharmony_ci
55362306a36Sopenharmony_ci	/*
55462306a36Sopenharmony_ci	 * CoW fork has a delalloc reservation. Replace it with a real extent.
55562306a36Sopenharmony_ci	 * There may or may not be a data fork mapping.
55662306a36Sopenharmony_ci	 */
55762306a36Sopenharmony_ci	if (isnullstartblock(cmap->br_startblock) ||
55862306a36Sopenharmony_ci	    cmap->br_startblock == DELAYSTARTBLOCK)
55962306a36Sopenharmony_ci		return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
56062306a36Sopenharmony_ci				lockmode, convert_now);
56162306a36Sopenharmony_ci
56262306a36Sopenharmony_ci	/* Shouldn't get here. */
56362306a36Sopenharmony_ci	ASSERT(0);
56462306a36Sopenharmony_ci	return -EFSCORRUPTED;
56562306a36Sopenharmony_ci}
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_ci/*
56862306a36Sopenharmony_ci * Cancel CoW reservations for some block range of an inode.
56962306a36Sopenharmony_ci *
57062306a36Sopenharmony_ci * If cancel_real is true this function cancels all COW fork extents for the
57162306a36Sopenharmony_ci * inode; if cancel_real is false, real extents are not cleared.
57262306a36Sopenharmony_ci *
57362306a36Sopenharmony_ci * Caller must have already joined the inode to the current transaction. The
57462306a36Sopenharmony_ci * inode will be joined to the transaction returned to the caller.
57562306a36Sopenharmony_ci */
57662306a36Sopenharmony_ciint
57762306a36Sopenharmony_cixfs_reflink_cancel_cow_blocks(
57862306a36Sopenharmony_ci	struct xfs_inode		*ip,
57962306a36Sopenharmony_ci	struct xfs_trans		**tpp,
58062306a36Sopenharmony_ci	xfs_fileoff_t			offset_fsb,
58162306a36Sopenharmony_ci	xfs_fileoff_t			end_fsb,
58262306a36Sopenharmony_ci	bool				cancel_real)
58362306a36Sopenharmony_ci{
58462306a36Sopenharmony_ci	struct xfs_ifork		*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
58562306a36Sopenharmony_ci	struct xfs_bmbt_irec		got, del;
58662306a36Sopenharmony_ci	struct xfs_iext_cursor		icur;
58762306a36Sopenharmony_ci	int				error = 0;
58862306a36Sopenharmony_ci
58962306a36Sopenharmony_ci	if (!xfs_inode_has_cow_data(ip))
59062306a36Sopenharmony_ci		return 0;
59162306a36Sopenharmony_ci	if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
59262306a36Sopenharmony_ci		return 0;
59362306a36Sopenharmony_ci
59462306a36Sopenharmony_ci	/* Walk backwards until we're out of the I/O range... */
59562306a36Sopenharmony_ci	while (got.br_startoff + got.br_blockcount > offset_fsb) {
59662306a36Sopenharmony_ci		del = got;
59762306a36Sopenharmony_ci		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
59862306a36Sopenharmony_ci
59962306a36Sopenharmony_ci		/* Extent delete may have bumped ext forward */
60062306a36Sopenharmony_ci		if (!del.br_blockcount) {
60162306a36Sopenharmony_ci			xfs_iext_prev(ifp, &icur);
60262306a36Sopenharmony_ci			goto next_extent;
60362306a36Sopenharmony_ci		}
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_ci		trace_xfs_reflink_cancel_cow(ip, &del);
60662306a36Sopenharmony_ci
60762306a36Sopenharmony_ci		if (isnullstartblock(del.br_startblock)) {
60862306a36Sopenharmony_ci			error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
60962306a36Sopenharmony_ci					&icur, &got, &del);
61062306a36Sopenharmony_ci			if (error)
61162306a36Sopenharmony_ci				break;
61262306a36Sopenharmony_ci		} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
61362306a36Sopenharmony_ci			ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
61462306a36Sopenharmony_ci
61562306a36Sopenharmony_ci			/* Free the CoW orphan record. */
61662306a36Sopenharmony_ci			xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
61762306a36Sopenharmony_ci					del.br_blockcount);
61862306a36Sopenharmony_ci
61962306a36Sopenharmony_ci			error = xfs_free_extent_later(*tpp, del.br_startblock,
62062306a36Sopenharmony_ci					del.br_blockcount, NULL,
62162306a36Sopenharmony_ci					XFS_AG_RESV_NONE);
62262306a36Sopenharmony_ci			if (error)
62362306a36Sopenharmony_ci				break;
62462306a36Sopenharmony_ci
62562306a36Sopenharmony_ci			/* Roll the transaction */
62662306a36Sopenharmony_ci			error = xfs_defer_finish(tpp);
62762306a36Sopenharmony_ci			if (error)
62862306a36Sopenharmony_ci				break;
62962306a36Sopenharmony_ci
63062306a36Sopenharmony_ci			/* Remove the mapping from the CoW fork. */
63162306a36Sopenharmony_ci			xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
63262306a36Sopenharmony_ci
63362306a36Sopenharmony_ci			/* Remove the quota reservation */
63462306a36Sopenharmony_ci			error = xfs_quota_unreserve_blkres(ip,
63562306a36Sopenharmony_ci					del.br_blockcount);
63662306a36Sopenharmony_ci			if (error)
63762306a36Sopenharmony_ci				break;
63862306a36Sopenharmony_ci		} else {
63962306a36Sopenharmony_ci			/* Didn't do anything, push cursor back. */
64062306a36Sopenharmony_ci			xfs_iext_prev(ifp, &icur);
64162306a36Sopenharmony_ci		}
64262306a36Sopenharmony_cinext_extent:
64362306a36Sopenharmony_ci		if (!xfs_iext_get_extent(ifp, &icur, &got))
64462306a36Sopenharmony_ci			break;
64562306a36Sopenharmony_ci	}
64662306a36Sopenharmony_ci
64762306a36Sopenharmony_ci	/* clear tag if cow fork is emptied */
64862306a36Sopenharmony_ci	if (!ifp->if_bytes)
64962306a36Sopenharmony_ci		xfs_inode_clear_cowblocks_tag(ip);
65062306a36Sopenharmony_ci	return error;
65162306a36Sopenharmony_ci}
65262306a36Sopenharmony_ci
65362306a36Sopenharmony_ci/*
65462306a36Sopenharmony_ci * Cancel CoW reservations for some byte range of an inode.
65562306a36Sopenharmony_ci *
65662306a36Sopenharmony_ci * If cancel_real is true this function cancels all COW fork extents for the
65762306a36Sopenharmony_ci * inode; if cancel_real is false, real extents are not cleared.
65862306a36Sopenharmony_ci */
65962306a36Sopenharmony_ciint
66062306a36Sopenharmony_cixfs_reflink_cancel_cow_range(
66162306a36Sopenharmony_ci	struct xfs_inode	*ip,
66262306a36Sopenharmony_ci	xfs_off_t		offset,
66362306a36Sopenharmony_ci	xfs_off_t		count,
66462306a36Sopenharmony_ci	bool			cancel_real)
66562306a36Sopenharmony_ci{
66662306a36Sopenharmony_ci	struct xfs_trans	*tp;
66762306a36Sopenharmony_ci	xfs_fileoff_t		offset_fsb;
66862306a36Sopenharmony_ci	xfs_fileoff_t		end_fsb;
66962306a36Sopenharmony_ci	int			error;
67062306a36Sopenharmony_ci
67162306a36Sopenharmony_ci	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
67262306a36Sopenharmony_ci	ASSERT(ip->i_cowfp);
67362306a36Sopenharmony_ci
67462306a36Sopenharmony_ci	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
67562306a36Sopenharmony_ci	if (count == NULLFILEOFF)
67662306a36Sopenharmony_ci		end_fsb = NULLFILEOFF;
67762306a36Sopenharmony_ci	else
67862306a36Sopenharmony_ci		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
67962306a36Sopenharmony_ci
68062306a36Sopenharmony_ci	/* Start a rolling transaction to remove the mappings */
68162306a36Sopenharmony_ci	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
68262306a36Sopenharmony_ci			0, 0, 0, &tp);
68362306a36Sopenharmony_ci	if (error)
68462306a36Sopenharmony_ci		goto out;
68562306a36Sopenharmony_ci
68662306a36Sopenharmony_ci	xfs_ilock(ip, XFS_ILOCK_EXCL);
68762306a36Sopenharmony_ci	xfs_trans_ijoin(tp, ip, 0);
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci	/* Scrape out the old CoW reservations */
69062306a36Sopenharmony_ci	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
69162306a36Sopenharmony_ci			cancel_real);
69262306a36Sopenharmony_ci	if (error)
69362306a36Sopenharmony_ci		goto out_cancel;
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
69662306a36Sopenharmony_ci
69762306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
69862306a36Sopenharmony_ci	return error;
69962306a36Sopenharmony_ci
70062306a36Sopenharmony_ciout_cancel:
70162306a36Sopenharmony_ci	xfs_trans_cancel(tp);
70262306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
70362306a36Sopenharmony_ciout:
70462306a36Sopenharmony_ci	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
70562306a36Sopenharmony_ci	return error;
70662306a36Sopenharmony_ci}
70762306a36Sopenharmony_ci
70862306a36Sopenharmony_ci/*
70962306a36Sopenharmony_ci * Remap part of the CoW fork into the data fork.
71062306a36Sopenharmony_ci *
71162306a36Sopenharmony_ci * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
71262306a36Sopenharmony_ci * into the data fork; this function will remap what it can (at the end of the
71362306a36Sopenharmony_ci * range) and update @end_fsb appropriately.  Each remap gets its own
71462306a36Sopenharmony_ci * transaction because we can end up merging and splitting bmbt blocks for
71562306a36Sopenharmony_ci * every remap operation and we'd like to keep the block reservation
71662306a36Sopenharmony_ci * requirements as low as possible.
71762306a36Sopenharmony_ci */
71862306a36Sopenharmony_ciSTATIC int
71962306a36Sopenharmony_cixfs_reflink_end_cow_extent(
72062306a36Sopenharmony_ci	struct xfs_inode	*ip,
72162306a36Sopenharmony_ci	xfs_fileoff_t		*offset_fsb,
72262306a36Sopenharmony_ci	xfs_fileoff_t		end_fsb)
72362306a36Sopenharmony_ci{
72462306a36Sopenharmony_ci	struct xfs_iext_cursor	icur;
72562306a36Sopenharmony_ci	struct xfs_bmbt_irec	got, del, data;
72662306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
72762306a36Sopenharmony_ci	struct xfs_trans	*tp;
72862306a36Sopenharmony_ci	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
72962306a36Sopenharmony_ci	unsigned int		resblks;
73062306a36Sopenharmony_ci	int			nmaps;
73162306a36Sopenharmony_ci	int			error;
73262306a36Sopenharmony_ci
73362306a36Sopenharmony_ci	/* No COW extents?  That's easy! */
73462306a36Sopenharmony_ci	if (ifp->if_bytes == 0) {
73562306a36Sopenharmony_ci		*offset_fsb = end_fsb;
73662306a36Sopenharmony_ci		return 0;
73762306a36Sopenharmony_ci	}
73862306a36Sopenharmony_ci
73962306a36Sopenharmony_ci	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
74062306a36Sopenharmony_ci	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
74162306a36Sopenharmony_ci			XFS_TRANS_RESERVE, &tp);
74262306a36Sopenharmony_ci	if (error)
74362306a36Sopenharmony_ci		return error;
74462306a36Sopenharmony_ci
74562306a36Sopenharmony_ci	/*
74662306a36Sopenharmony_ci	 * Lock the inode.  We have to ijoin without automatic unlock because
74762306a36Sopenharmony_ci	 * the lead transaction is the refcountbt record deletion; the data
74862306a36Sopenharmony_ci	 * fork update follows as a deferred log item.
74962306a36Sopenharmony_ci	 */
75062306a36Sopenharmony_ci	xfs_ilock(ip, XFS_ILOCK_EXCL);
75162306a36Sopenharmony_ci	xfs_trans_ijoin(tp, ip, 0);
75262306a36Sopenharmony_ci
75362306a36Sopenharmony_ci	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
75462306a36Sopenharmony_ci			XFS_IEXT_REFLINK_END_COW_CNT);
75562306a36Sopenharmony_ci	if (error == -EFBIG)
75662306a36Sopenharmony_ci		error = xfs_iext_count_upgrade(tp, ip,
75762306a36Sopenharmony_ci				XFS_IEXT_REFLINK_END_COW_CNT);
75862306a36Sopenharmony_ci	if (error)
75962306a36Sopenharmony_ci		goto out_cancel;
76062306a36Sopenharmony_ci
76162306a36Sopenharmony_ci	/*
76262306a36Sopenharmony_ci	 * In case of racing, overlapping AIO writes no COW extents might be
76362306a36Sopenharmony_ci	 * left by the time I/O completes for the loser of the race.  In that
76462306a36Sopenharmony_ci	 * case we are done.
76562306a36Sopenharmony_ci	 */
76662306a36Sopenharmony_ci	if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
76762306a36Sopenharmony_ci	    got.br_startoff >= end_fsb) {
76862306a36Sopenharmony_ci		*offset_fsb = end_fsb;
76962306a36Sopenharmony_ci		goto out_cancel;
77062306a36Sopenharmony_ci	}
77162306a36Sopenharmony_ci
77262306a36Sopenharmony_ci	/*
77362306a36Sopenharmony_ci	 * Only remap real extents that contain data.  With AIO, speculative
77462306a36Sopenharmony_ci	 * preallocations can leak into the range we are called upon, and we
77562306a36Sopenharmony_ci	 * need to skip them.  Preserve @got for the eventual CoW fork
77662306a36Sopenharmony_ci	 * deletion; from now on @del represents the mapping that we're
77762306a36Sopenharmony_ci	 * actually remapping.
77862306a36Sopenharmony_ci	 */
77962306a36Sopenharmony_ci	while (!xfs_bmap_is_written_extent(&got)) {
78062306a36Sopenharmony_ci		if (!xfs_iext_next_extent(ifp, &icur, &got) ||
78162306a36Sopenharmony_ci		    got.br_startoff >= end_fsb) {
78262306a36Sopenharmony_ci			*offset_fsb = end_fsb;
78362306a36Sopenharmony_ci			goto out_cancel;
78462306a36Sopenharmony_ci		}
78562306a36Sopenharmony_ci	}
78662306a36Sopenharmony_ci	del = got;
78762306a36Sopenharmony_ci	xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci	/* Grab the corresponding mapping in the data fork. */
79062306a36Sopenharmony_ci	nmaps = 1;
79162306a36Sopenharmony_ci	error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
79262306a36Sopenharmony_ci			&nmaps, 0);
79362306a36Sopenharmony_ci	if (error)
79462306a36Sopenharmony_ci		goto out_cancel;
79562306a36Sopenharmony_ci
79662306a36Sopenharmony_ci	/* We can only remap the smaller of the two extent sizes. */
79762306a36Sopenharmony_ci	data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
79862306a36Sopenharmony_ci	del.br_blockcount = data.br_blockcount;
79962306a36Sopenharmony_ci
80062306a36Sopenharmony_ci	trace_xfs_reflink_cow_remap_from(ip, &del);
80162306a36Sopenharmony_ci	trace_xfs_reflink_cow_remap_to(ip, &data);
80262306a36Sopenharmony_ci
80362306a36Sopenharmony_ci	if (xfs_bmap_is_real_extent(&data)) {
80462306a36Sopenharmony_ci		/*
80562306a36Sopenharmony_ci		 * If the extent we're remapping is backed by storage (written
80662306a36Sopenharmony_ci		 * or not), unmap the extent and drop its refcount.
80762306a36Sopenharmony_ci		 */
80862306a36Sopenharmony_ci		xfs_bmap_unmap_extent(tp, ip, &data);
80962306a36Sopenharmony_ci		xfs_refcount_decrease_extent(tp, &data);
81062306a36Sopenharmony_ci		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
81162306a36Sopenharmony_ci				-data.br_blockcount);
81262306a36Sopenharmony_ci	} else if (data.br_startblock == DELAYSTARTBLOCK) {
81362306a36Sopenharmony_ci		int		done;
81462306a36Sopenharmony_ci
81562306a36Sopenharmony_ci		/*
81662306a36Sopenharmony_ci		 * If the extent we're remapping is a delalloc reservation,
81762306a36Sopenharmony_ci		 * we can use the regular bunmapi function to release the
81862306a36Sopenharmony_ci		 * incore state.  Dropping the delalloc reservation takes care
81962306a36Sopenharmony_ci		 * of the quota reservation for us.
82062306a36Sopenharmony_ci		 */
82162306a36Sopenharmony_ci		error = xfs_bunmapi(NULL, ip, data.br_startoff,
82262306a36Sopenharmony_ci				data.br_blockcount, 0, 1, &done);
82362306a36Sopenharmony_ci		if (error)
82462306a36Sopenharmony_ci			goto out_cancel;
82562306a36Sopenharmony_ci		ASSERT(done);
82662306a36Sopenharmony_ci	}
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci	/* Free the CoW orphan record. */
82962306a36Sopenharmony_ci	xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
83062306a36Sopenharmony_ci
83162306a36Sopenharmony_ci	/* Map the new blocks into the data fork. */
83262306a36Sopenharmony_ci	xfs_bmap_map_extent(tp, ip, &del);
83362306a36Sopenharmony_ci
83462306a36Sopenharmony_ci	/* Charge this new data fork mapping to the on-disk quota. */
83562306a36Sopenharmony_ci	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
83662306a36Sopenharmony_ci			(long)del.br_blockcount);
83762306a36Sopenharmony_ci
83862306a36Sopenharmony_ci	/* Remove the mapping from the CoW fork. */
83962306a36Sopenharmony_ci	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
84262306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
84362306a36Sopenharmony_ci	if (error)
84462306a36Sopenharmony_ci		return error;
84562306a36Sopenharmony_ci
84662306a36Sopenharmony_ci	/* Update the caller about how much progress we made. */
84762306a36Sopenharmony_ci	*offset_fsb = del.br_startoff + del.br_blockcount;
84862306a36Sopenharmony_ci	return 0;
84962306a36Sopenharmony_ci
85062306a36Sopenharmony_ciout_cancel:
85162306a36Sopenharmony_ci	xfs_trans_cancel(tp);
85262306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
85362306a36Sopenharmony_ci	return error;
85462306a36Sopenharmony_ci}
85562306a36Sopenharmony_ci
85662306a36Sopenharmony_ci/*
85762306a36Sopenharmony_ci * Remap parts of a file's data fork after a successful CoW.
85862306a36Sopenharmony_ci */
85962306a36Sopenharmony_ciint
86062306a36Sopenharmony_cixfs_reflink_end_cow(
86162306a36Sopenharmony_ci	struct xfs_inode		*ip,
86262306a36Sopenharmony_ci	xfs_off_t			offset,
86362306a36Sopenharmony_ci	xfs_off_t			count)
86462306a36Sopenharmony_ci{
86562306a36Sopenharmony_ci	xfs_fileoff_t			offset_fsb;
86662306a36Sopenharmony_ci	xfs_fileoff_t			end_fsb;
86762306a36Sopenharmony_ci	int				error = 0;
86862306a36Sopenharmony_ci
86962306a36Sopenharmony_ci	trace_xfs_reflink_end_cow(ip, offset, count);
87062306a36Sopenharmony_ci
87162306a36Sopenharmony_ci	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
87262306a36Sopenharmony_ci	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
87362306a36Sopenharmony_ci
87462306a36Sopenharmony_ci	/*
87562306a36Sopenharmony_ci	 * Walk forwards until we've remapped the I/O range.  The loop function
87662306a36Sopenharmony_ci	 * repeatedly cycles the ILOCK to allocate one transaction per remapped
87762306a36Sopenharmony_ci	 * extent.
87862306a36Sopenharmony_ci	 *
87962306a36Sopenharmony_ci	 * If we're being called by writeback then the pages will still
88062306a36Sopenharmony_ci	 * have PageWriteback set, which prevents races with reflink remapping
88162306a36Sopenharmony_ci	 * and truncate.  Reflink remapping prevents races with writeback by
88262306a36Sopenharmony_ci	 * taking the iolock and mmaplock before flushing the pages and
88362306a36Sopenharmony_ci	 * remapping, which means there won't be any further writeback or page
88462306a36Sopenharmony_ci	 * cache dirtying until the reflink completes.
88562306a36Sopenharmony_ci	 *
88662306a36Sopenharmony_ci	 * We should never have two threads issuing writeback for the same file
88762306a36Sopenharmony_ci	 * region.  There are also have post-eof checks in the writeback
88862306a36Sopenharmony_ci	 * preparation code so that we don't bother writing out pages that are
88962306a36Sopenharmony_ci	 * about to be truncated.
89062306a36Sopenharmony_ci	 *
89162306a36Sopenharmony_ci	 * If we're being called as part of directio write completion, the dio
89262306a36Sopenharmony_ci	 * count is still elevated, which reflink and truncate will wait for.
89362306a36Sopenharmony_ci	 * Reflink remapping takes the iolock and mmaplock and waits for
89462306a36Sopenharmony_ci	 * pending dio to finish, which should prevent any directio until the
89562306a36Sopenharmony_ci	 * remap completes.  Multiple concurrent directio writes to the same
89662306a36Sopenharmony_ci	 * region are handled by end_cow processing only occurring for the
89762306a36Sopenharmony_ci	 * threads which succeed; the outcome of multiple overlapping direct
89862306a36Sopenharmony_ci	 * writes is not well defined anyway.
89962306a36Sopenharmony_ci	 *
90062306a36Sopenharmony_ci	 * It's possible that a buffered write and a direct write could collide
90162306a36Sopenharmony_ci	 * here (the buffered write stumbles in after the dio flushes and
90262306a36Sopenharmony_ci	 * invalidates the page cache and immediately queues writeback), but we
90362306a36Sopenharmony_ci	 * have never supported this 100%.  If either disk write succeeds the
90462306a36Sopenharmony_ci	 * blocks will be remapped.
90562306a36Sopenharmony_ci	 */
90662306a36Sopenharmony_ci	while (end_fsb > offset_fsb && !error)
90762306a36Sopenharmony_ci		error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
90862306a36Sopenharmony_ci
90962306a36Sopenharmony_ci	if (error)
91062306a36Sopenharmony_ci		trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
91162306a36Sopenharmony_ci	return error;
91262306a36Sopenharmony_ci}
91362306a36Sopenharmony_ci
91462306a36Sopenharmony_ci/*
91562306a36Sopenharmony_ci * Free all CoW staging blocks that are still referenced by the ondisk refcount
91662306a36Sopenharmony_ci * metadata.  The ondisk metadata does not track which inode created the
91762306a36Sopenharmony_ci * staging extent, so callers must ensure that there are no cached inodes with
91862306a36Sopenharmony_ci * live CoW staging extents.
91962306a36Sopenharmony_ci */
92062306a36Sopenharmony_ciint
92162306a36Sopenharmony_cixfs_reflink_recover_cow(
92262306a36Sopenharmony_ci	struct xfs_mount	*mp)
92362306a36Sopenharmony_ci{
92462306a36Sopenharmony_ci	struct xfs_perag	*pag;
92562306a36Sopenharmony_ci	xfs_agnumber_t		agno;
92662306a36Sopenharmony_ci	int			error = 0;
92762306a36Sopenharmony_ci
92862306a36Sopenharmony_ci	if (!xfs_has_reflink(mp))
92962306a36Sopenharmony_ci		return 0;
93062306a36Sopenharmony_ci
93162306a36Sopenharmony_ci	for_each_perag(mp, agno, pag) {
93262306a36Sopenharmony_ci		error = xfs_refcount_recover_cow_leftovers(mp, pag);
93362306a36Sopenharmony_ci		if (error) {
93462306a36Sopenharmony_ci			xfs_perag_rele(pag);
93562306a36Sopenharmony_ci			break;
93662306a36Sopenharmony_ci		}
93762306a36Sopenharmony_ci	}
93862306a36Sopenharmony_ci
93962306a36Sopenharmony_ci	return error;
94062306a36Sopenharmony_ci}
94162306a36Sopenharmony_ci
94262306a36Sopenharmony_ci/*
94362306a36Sopenharmony_ci * Reflinking (Block) Ranges of Two Files Together
94462306a36Sopenharmony_ci *
94562306a36Sopenharmony_ci * First, ensure that the reflink flag is set on both inodes.  The flag is an
94662306a36Sopenharmony_ci * optimization to avoid unnecessary refcount btree lookups in the write path.
94762306a36Sopenharmony_ci *
94862306a36Sopenharmony_ci * Now we can iteratively remap the range of extents (and holes) in src to the
94962306a36Sopenharmony_ci * corresponding ranges in dest.  Let drange and srange denote the ranges of
95062306a36Sopenharmony_ci * logical blocks in dest and src touched by the reflink operation.
95162306a36Sopenharmony_ci *
95262306a36Sopenharmony_ci * While the length of drange is greater than zero,
95362306a36Sopenharmony_ci *    - Read src's bmbt at the start of srange ("imap")
95462306a36Sopenharmony_ci *    - If imap doesn't exist, make imap appear to start at the end of srange
95562306a36Sopenharmony_ci *      with zero length.
95662306a36Sopenharmony_ci *    - If imap starts before srange, advance imap to start at srange.
95762306a36Sopenharmony_ci *    - If imap goes beyond srange, truncate imap to end at the end of srange.
95862306a36Sopenharmony_ci *    - Punch (imap start - srange start + imap len) blocks from dest at
95962306a36Sopenharmony_ci *      offset (drange start).
96062306a36Sopenharmony_ci *    - If imap points to a real range of pblks,
96162306a36Sopenharmony_ci *         > Increase the refcount of the imap's pblks
96262306a36Sopenharmony_ci *         > Map imap's pblks into dest at the offset
96362306a36Sopenharmony_ci *           (drange start + imap start - srange start)
96462306a36Sopenharmony_ci *    - Advance drange and srange by (imap start - srange start + imap len)
96562306a36Sopenharmony_ci *
96662306a36Sopenharmony_ci * Finally, if the reflink made dest longer, update both the in-core and
96762306a36Sopenharmony_ci * on-disk file sizes.
96862306a36Sopenharmony_ci *
96962306a36Sopenharmony_ci * ASCII Art Demonstration:
97062306a36Sopenharmony_ci *
97162306a36Sopenharmony_ci * Let's say we want to reflink this source file:
97262306a36Sopenharmony_ci *
97362306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS (src file)
97462306a36Sopenharmony_ci *   <-------------------->
97562306a36Sopenharmony_ci *
97662306a36Sopenharmony_ci * into this destination file:
97762306a36Sopenharmony_ci *
97862306a36Sopenharmony_ci * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
97962306a36Sopenharmony_ci *        <-------------------->
98062306a36Sopenharmony_ci * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
98162306a36Sopenharmony_ci * Observe that the range has different logical offsets in either file.
98262306a36Sopenharmony_ci *
98362306a36Sopenharmony_ci * Consider that the first extent in the source file doesn't line up with our
 * reflink range.  Unmapping and remapping are separate operations, so we can
98562306a36Sopenharmony_ci * unmap more blocks from the destination file than we remap.
98662306a36Sopenharmony_ci *
98762306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS
98862306a36Sopenharmony_ci *   <------->
98962306a36Sopenharmony_ci * --DDDDD---------DDDDD--DDD
99062306a36Sopenharmony_ci *        <------->
99162306a36Sopenharmony_ci *
99262306a36Sopenharmony_ci * Now remap the source extent into the destination file:
99362306a36Sopenharmony_ci *
99462306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS
99562306a36Sopenharmony_ci *   <------->
99662306a36Sopenharmony_ci * --DDDDD--SSSSSSSDDDDD--DDD
99762306a36Sopenharmony_ci *        <------->
99862306a36Sopenharmony_ci *
99962306a36Sopenharmony_ci * Do likewise with the second hole and extent in our range.  Holes in the
100062306a36Sopenharmony_ci * unmap range don't affect our operation.
100162306a36Sopenharmony_ci *
100262306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS
100362306a36Sopenharmony_ci *            <---->
100462306a36Sopenharmony_ci * --DDDDD--SSSSSSS-SSSSS-DDD
100562306a36Sopenharmony_ci *                 <---->
100662306a36Sopenharmony_ci *
100762306a36Sopenharmony_ci * Finally, unmap and remap part of the third extent.  This will increase the
100862306a36Sopenharmony_ci * size of the destination file.
100962306a36Sopenharmony_ci *
101062306a36Sopenharmony_ci * ----SSSSSSS-SSSSS----SSSSSS
101162306a36Sopenharmony_ci *                  <----->
101262306a36Sopenharmony_ci * --DDDDD--SSSSSSS-SSSSS----SSS
101362306a36Sopenharmony_ci *                       <----->
101462306a36Sopenharmony_ci *
101562306a36Sopenharmony_ci * Once we update the destination file's i_size, we're done.
101662306a36Sopenharmony_ci */
101762306a36Sopenharmony_ci
101862306a36Sopenharmony_ci/*
101962306a36Sopenharmony_ci * Ensure the reflink bit is set in both inodes.
102062306a36Sopenharmony_ci */
102162306a36Sopenharmony_ciSTATIC int
102262306a36Sopenharmony_cixfs_reflink_set_inode_flag(
102362306a36Sopenharmony_ci	struct xfs_inode	*src,
102462306a36Sopenharmony_ci	struct xfs_inode	*dest)
102562306a36Sopenharmony_ci{
102662306a36Sopenharmony_ci	struct xfs_mount	*mp = src->i_mount;
102762306a36Sopenharmony_ci	int			error;
102862306a36Sopenharmony_ci	struct xfs_trans	*tp;
102962306a36Sopenharmony_ci
103062306a36Sopenharmony_ci	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
103162306a36Sopenharmony_ci		return 0;
103262306a36Sopenharmony_ci
103362306a36Sopenharmony_ci	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
103462306a36Sopenharmony_ci	if (error)
103562306a36Sopenharmony_ci		goto out_error;
103662306a36Sopenharmony_ci
103762306a36Sopenharmony_ci	/* Lock both files against IO */
103862306a36Sopenharmony_ci	if (src->i_ino == dest->i_ino)
103962306a36Sopenharmony_ci		xfs_ilock(src, XFS_ILOCK_EXCL);
104062306a36Sopenharmony_ci	else
104162306a36Sopenharmony_ci		xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
104262306a36Sopenharmony_ci
104362306a36Sopenharmony_ci	if (!xfs_is_reflink_inode(src)) {
104462306a36Sopenharmony_ci		trace_xfs_reflink_set_inode_flag(src);
104562306a36Sopenharmony_ci		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
104662306a36Sopenharmony_ci		src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
104762306a36Sopenharmony_ci		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
104862306a36Sopenharmony_ci		xfs_ifork_init_cow(src);
104962306a36Sopenharmony_ci	} else
105062306a36Sopenharmony_ci		xfs_iunlock(src, XFS_ILOCK_EXCL);
105162306a36Sopenharmony_ci
105262306a36Sopenharmony_ci	if (src->i_ino == dest->i_ino)
105362306a36Sopenharmony_ci		goto commit_flags;
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_ci	if (!xfs_is_reflink_inode(dest)) {
105662306a36Sopenharmony_ci		trace_xfs_reflink_set_inode_flag(dest);
105762306a36Sopenharmony_ci		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
105862306a36Sopenharmony_ci		dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
105962306a36Sopenharmony_ci		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
106062306a36Sopenharmony_ci		xfs_ifork_init_cow(dest);
106162306a36Sopenharmony_ci	} else
106262306a36Sopenharmony_ci		xfs_iunlock(dest, XFS_ILOCK_EXCL);
106362306a36Sopenharmony_ci
106462306a36Sopenharmony_cicommit_flags:
106562306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
106662306a36Sopenharmony_ci	if (error)
106762306a36Sopenharmony_ci		goto out_error;
106862306a36Sopenharmony_ci	return error;
106962306a36Sopenharmony_ci
107062306a36Sopenharmony_ciout_error:
107162306a36Sopenharmony_ci	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
107262306a36Sopenharmony_ci	return error;
107362306a36Sopenharmony_ci}
107462306a36Sopenharmony_ci
107562306a36Sopenharmony_ci/*
107662306a36Sopenharmony_ci * Update destination inode size & cowextsize hint, if necessary.
107762306a36Sopenharmony_ci */
107862306a36Sopenharmony_ciint
107962306a36Sopenharmony_cixfs_reflink_update_dest(
108062306a36Sopenharmony_ci	struct xfs_inode	*dest,
108162306a36Sopenharmony_ci	xfs_off_t		newlen,
108262306a36Sopenharmony_ci	xfs_extlen_t		cowextsize,
108362306a36Sopenharmony_ci	unsigned int		remap_flags)
108462306a36Sopenharmony_ci{
108562306a36Sopenharmony_ci	struct xfs_mount	*mp = dest->i_mount;
108662306a36Sopenharmony_ci	struct xfs_trans	*tp;
108762306a36Sopenharmony_ci	int			error;
108862306a36Sopenharmony_ci
108962306a36Sopenharmony_ci	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
109062306a36Sopenharmony_ci		return 0;
109162306a36Sopenharmony_ci
109262306a36Sopenharmony_ci	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
109362306a36Sopenharmony_ci	if (error)
109462306a36Sopenharmony_ci		goto out_error;
109562306a36Sopenharmony_ci
109662306a36Sopenharmony_ci	xfs_ilock(dest, XFS_ILOCK_EXCL);
109762306a36Sopenharmony_ci	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
109862306a36Sopenharmony_ci
109962306a36Sopenharmony_ci	if (newlen > i_size_read(VFS_I(dest))) {
110062306a36Sopenharmony_ci		trace_xfs_reflink_update_inode_size(dest, newlen);
110162306a36Sopenharmony_ci		i_size_write(VFS_I(dest), newlen);
110262306a36Sopenharmony_ci		dest->i_disk_size = newlen;
110362306a36Sopenharmony_ci	}
110462306a36Sopenharmony_ci
110562306a36Sopenharmony_ci	if (cowextsize) {
110662306a36Sopenharmony_ci		dest->i_cowextsize = cowextsize;
110762306a36Sopenharmony_ci		dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
110862306a36Sopenharmony_ci	}
110962306a36Sopenharmony_ci
111062306a36Sopenharmony_ci	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
111162306a36Sopenharmony_ci
111262306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
111362306a36Sopenharmony_ci	if (error)
111462306a36Sopenharmony_ci		goto out_error;
111562306a36Sopenharmony_ci	return error;
111662306a36Sopenharmony_ci
111762306a36Sopenharmony_ciout_error:
111862306a36Sopenharmony_ci	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
111962306a36Sopenharmony_ci	return error;
112062306a36Sopenharmony_ci}
112162306a36Sopenharmony_ci
112262306a36Sopenharmony_ci/*
112362306a36Sopenharmony_ci * Do we have enough reserve in this AG to handle a reflink?  The refcount
112462306a36Sopenharmony_ci * btree already reserved all the space it needs, but the rmap btree can grow
112562306a36Sopenharmony_ci * infinitely, so we won't allow more reflinks when the AG is down to the
112662306a36Sopenharmony_ci * btree reserves.
112762306a36Sopenharmony_ci */
112862306a36Sopenharmony_cistatic int
112962306a36Sopenharmony_cixfs_reflink_ag_has_free_space(
113062306a36Sopenharmony_ci	struct xfs_mount	*mp,
113162306a36Sopenharmony_ci	xfs_agnumber_t		agno)
113262306a36Sopenharmony_ci{
113362306a36Sopenharmony_ci	struct xfs_perag	*pag;
113462306a36Sopenharmony_ci	int			error = 0;
113562306a36Sopenharmony_ci
113662306a36Sopenharmony_ci	if (!xfs_has_rmapbt(mp))
113762306a36Sopenharmony_ci		return 0;
113862306a36Sopenharmony_ci
113962306a36Sopenharmony_ci	pag = xfs_perag_get(mp, agno);
114062306a36Sopenharmony_ci	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
114162306a36Sopenharmony_ci	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
114262306a36Sopenharmony_ci		error = -ENOSPC;
114362306a36Sopenharmony_ci	xfs_perag_put(pag);
114462306a36Sopenharmony_ci	return error;
114562306a36Sopenharmony_ci}
114662306a36Sopenharmony_ci
114762306a36Sopenharmony_ci/*
114862306a36Sopenharmony_ci * Remap the given extent into the file.  The dmap blockcount will be set to
114962306a36Sopenharmony_ci * the number of blocks that were actually remapped.
115062306a36Sopenharmony_ci */
115162306a36Sopenharmony_ciSTATIC int
115262306a36Sopenharmony_cixfs_reflink_remap_extent(
115362306a36Sopenharmony_ci	struct xfs_inode	*ip,
115462306a36Sopenharmony_ci	struct xfs_bmbt_irec	*dmap,
115562306a36Sopenharmony_ci	xfs_off_t		new_isize)
115662306a36Sopenharmony_ci{
115762306a36Sopenharmony_ci	struct xfs_bmbt_irec	smap;
115862306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
115962306a36Sopenharmony_ci	struct xfs_trans	*tp;
116062306a36Sopenharmony_ci	xfs_off_t		newlen;
116162306a36Sopenharmony_ci	int64_t			qdelta = 0;
116262306a36Sopenharmony_ci	unsigned int		resblks;
116362306a36Sopenharmony_ci	bool			quota_reserved = true;
116462306a36Sopenharmony_ci	bool			smap_real;
116562306a36Sopenharmony_ci	bool			dmap_written = xfs_bmap_is_written_extent(dmap);
116662306a36Sopenharmony_ci	int			iext_delta = 0;
116762306a36Sopenharmony_ci	int			nimaps;
116862306a36Sopenharmony_ci	int			error;
116962306a36Sopenharmony_ci
117062306a36Sopenharmony_ci	/*
117162306a36Sopenharmony_ci	 * Start a rolling transaction to switch the mappings.
117262306a36Sopenharmony_ci	 *
117362306a36Sopenharmony_ci	 * Adding a written extent to the extent map can cause a bmbt split,
117462306a36Sopenharmony_ci	 * and removing a mapped extent from the extent can cause a bmbt split.
117562306a36Sopenharmony_ci	 * The two operations cannot both cause a split since they operate on
117662306a36Sopenharmony_ci	 * the same index in the bmap btree, so we only need a reservation for
117762306a36Sopenharmony_ci	 * one bmbt split if either thing is happening.  However, we haven't
117862306a36Sopenharmony_ci	 * locked the inode yet, so we reserve assuming this is the case.
117962306a36Sopenharmony_ci	 *
118062306a36Sopenharmony_ci	 * The first allocation call tries to reserve enough space to handle
118162306a36Sopenharmony_ci	 * mapping dmap into a sparse part of the file plus the bmbt split.  We
118262306a36Sopenharmony_ci	 * haven't locked the inode or read the existing mapping yet, so we do
118362306a36Sopenharmony_ci	 * not know for sure that we need the space.  This should succeed most
118462306a36Sopenharmony_ci	 * of the time.
118562306a36Sopenharmony_ci	 *
118662306a36Sopenharmony_ci	 * If the first attempt fails, try again but reserving only enough
118762306a36Sopenharmony_ci	 * space to handle a bmbt split.  This is the hard minimum requirement,
118862306a36Sopenharmony_ci	 * and we revisit quota reservations later when we know more about what
118962306a36Sopenharmony_ci	 * we're remapping.
119062306a36Sopenharmony_ci	 */
119162306a36Sopenharmony_ci	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
119262306a36Sopenharmony_ci	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
119362306a36Sopenharmony_ci			resblks + dmap->br_blockcount, 0, false, &tp);
119462306a36Sopenharmony_ci	if (error == -EDQUOT || error == -ENOSPC) {
119562306a36Sopenharmony_ci		quota_reserved = false;
119662306a36Sopenharmony_ci		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
119762306a36Sopenharmony_ci				resblks, 0, false, &tp);
119862306a36Sopenharmony_ci	}
119962306a36Sopenharmony_ci	if (error)
120062306a36Sopenharmony_ci		goto out;
120162306a36Sopenharmony_ci
120262306a36Sopenharmony_ci	/*
120362306a36Sopenharmony_ci	 * Read what's currently mapped in the destination file into smap.
120462306a36Sopenharmony_ci	 * If smap isn't a hole, we will have to remove it before we can add
120562306a36Sopenharmony_ci	 * dmap to the destination file.
120662306a36Sopenharmony_ci	 */
120762306a36Sopenharmony_ci	nimaps = 1;
120862306a36Sopenharmony_ci	error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
120962306a36Sopenharmony_ci			&smap, &nimaps, 0);
121062306a36Sopenharmony_ci	if (error)
121162306a36Sopenharmony_ci		goto out_cancel;
121262306a36Sopenharmony_ci	ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
121362306a36Sopenharmony_ci	smap_real = xfs_bmap_is_real_extent(&smap);
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_ci	/*
121662306a36Sopenharmony_ci	 * We can only remap as many blocks as the smaller of the two extent
121762306a36Sopenharmony_ci	 * maps, because we can only remap one extent at a time.
121862306a36Sopenharmony_ci	 */
121962306a36Sopenharmony_ci	dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
122062306a36Sopenharmony_ci	ASSERT(dmap->br_blockcount == smap.br_blockcount);
122162306a36Sopenharmony_ci
122262306a36Sopenharmony_ci	trace_xfs_reflink_remap_extent_dest(ip, &smap);
122362306a36Sopenharmony_ci
122462306a36Sopenharmony_ci	/*
122562306a36Sopenharmony_ci	 * Two extents mapped to the same physical block must not have
122662306a36Sopenharmony_ci	 * different states; that's filesystem corruption.  Move on to the next
122762306a36Sopenharmony_ci	 * extent if they're both holes or both the same physical extent.
122862306a36Sopenharmony_ci	 */
122962306a36Sopenharmony_ci	if (dmap->br_startblock == smap.br_startblock) {
123062306a36Sopenharmony_ci		if (dmap->br_state != smap.br_state)
123162306a36Sopenharmony_ci			error = -EFSCORRUPTED;
123262306a36Sopenharmony_ci		goto out_cancel;
123362306a36Sopenharmony_ci	}
123462306a36Sopenharmony_ci
123562306a36Sopenharmony_ci	/* If both extents are unwritten, leave them alone. */
123662306a36Sopenharmony_ci	if (dmap->br_state == XFS_EXT_UNWRITTEN &&
123762306a36Sopenharmony_ci	    smap.br_state == XFS_EXT_UNWRITTEN)
123862306a36Sopenharmony_ci		goto out_cancel;
123962306a36Sopenharmony_ci
124062306a36Sopenharmony_ci	/* No reflinking if the AG of the dest mapping is low on space. */
124162306a36Sopenharmony_ci	if (dmap_written) {
124262306a36Sopenharmony_ci		error = xfs_reflink_ag_has_free_space(mp,
124362306a36Sopenharmony_ci				XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
124462306a36Sopenharmony_ci		if (error)
124562306a36Sopenharmony_ci			goto out_cancel;
124662306a36Sopenharmony_ci	}
124762306a36Sopenharmony_ci
124862306a36Sopenharmony_ci	/*
124962306a36Sopenharmony_ci	 * Increase quota reservation if we think the quota block counter for
125062306a36Sopenharmony_ci	 * this file could increase.
125162306a36Sopenharmony_ci	 *
125262306a36Sopenharmony_ci	 * If we are mapping a written extent into the file, we need to have
125362306a36Sopenharmony_ci	 * enough quota block count reservation to handle the blocks in that
125462306a36Sopenharmony_ci	 * extent.  We log only the delta to the quota block counts, so if the
125562306a36Sopenharmony_ci	 * extent we're unmapping also has blocks allocated to it, we don't
125662306a36Sopenharmony_ci	 * need a quota reservation for the extent itself.
125762306a36Sopenharmony_ci	 *
125862306a36Sopenharmony_ci	 * Note that if we're replacing a delalloc reservation with a written
125962306a36Sopenharmony_ci	 * extent, we have to take the full quota reservation because removing
126062306a36Sopenharmony_ci	 * the delalloc reservation gives the block count back to the quota
126162306a36Sopenharmony_ci	 * count.  This is suboptimal, but the VFS flushed the dest range
126262306a36Sopenharmony_ci	 * before we started.  That should have removed all the delalloc
126362306a36Sopenharmony_ci	 * reservations, but we code defensively.
126462306a36Sopenharmony_ci	 *
126562306a36Sopenharmony_ci	 * xfs_trans_alloc_inode above already tried to grab an even larger
126662306a36Sopenharmony_ci	 * quota reservation, and kicked off a blockgc scan if it couldn't.
126762306a36Sopenharmony_ci	 * If we can't get a potentially smaller quota reservation now, we're
126862306a36Sopenharmony_ci	 * done.
126962306a36Sopenharmony_ci	 */
127062306a36Sopenharmony_ci	if (!quota_reserved && !smap_real && dmap_written) {
127162306a36Sopenharmony_ci		error = xfs_trans_reserve_quota_nblks(tp, ip,
127262306a36Sopenharmony_ci				dmap->br_blockcount, 0, false);
127362306a36Sopenharmony_ci		if (error)
127462306a36Sopenharmony_ci			goto out_cancel;
127562306a36Sopenharmony_ci	}
127662306a36Sopenharmony_ci
127762306a36Sopenharmony_ci	if (smap_real)
127862306a36Sopenharmony_ci		++iext_delta;
127962306a36Sopenharmony_ci
128062306a36Sopenharmony_ci	if (dmap_written)
128162306a36Sopenharmony_ci		++iext_delta;
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
128462306a36Sopenharmony_ci	if (error == -EFBIG)
128562306a36Sopenharmony_ci		error = xfs_iext_count_upgrade(tp, ip, iext_delta);
128662306a36Sopenharmony_ci	if (error)
128762306a36Sopenharmony_ci		goto out_cancel;
128862306a36Sopenharmony_ci
128962306a36Sopenharmony_ci	if (smap_real) {
129062306a36Sopenharmony_ci		/*
129162306a36Sopenharmony_ci		 * If the extent we're unmapping is backed by storage (written
129262306a36Sopenharmony_ci		 * or not), unmap the extent and drop its refcount.
129362306a36Sopenharmony_ci		 */
129462306a36Sopenharmony_ci		xfs_bmap_unmap_extent(tp, ip, &smap);
129562306a36Sopenharmony_ci		xfs_refcount_decrease_extent(tp, &smap);
129662306a36Sopenharmony_ci		qdelta -= smap.br_blockcount;
129762306a36Sopenharmony_ci	} else if (smap.br_startblock == DELAYSTARTBLOCK) {
129862306a36Sopenharmony_ci		int		done;
129962306a36Sopenharmony_ci
130062306a36Sopenharmony_ci		/*
130162306a36Sopenharmony_ci		 * If the extent we're unmapping is a delalloc reservation,
130262306a36Sopenharmony_ci		 * we can use the regular bunmapi function to release the
130362306a36Sopenharmony_ci		 * incore state.  Dropping the delalloc reservation takes care
130462306a36Sopenharmony_ci		 * of the quota reservation for us.
130562306a36Sopenharmony_ci		 */
130662306a36Sopenharmony_ci		error = xfs_bunmapi(NULL, ip, smap.br_startoff,
130762306a36Sopenharmony_ci				smap.br_blockcount, 0, 1, &done);
130862306a36Sopenharmony_ci		if (error)
130962306a36Sopenharmony_ci			goto out_cancel;
131062306a36Sopenharmony_ci		ASSERT(done);
131162306a36Sopenharmony_ci	}
131262306a36Sopenharmony_ci
131362306a36Sopenharmony_ci	/*
131462306a36Sopenharmony_ci	 * If the extent we're sharing is backed by written storage, increase
131562306a36Sopenharmony_ci	 * its refcount and map it into the file.
131662306a36Sopenharmony_ci	 */
131762306a36Sopenharmony_ci	if (dmap_written) {
131862306a36Sopenharmony_ci		xfs_refcount_increase_extent(tp, dmap);
131962306a36Sopenharmony_ci		xfs_bmap_map_extent(tp, ip, dmap);
132062306a36Sopenharmony_ci		qdelta += dmap->br_blockcount;
132162306a36Sopenharmony_ci	}
132262306a36Sopenharmony_ci
132362306a36Sopenharmony_ci	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
132462306a36Sopenharmony_ci
132562306a36Sopenharmony_ci	/* Update dest isize if needed. */
132662306a36Sopenharmony_ci	newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
132762306a36Sopenharmony_ci	newlen = min_t(xfs_off_t, newlen, new_isize);
132862306a36Sopenharmony_ci	if (newlen > i_size_read(VFS_I(ip))) {
132962306a36Sopenharmony_ci		trace_xfs_reflink_update_inode_size(ip, newlen);
133062306a36Sopenharmony_ci		i_size_write(VFS_I(ip), newlen);
133162306a36Sopenharmony_ci		ip->i_disk_size = newlen;
133262306a36Sopenharmony_ci		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
133362306a36Sopenharmony_ci	}
133462306a36Sopenharmony_ci
133562306a36Sopenharmony_ci	/* Commit everything and unlock. */
133662306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
133762306a36Sopenharmony_ci	goto out_unlock;
133862306a36Sopenharmony_ci
133962306a36Sopenharmony_ciout_cancel:
134062306a36Sopenharmony_ci	xfs_trans_cancel(tp);
134162306a36Sopenharmony_ciout_unlock:
134262306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
134362306a36Sopenharmony_ciout:
134462306a36Sopenharmony_ci	if (error)
134562306a36Sopenharmony_ci		trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
134662306a36Sopenharmony_ci	return error;
134762306a36Sopenharmony_ci}
134862306a36Sopenharmony_ci
134962306a36Sopenharmony_ci/* Remap a range of one file to the other. */
135062306a36Sopenharmony_ciint
135162306a36Sopenharmony_cixfs_reflink_remap_blocks(
135262306a36Sopenharmony_ci	struct xfs_inode	*src,
135362306a36Sopenharmony_ci	loff_t			pos_in,
135462306a36Sopenharmony_ci	struct xfs_inode	*dest,
135562306a36Sopenharmony_ci	loff_t			pos_out,
135662306a36Sopenharmony_ci	loff_t			remap_len,
135762306a36Sopenharmony_ci	loff_t			*remapped)
135862306a36Sopenharmony_ci{
135962306a36Sopenharmony_ci	struct xfs_bmbt_irec	imap;
136062306a36Sopenharmony_ci	struct xfs_mount	*mp = src->i_mount;
136162306a36Sopenharmony_ci	xfs_fileoff_t		srcoff = XFS_B_TO_FSBT(mp, pos_in);
136262306a36Sopenharmony_ci	xfs_fileoff_t		destoff = XFS_B_TO_FSBT(mp, pos_out);
136362306a36Sopenharmony_ci	xfs_filblks_t		len;
136462306a36Sopenharmony_ci	xfs_filblks_t		remapped_len = 0;
136562306a36Sopenharmony_ci	xfs_off_t		new_isize = pos_out + remap_len;
136662306a36Sopenharmony_ci	int			nimaps;
136762306a36Sopenharmony_ci	int			error = 0;
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_ci	len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
137062306a36Sopenharmony_ci			XFS_MAX_FILEOFF);
137162306a36Sopenharmony_ci
137262306a36Sopenharmony_ci	trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
137362306a36Sopenharmony_ci
137462306a36Sopenharmony_ci	while (len > 0) {
137562306a36Sopenharmony_ci		unsigned int	lock_mode;
137662306a36Sopenharmony_ci
137762306a36Sopenharmony_ci		/* Read extent from the source file */
137862306a36Sopenharmony_ci		nimaps = 1;
137962306a36Sopenharmony_ci		lock_mode = xfs_ilock_data_map_shared(src);
138062306a36Sopenharmony_ci		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
138162306a36Sopenharmony_ci		xfs_iunlock(src, lock_mode);
138262306a36Sopenharmony_ci		if (error)
138362306a36Sopenharmony_ci			break;
138462306a36Sopenharmony_ci		/*
138562306a36Sopenharmony_ci		 * The caller supposedly flushed all dirty pages in the source
138662306a36Sopenharmony_ci		 * file range, which means that writeback should have allocated
138762306a36Sopenharmony_ci		 * or deleted all delalloc reservations in that range.  If we
138862306a36Sopenharmony_ci		 * find one, that's a good sign that something is seriously
138962306a36Sopenharmony_ci		 * wrong here.
139062306a36Sopenharmony_ci		 */
139162306a36Sopenharmony_ci		ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
139262306a36Sopenharmony_ci		if (imap.br_startblock == DELAYSTARTBLOCK) {
139362306a36Sopenharmony_ci			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
139462306a36Sopenharmony_ci			error = -EFSCORRUPTED;
139562306a36Sopenharmony_ci			break;
139662306a36Sopenharmony_ci		}
139762306a36Sopenharmony_ci
139862306a36Sopenharmony_ci		trace_xfs_reflink_remap_extent_src(src, &imap);
139962306a36Sopenharmony_ci
140062306a36Sopenharmony_ci		/* Remap into the destination file at the given offset. */
140162306a36Sopenharmony_ci		imap.br_startoff = destoff;
140262306a36Sopenharmony_ci		error = xfs_reflink_remap_extent(dest, &imap, new_isize);
140362306a36Sopenharmony_ci		if (error)
140462306a36Sopenharmony_ci			break;
140562306a36Sopenharmony_ci
140662306a36Sopenharmony_ci		if (fatal_signal_pending(current)) {
140762306a36Sopenharmony_ci			error = -EINTR;
140862306a36Sopenharmony_ci			break;
140962306a36Sopenharmony_ci		}
141062306a36Sopenharmony_ci
141162306a36Sopenharmony_ci		/* Advance drange/srange */
141262306a36Sopenharmony_ci		srcoff += imap.br_blockcount;
141362306a36Sopenharmony_ci		destoff += imap.br_blockcount;
141462306a36Sopenharmony_ci		len -= imap.br_blockcount;
141562306a36Sopenharmony_ci		remapped_len += imap.br_blockcount;
141662306a36Sopenharmony_ci	}
141762306a36Sopenharmony_ci
141862306a36Sopenharmony_ci	if (error)
141962306a36Sopenharmony_ci		trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
142062306a36Sopenharmony_ci	*remapped = min_t(loff_t, remap_len,
142162306a36Sopenharmony_ci			  XFS_FSB_TO_B(src->i_mount, remapped_len));
142262306a36Sopenharmony_ci	return error;
142362306a36Sopenharmony_ci}
142462306a36Sopenharmony_ci
142562306a36Sopenharmony_ci/*
142662306a36Sopenharmony_ci * If we're reflinking to a point past the destination file's EOF, we must
142762306a36Sopenharmony_ci * zero any speculative post-EOF preallocations that sit between the old EOF
142862306a36Sopenharmony_ci * and the destination file offset.
142962306a36Sopenharmony_ci */
143062306a36Sopenharmony_cistatic int
143162306a36Sopenharmony_cixfs_reflink_zero_posteof(
143262306a36Sopenharmony_ci	struct xfs_inode	*ip,
143362306a36Sopenharmony_ci	loff_t			pos)
143462306a36Sopenharmony_ci{
143562306a36Sopenharmony_ci	loff_t			isize = i_size_read(VFS_I(ip));
143662306a36Sopenharmony_ci
143762306a36Sopenharmony_ci	if (pos <= isize)
143862306a36Sopenharmony_ci		return 0;
143962306a36Sopenharmony_ci
144062306a36Sopenharmony_ci	trace_xfs_zero_eof(ip, isize, pos - isize);
144162306a36Sopenharmony_ci	return xfs_zero_range(ip, isize, pos - isize, NULL);
144262306a36Sopenharmony_ci}
144362306a36Sopenharmony_ci
144462306a36Sopenharmony_ci/*
144562306a36Sopenharmony_ci * Prepare two files for range cloning.  Upon a successful return both inodes
144662306a36Sopenharmony_ci * will have the iolock and mmaplock held, the page cache of the out file will
144762306a36Sopenharmony_ci * be truncated, and any leases on the out file will have been broken.  This
144862306a36Sopenharmony_ci * function borrows heavily from xfs_file_aio_write_checks.
144962306a36Sopenharmony_ci *
145062306a36Sopenharmony_ci * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
145162306a36Sopenharmony_ci * checked that the bytes beyond EOF physically match. Hence we cannot use the
145262306a36Sopenharmony_ci * EOF block in the source dedupe range because it's not a complete block match,
145362306a36Sopenharmony_ci * hence can introduce a corruption into the file that has it's block replaced.
145462306a36Sopenharmony_ci *
145562306a36Sopenharmony_ci * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
145662306a36Sopenharmony_ci * "block aligned" for the purposes of cloning entire files.  However, if the
145762306a36Sopenharmony_ci * source file range includes the EOF block and it lands within the existing EOF
145862306a36Sopenharmony_ci * of the destination file, then we can expose stale data from beyond the source
145962306a36Sopenharmony_ci * file EOF in the destination file.
146062306a36Sopenharmony_ci *
146162306a36Sopenharmony_ci * XFS doesn't support partial block sharing, so in both cases we have check
146262306a36Sopenharmony_ci * these cases ourselves. For dedupe, we can simply round the length to dedupe
146362306a36Sopenharmony_ci * down to the previous whole block and ignore the partial EOF block. While this
146462306a36Sopenharmony_ci * means we can't dedupe the last block of a file, this is an acceptible
146562306a36Sopenharmony_ci * tradeoff for simplicity on implementation.
146662306a36Sopenharmony_ci *
146762306a36Sopenharmony_ci * For cloning, we want to share the partial EOF block if it is also the new EOF
146862306a36Sopenharmony_ci * block of the destination file. If the partial EOF block lies inside the
146962306a36Sopenharmony_ci * existing destination EOF, then we have to abort the clone to avoid exposing
147062306a36Sopenharmony_ci * stale data in the destination file. Hence we reject these clone attempts with
147162306a36Sopenharmony_ci * -EINVAL in this case.
147262306a36Sopenharmony_ci */
147362306a36Sopenharmony_ciint
147462306a36Sopenharmony_cixfs_reflink_remap_prep(
147562306a36Sopenharmony_ci	struct file		*file_in,
147662306a36Sopenharmony_ci	loff_t			pos_in,
147762306a36Sopenharmony_ci	struct file		*file_out,
147862306a36Sopenharmony_ci	loff_t			pos_out,
147962306a36Sopenharmony_ci	loff_t			*len,
148062306a36Sopenharmony_ci	unsigned int		remap_flags)
148162306a36Sopenharmony_ci{
148262306a36Sopenharmony_ci	struct inode		*inode_in = file_inode(file_in);
148362306a36Sopenharmony_ci	struct xfs_inode	*src = XFS_I(inode_in);
148462306a36Sopenharmony_ci	struct inode		*inode_out = file_inode(file_out);
148562306a36Sopenharmony_ci	struct xfs_inode	*dest = XFS_I(inode_out);
148662306a36Sopenharmony_ci	int			ret;
148762306a36Sopenharmony_ci
148862306a36Sopenharmony_ci	/* Lock both files against IO */
148962306a36Sopenharmony_ci	ret = xfs_ilock2_io_mmap(src, dest);
149062306a36Sopenharmony_ci	if (ret)
149162306a36Sopenharmony_ci		return ret;
149262306a36Sopenharmony_ci
149362306a36Sopenharmony_ci	/* Check file eligibility and prepare for block sharing. */
149462306a36Sopenharmony_ci	ret = -EINVAL;
149562306a36Sopenharmony_ci	/* Don't reflink realtime inodes */
149662306a36Sopenharmony_ci	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
149762306a36Sopenharmony_ci		goto out_unlock;
149862306a36Sopenharmony_ci
149962306a36Sopenharmony_ci	/* Don't share DAX file data with non-DAX file. */
150062306a36Sopenharmony_ci	if (IS_DAX(inode_in) != IS_DAX(inode_out))
150162306a36Sopenharmony_ci		goto out_unlock;
150262306a36Sopenharmony_ci
150362306a36Sopenharmony_ci	if (!IS_DAX(inode_in))
150462306a36Sopenharmony_ci		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
150562306a36Sopenharmony_ci				pos_out, len, remap_flags);
150662306a36Sopenharmony_ci	else
150762306a36Sopenharmony_ci		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
150862306a36Sopenharmony_ci				pos_out, len, remap_flags, &xfs_read_iomap_ops);
150962306a36Sopenharmony_ci	if (ret || *len == 0)
151062306a36Sopenharmony_ci		goto out_unlock;
151162306a36Sopenharmony_ci
151262306a36Sopenharmony_ci	/* Attach dquots to dest inode before changing block map */
151362306a36Sopenharmony_ci	ret = xfs_qm_dqattach(dest);
151462306a36Sopenharmony_ci	if (ret)
151562306a36Sopenharmony_ci		goto out_unlock;
151662306a36Sopenharmony_ci
151762306a36Sopenharmony_ci	/*
151862306a36Sopenharmony_ci	 * Zero existing post-eof speculative preallocations in the destination
151962306a36Sopenharmony_ci	 * file.
152062306a36Sopenharmony_ci	 */
152162306a36Sopenharmony_ci	ret = xfs_reflink_zero_posteof(dest, pos_out);
152262306a36Sopenharmony_ci	if (ret)
152362306a36Sopenharmony_ci		goto out_unlock;
152462306a36Sopenharmony_ci
152562306a36Sopenharmony_ci	/* Set flags and remap blocks. */
152662306a36Sopenharmony_ci	ret = xfs_reflink_set_inode_flag(src, dest);
152762306a36Sopenharmony_ci	if (ret)
152862306a36Sopenharmony_ci		goto out_unlock;
152962306a36Sopenharmony_ci
153062306a36Sopenharmony_ci	/*
153162306a36Sopenharmony_ci	 * If pos_out > EOF, we may have dirtied blocks between EOF and
153262306a36Sopenharmony_ci	 * pos_out. In that case, we need to extend the flush and unmap to cover
153362306a36Sopenharmony_ci	 * from EOF to the end of the copy length.
153462306a36Sopenharmony_ci	 */
153562306a36Sopenharmony_ci	if (pos_out > XFS_ISIZE(dest)) {
153662306a36Sopenharmony_ci		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
153762306a36Sopenharmony_ci		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
153862306a36Sopenharmony_ci	} else {
153962306a36Sopenharmony_ci		ret = xfs_flush_unmap_range(dest, pos_out, *len);
154062306a36Sopenharmony_ci	}
154162306a36Sopenharmony_ci	if (ret)
154262306a36Sopenharmony_ci		goto out_unlock;
154362306a36Sopenharmony_ci
154462306a36Sopenharmony_ci	xfs_iflags_set(src, XFS_IREMAPPING);
154562306a36Sopenharmony_ci	if (inode_in != inode_out)
154662306a36Sopenharmony_ci		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
154762306a36Sopenharmony_ci
154862306a36Sopenharmony_ci	return 0;
154962306a36Sopenharmony_ciout_unlock:
155062306a36Sopenharmony_ci	xfs_iunlock2_io_mmap(src, dest);
155162306a36Sopenharmony_ci	return ret;
155262306a36Sopenharmony_ci}
155362306a36Sopenharmony_ci
155462306a36Sopenharmony_ci/* Does this inode need the reflink flag? */
155562306a36Sopenharmony_ciint
155662306a36Sopenharmony_cixfs_reflink_inode_has_shared_extents(
155762306a36Sopenharmony_ci	struct xfs_trans		*tp,
155862306a36Sopenharmony_ci	struct xfs_inode		*ip,
155962306a36Sopenharmony_ci	bool				*has_shared)
156062306a36Sopenharmony_ci{
156162306a36Sopenharmony_ci	struct xfs_bmbt_irec		got;
156262306a36Sopenharmony_ci	struct xfs_mount		*mp = ip->i_mount;
156362306a36Sopenharmony_ci	struct xfs_ifork		*ifp;
156462306a36Sopenharmony_ci	struct xfs_iext_cursor		icur;
156562306a36Sopenharmony_ci	bool				found;
156662306a36Sopenharmony_ci	int				error;
156762306a36Sopenharmony_ci
156862306a36Sopenharmony_ci	ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
156962306a36Sopenharmony_ci	error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
157062306a36Sopenharmony_ci	if (error)
157162306a36Sopenharmony_ci		return error;
157262306a36Sopenharmony_ci
157362306a36Sopenharmony_ci	*has_shared = false;
157462306a36Sopenharmony_ci	found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
157562306a36Sopenharmony_ci	while (found) {
157662306a36Sopenharmony_ci		struct xfs_perag	*pag;
157762306a36Sopenharmony_ci		xfs_agblock_t		agbno;
157862306a36Sopenharmony_ci		xfs_extlen_t		aglen;
157962306a36Sopenharmony_ci		xfs_agblock_t		rbno;
158062306a36Sopenharmony_ci		xfs_extlen_t		rlen;
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci		if (isnullstartblock(got.br_startblock) ||
158362306a36Sopenharmony_ci		    got.br_state != XFS_EXT_NORM)
158462306a36Sopenharmony_ci			goto next;
158562306a36Sopenharmony_ci
158662306a36Sopenharmony_ci		pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
158762306a36Sopenharmony_ci		agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
158862306a36Sopenharmony_ci		aglen = got.br_blockcount;
158962306a36Sopenharmony_ci		error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
159062306a36Sopenharmony_ci				&rbno, &rlen, false);
159162306a36Sopenharmony_ci		xfs_perag_put(pag);
159262306a36Sopenharmony_ci		if (error)
159362306a36Sopenharmony_ci			return error;
159462306a36Sopenharmony_ci
159562306a36Sopenharmony_ci		/* Is there still a shared block here? */
159662306a36Sopenharmony_ci		if (rbno != NULLAGBLOCK) {
159762306a36Sopenharmony_ci			*has_shared = true;
159862306a36Sopenharmony_ci			return 0;
159962306a36Sopenharmony_ci		}
160062306a36Sopenharmony_cinext:
160162306a36Sopenharmony_ci		found = xfs_iext_next_extent(ifp, &icur, &got);
160262306a36Sopenharmony_ci	}
160362306a36Sopenharmony_ci
160462306a36Sopenharmony_ci	return 0;
160562306a36Sopenharmony_ci}
160662306a36Sopenharmony_ci
160762306a36Sopenharmony_ci/*
160862306a36Sopenharmony_ci * Clear the inode reflink flag if there are no shared extents.
160962306a36Sopenharmony_ci *
161062306a36Sopenharmony_ci * The caller is responsible for joining the inode to the transaction passed in.
161162306a36Sopenharmony_ci * The inode will be joined to the transaction that is returned to the caller.
161262306a36Sopenharmony_ci */
161362306a36Sopenharmony_ciint
161462306a36Sopenharmony_cixfs_reflink_clear_inode_flag(
161562306a36Sopenharmony_ci	struct xfs_inode	*ip,
161662306a36Sopenharmony_ci	struct xfs_trans	**tpp)
161762306a36Sopenharmony_ci{
161862306a36Sopenharmony_ci	bool			needs_flag;
161962306a36Sopenharmony_ci	int			error = 0;
162062306a36Sopenharmony_ci
162162306a36Sopenharmony_ci	ASSERT(xfs_is_reflink_inode(ip));
162262306a36Sopenharmony_ci
162362306a36Sopenharmony_ci	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
162462306a36Sopenharmony_ci	if (error || needs_flag)
162562306a36Sopenharmony_ci		return error;
162662306a36Sopenharmony_ci
162762306a36Sopenharmony_ci	/*
162862306a36Sopenharmony_ci	 * We didn't find any shared blocks so turn off the reflink flag.
162962306a36Sopenharmony_ci	 * First, get rid of any leftover CoW mappings.
163062306a36Sopenharmony_ci	 */
163162306a36Sopenharmony_ci	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
163262306a36Sopenharmony_ci			true);
163362306a36Sopenharmony_ci	if (error)
163462306a36Sopenharmony_ci		return error;
163562306a36Sopenharmony_ci
163662306a36Sopenharmony_ci	/* Clear the inode flag. */
163762306a36Sopenharmony_ci	trace_xfs_reflink_unset_inode_flag(ip);
163862306a36Sopenharmony_ci	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
163962306a36Sopenharmony_ci	xfs_inode_clear_cowblocks_tag(ip);
164062306a36Sopenharmony_ci	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
164162306a36Sopenharmony_ci
164262306a36Sopenharmony_ci	return error;
164362306a36Sopenharmony_ci}
164462306a36Sopenharmony_ci
164562306a36Sopenharmony_ci/*
164662306a36Sopenharmony_ci * Clear the inode reflink flag if there are no shared extents and the size
164762306a36Sopenharmony_ci * hasn't changed.
164862306a36Sopenharmony_ci */
164962306a36Sopenharmony_ciSTATIC int
165062306a36Sopenharmony_cixfs_reflink_try_clear_inode_flag(
165162306a36Sopenharmony_ci	struct xfs_inode	*ip)
165262306a36Sopenharmony_ci{
165362306a36Sopenharmony_ci	struct xfs_mount	*mp = ip->i_mount;
165462306a36Sopenharmony_ci	struct xfs_trans	*tp;
165562306a36Sopenharmony_ci	int			error = 0;
165662306a36Sopenharmony_ci
165762306a36Sopenharmony_ci	/* Start a rolling transaction to remove the mappings */
165862306a36Sopenharmony_ci	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
165962306a36Sopenharmony_ci	if (error)
166062306a36Sopenharmony_ci		return error;
166162306a36Sopenharmony_ci
166262306a36Sopenharmony_ci	xfs_ilock(ip, XFS_ILOCK_EXCL);
166362306a36Sopenharmony_ci	xfs_trans_ijoin(tp, ip, 0);
166462306a36Sopenharmony_ci
166562306a36Sopenharmony_ci	error = xfs_reflink_clear_inode_flag(ip, &tp);
166662306a36Sopenharmony_ci	if (error)
166762306a36Sopenharmony_ci		goto cancel;
166862306a36Sopenharmony_ci
166962306a36Sopenharmony_ci	error = xfs_trans_commit(tp);
167062306a36Sopenharmony_ci	if (error)
167162306a36Sopenharmony_ci		goto out;
167262306a36Sopenharmony_ci
167362306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
167462306a36Sopenharmony_ci	return 0;
167562306a36Sopenharmony_cicancel:
167662306a36Sopenharmony_ci	xfs_trans_cancel(tp);
167762306a36Sopenharmony_ciout:
167862306a36Sopenharmony_ci	xfs_iunlock(ip, XFS_ILOCK_EXCL);
167962306a36Sopenharmony_ci	return error;
168062306a36Sopenharmony_ci}
168162306a36Sopenharmony_ci
168262306a36Sopenharmony_ci/*
168362306a36Sopenharmony_ci * Pre-COW all shared blocks within a given byte range of a file and turn off
168462306a36Sopenharmony_ci * the reflink flag if we unshare all of the file's blocks.
168562306a36Sopenharmony_ci */
168662306a36Sopenharmony_ciint
168762306a36Sopenharmony_cixfs_reflink_unshare(
168862306a36Sopenharmony_ci	struct xfs_inode	*ip,
168962306a36Sopenharmony_ci	xfs_off_t		offset,
169062306a36Sopenharmony_ci	xfs_off_t		len)
169162306a36Sopenharmony_ci{
169262306a36Sopenharmony_ci	struct inode		*inode = VFS_I(ip);
169362306a36Sopenharmony_ci	int			error;
169462306a36Sopenharmony_ci
169562306a36Sopenharmony_ci	if (!xfs_is_reflink_inode(ip))
169662306a36Sopenharmony_ci		return 0;
169762306a36Sopenharmony_ci
169862306a36Sopenharmony_ci	trace_xfs_reflink_unshare(ip, offset, len);
169962306a36Sopenharmony_ci
170062306a36Sopenharmony_ci	inode_dio_wait(inode);
170162306a36Sopenharmony_ci
170262306a36Sopenharmony_ci	if (IS_DAX(inode))
170362306a36Sopenharmony_ci		error = dax_file_unshare(inode, offset, len,
170462306a36Sopenharmony_ci				&xfs_dax_write_iomap_ops);
170562306a36Sopenharmony_ci	else
170662306a36Sopenharmony_ci		error = iomap_file_unshare(inode, offset, len,
170762306a36Sopenharmony_ci				&xfs_buffered_write_iomap_ops);
170862306a36Sopenharmony_ci	if (error)
170962306a36Sopenharmony_ci		goto out;
171062306a36Sopenharmony_ci
171162306a36Sopenharmony_ci	error = filemap_write_and_wait_range(inode->i_mapping, offset,
171262306a36Sopenharmony_ci			offset + len - 1);
171362306a36Sopenharmony_ci	if (error)
171462306a36Sopenharmony_ci		goto out;
171562306a36Sopenharmony_ci
171662306a36Sopenharmony_ci	/* Turn off the reflink flag if possible. */
171762306a36Sopenharmony_ci	error = xfs_reflink_try_clear_inode_flag(ip);
171862306a36Sopenharmony_ci	if (error)
171962306a36Sopenharmony_ci		goto out;
172062306a36Sopenharmony_ci	return 0;
172162306a36Sopenharmony_ci
172262306a36Sopenharmony_ciout:
172362306a36Sopenharmony_ci	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
172462306a36Sopenharmony_ci	return error;
172562306a36Sopenharmony_ci}
1726