162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * move_extents.c
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 2011 Oracle.  All rights reserved.
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci#include <linux/fs.h>
862306a36Sopenharmony_ci#include <linux/types.h>
962306a36Sopenharmony_ci#include <linux/mount.h>
1062306a36Sopenharmony_ci#include <linux/swap.h>
1162306a36Sopenharmony_ci
1262306a36Sopenharmony_ci#include <cluster/masklog.h>
1362306a36Sopenharmony_ci
1462306a36Sopenharmony_ci#include "ocfs2.h"
1562306a36Sopenharmony_ci#include "ocfs2_ioctl.h"
1662306a36Sopenharmony_ci
1762306a36Sopenharmony_ci#include "alloc.h"
1862306a36Sopenharmony_ci#include "localalloc.h"
1962306a36Sopenharmony_ci#include "aops.h"
2062306a36Sopenharmony_ci#include "dlmglue.h"
2162306a36Sopenharmony_ci#include "extent_map.h"
2262306a36Sopenharmony_ci#include "inode.h"
2362306a36Sopenharmony_ci#include "journal.h"
2462306a36Sopenharmony_ci#include "suballoc.h"
2562306a36Sopenharmony_ci#include "uptodate.h"
2662306a36Sopenharmony_ci#include "super.h"
2762306a36Sopenharmony_ci#include "dir.h"
2862306a36Sopenharmony_ci#include "buffer_head_io.h"
2962306a36Sopenharmony_ci#include "sysfile.h"
3062306a36Sopenharmony_ci#include "refcounttree.h"
3162306a36Sopenharmony_ci#include "move_extents.h"
3262306a36Sopenharmony_ci
3362306a36Sopenharmony_cistruct ocfs2_move_extents_context {
3462306a36Sopenharmony_ci	struct inode *inode;
3562306a36Sopenharmony_ci	struct file *file;
3662306a36Sopenharmony_ci	int auto_defrag;
3762306a36Sopenharmony_ci	int partial;
3862306a36Sopenharmony_ci	int credits;
3962306a36Sopenharmony_ci	u32 new_phys_cpos;
4062306a36Sopenharmony_ci	u32 clusters_moved;
4162306a36Sopenharmony_ci	u64 refcount_loc;
4262306a36Sopenharmony_ci	struct ocfs2_move_extents *range;
4362306a36Sopenharmony_ci	struct ocfs2_extent_tree et;
4462306a36Sopenharmony_ci	struct ocfs2_alloc_context *meta_ac;
4562306a36Sopenharmony_ci	struct ocfs2_alloc_context *data_ac;
4662306a36Sopenharmony_ci	struct ocfs2_cached_dealloc_ctxt dealloc;
4762306a36Sopenharmony_ci};
4862306a36Sopenharmony_ci
4962306a36Sopenharmony_cistatic int __ocfs2_move_extent(handle_t *handle,
5062306a36Sopenharmony_ci			       struct ocfs2_move_extents_context *context,
5162306a36Sopenharmony_ci			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
5262306a36Sopenharmony_ci			       int ext_flags)
5362306a36Sopenharmony_ci{
5462306a36Sopenharmony_ci	int ret = 0, index;
5562306a36Sopenharmony_ci	struct inode *inode = context->inode;
5662306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5762306a36Sopenharmony_ci	struct ocfs2_extent_rec *rec, replace_rec;
5862306a36Sopenharmony_ci	struct ocfs2_path *path = NULL;
5962306a36Sopenharmony_ci	struct ocfs2_extent_list *el;
6062306a36Sopenharmony_ci	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
6162306a36Sopenharmony_ci	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci	ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
6462306a36Sopenharmony_ci					       p_cpos, new_p_cpos, len);
6562306a36Sopenharmony_ci	if (ret) {
6662306a36Sopenharmony_ci		mlog_errno(ret);
6762306a36Sopenharmony_ci		goto out;
6862306a36Sopenharmony_ci	}
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci	memset(&replace_rec, 0, sizeof(replace_rec));
7162306a36Sopenharmony_ci	replace_rec.e_cpos = cpu_to_le32(cpos);
7262306a36Sopenharmony_ci	replace_rec.e_leaf_clusters = cpu_to_le16(len);
7362306a36Sopenharmony_ci	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
7462306a36Sopenharmony_ci								   new_p_cpos));
7562306a36Sopenharmony_ci
7662306a36Sopenharmony_ci	path = ocfs2_new_path_from_et(&context->et);
7762306a36Sopenharmony_ci	if (!path) {
7862306a36Sopenharmony_ci		ret = -ENOMEM;
7962306a36Sopenharmony_ci		mlog_errno(ret);
8062306a36Sopenharmony_ci		goto out;
8162306a36Sopenharmony_ci	}
8262306a36Sopenharmony_ci
8362306a36Sopenharmony_ci	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
8462306a36Sopenharmony_ci	if (ret) {
8562306a36Sopenharmony_ci		mlog_errno(ret);
8662306a36Sopenharmony_ci		goto out;
8762306a36Sopenharmony_ci	}
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci	el = path_leaf_el(path);
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci	index = ocfs2_search_extent_list(el, cpos);
9262306a36Sopenharmony_ci	if (index == -1) {
9362306a36Sopenharmony_ci		ret = ocfs2_error(inode->i_sb,
9462306a36Sopenharmony_ci				  "Inode %llu has an extent at cpos %u which can no longer be found\n",
9562306a36Sopenharmony_ci				  (unsigned long long)ino, cpos);
9662306a36Sopenharmony_ci		goto out;
9762306a36Sopenharmony_ci	}
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci	rec = &el->l_recs[index];
10062306a36Sopenharmony_ci
10162306a36Sopenharmony_ci	BUG_ON(ext_flags != rec->e_flags);
10262306a36Sopenharmony_ci	/*
10362306a36Sopenharmony_ci	 * after moving/defraging to new location, the extent is not going
10462306a36Sopenharmony_ci	 * to be refcounted anymore.
10562306a36Sopenharmony_ci	 */
10662306a36Sopenharmony_ci	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
10762306a36Sopenharmony_ci
10862306a36Sopenharmony_ci	ret = ocfs2_split_extent(handle, &context->et, path, index,
10962306a36Sopenharmony_ci				 &replace_rec, context->meta_ac,
11062306a36Sopenharmony_ci				 &context->dealloc);
11162306a36Sopenharmony_ci	if (ret) {
11262306a36Sopenharmony_ci		mlog_errno(ret);
11362306a36Sopenharmony_ci		goto out;
11462306a36Sopenharmony_ci	}
11562306a36Sopenharmony_ci
11662306a36Sopenharmony_ci	context->new_phys_cpos = new_p_cpos;
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci	/*
11962306a36Sopenharmony_ci	 * need I to append truncate log for old clusters?
12062306a36Sopenharmony_ci	 */
12162306a36Sopenharmony_ci	if (old_blkno) {
12262306a36Sopenharmony_ci		if (ext_flags & OCFS2_EXT_REFCOUNTED)
12362306a36Sopenharmony_ci			ret = ocfs2_decrease_refcount(inode, handle,
12462306a36Sopenharmony_ci					ocfs2_blocks_to_clusters(osb->sb,
12562306a36Sopenharmony_ci								 old_blkno),
12662306a36Sopenharmony_ci					len, context->meta_ac,
12762306a36Sopenharmony_ci					&context->dealloc, 1);
12862306a36Sopenharmony_ci		else
12962306a36Sopenharmony_ci			ret = ocfs2_truncate_log_append(osb, handle,
13062306a36Sopenharmony_ci							old_blkno, len);
13162306a36Sopenharmony_ci	}
13262306a36Sopenharmony_ci
13362306a36Sopenharmony_ci	ocfs2_update_inode_fsync_trans(handle, inode, 0);
13462306a36Sopenharmony_ciout:
13562306a36Sopenharmony_ci	ocfs2_free_path(path);
13662306a36Sopenharmony_ci	return ret;
13762306a36Sopenharmony_ci}
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci/*
14062306a36Sopenharmony_ci * lock allocator, and reserve appropriate number of bits for
14162306a36Sopenharmony_ci * meta blocks.
14262306a36Sopenharmony_ci */
14362306a36Sopenharmony_cistatic int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
14462306a36Sopenharmony_ci					struct ocfs2_extent_tree *et,
14562306a36Sopenharmony_ci					u32 clusters_to_move,
14662306a36Sopenharmony_ci					u32 extents_to_split,
14762306a36Sopenharmony_ci					struct ocfs2_alloc_context **meta_ac,
14862306a36Sopenharmony_ci					int extra_blocks,
14962306a36Sopenharmony_ci					int *credits)
15062306a36Sopenharmony_ci{
15162306a36Sopenharmony_ci	int ret, num_free_extents;
15262306a36Sopenharmony_ci	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
15362306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
15462306a36Sopenharmony_ci
15562306a36Sopenharmony_ci	num_free_extents = ocfs2_num_free_extents(et);
15662306a36Sopenharmony_ci	if (num_free_extents < 0) {
15762306a36Sopenharmony_ci		ret = num_free_extents;
15862306a36Sopenharmony_ci		mlog_errno(ret);
15962306a36Sopenharmony_ci		goto out;
16062306a36Sopenharmony_ci	}
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_ci	if (!num_free_extents ||
16362306a36Sopenharmony_ci	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
16462306a36Sopenharmony_ci		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
16562306a36Sopenharmony_ci
16662306a36Sopenharmony_ci	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
16762306a36Sopenharmony_ci	if (ret) {
16862306a36Sopenharmony_ci		mlog_errno(ret);
16962306a36Sopenharmony_ci		goto out;
17062306a36Sopenharmony_ci	}
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_ci	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
17462306a36Sopenharmony_ci
17562306a36Sopenharmony_ci	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
17662306a36Sopenharmony_ci	     extra_blocks, clusters_to_move, *credits);
17762306a36Sopenharmony_ciout:
17862306a36Sopenharmony_ci	if (ret) {
17962306a36Sopenharmony_ci		if (*meta_ac) {
18062306a36Sopenharmony_ci			ocfs2_free_alloc_context(*meta_ac);
18162306a36Sopenharmony_ci			*meta_ac = NULL;
18262306a36Sopenharmony_ci		}
18362306a36Sopenharmony_ci	}
18462306a36Sopenharmony_ci
18562306a36Sopenharmony_ci	return ret;
18662306a36Sopenharmony_ci}
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci/*
18962306a36Sopenharmony_ci * Using one journal handle to guarantee the data consistency in case
19062306a36Sopenharmony_ci * crash happens anywhere.
19162306a36Sopenharmony_ci *
19262306a36Sopenharmony_ci *  XXX: defrag can end up with finishing partial extent as requested,
19362306a36Sopenharmony_ci * due to not enough contiguous clusters can be found in allocator.
19462306a36Sopenharmony_ci */
19562306a36Sopenharmony_cistatic int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
19662306a36Sopenharmony_ci			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
19762306a36Sopenharmony_ci{
19862306a36Sopenharmony_ci	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
19962306a36Sopenharmony_ci	handle_t *handle;
20062306a36Sopenharmony_ci	struct inode *inode = context->inode;
20162306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
20262306a36Sopenharmony_ci	struct inode *tl_inode = osb->osb_tl_inode;
20362306a36Sopenharmony_ci	struct ocfs2_refcount_tree *ref_tree = NULL;
20462306a36Sopenharmony_ci	u32 new_phys_cpos, new_len;
20562306a36Sopenharmony_ci	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
20662306a36Sopenharmony_ci	int need_free = 0;
20762306a36Sopenharmony_ci
20862306a36Sopenharmony_ci	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
20962306a36Sopenharmony_ci		BUG_ON(!ocfs2_is_refcount_inode(inode));
21062306a36Sopenharmony_ci		BUG_ON(!context->refcount_loc);
21162306a36Sopenharmony_ci
21262306a36Sopenharmony_ci		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
21362306a36Sopenharmony_ci					       &ref_tree, NULL);
21462306a36Sopenharmony_ci		if (ret) {
21562306a36Sopenharmony_ci			mlog_errno(ret);
21662306a36Sopenharmony_ci			return ret;
21762306a36Sopenharmony_ci		}
21862306a36Sopenharmony_ci
21962306a36Sopenharmony_ci		ret = ocfs2_prepare_refcount_change_for_del(inode,
22062306a36Sopenharmony_ci							context->refcount_loc,
22162306a36Sopenharmony_ci							phys_blkno,
22262306a36Sopenharmony_ci							*len,
22362306a36Sopenharmony_ci							&credits,
22462306a36Sopenharmony_ci							&extra_blocks);
22562306a36Sopenharmony_ci		if (ret) {
22662306a36Sopenharmony_ci			mlog_errno(ret);
22762306a36Sopenharmony_ci			goto out;
22862306a36Sopenharmony_ci		}
22962306a36Sopenharmony_ci	}
23062306a36Sopenharmony_ci
23162306a36Sopenharmony_ci	ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
23262306a36Sopenharmony_ci						*len, 1,
23362306a36Sopenharmony_ci						&context->meta_ac,
23462306a36Sopenharmony_ci						extra_blocks, &credits);
23562306a36Sopenharmony_ci	if (ret) {
23662306a36Sopenharmony_ci		mlog_errno(ret);
23762306a36Sopenharmony_ci		goto out;
23862306a36Sopenharmony_ci	}
23962306a36Sopenharmony_ci
24062306a36Sopenharmony_ci	/*
24162306a36Sopenharmony_ci	 * should be using allocation reservation strategy there?
24262306a36Sopenharmony_ci	 *
24362306a36Sopenharmony_ci	 * if (context->data_ac)
24462306a36Sopenharmony_ci	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
24562306a36Sopenharmony_ci	 */
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	inode_lock(tl_inode);
24862306a36Sopenharmony_ci
24962306a36Sopenharmony_ci	if (ocfs2_truncate_log_needs_flush(osb)) {
25062306a36Sopenharmony_ci		ret = __ocfs2_flush_truncate_log(osb);
25162306a36Sopenharmony_ci		if (ret < 0) {
25262306a36Sopenharmony_ci			mlog_errno(ret);
25362306a36Sopenharmony_ci			goto out_unlock_mutex;
25462306a36Sopenharmony_ci		}
25562306a36Sopenharmony_ci	}
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	/*
25862306a36Sopenharmony_ci	 * Make sure ocfs2_reserve_cluster is called after
25962306a36Sopenharmony_ci	 * __ocfs2_flush_truncate_log, otherwise, dead lock may happen.
26062306a36Sopenharmony_ci	 *
26162306a36Sopenharmony_ci	 * If ocfs2_reserve_cluster is called
26262306a36Sopenharmony_ci	 * before __ocfs2_flush_truncate_log, dead lock on global bitmap
26362306a36Sopenharmony_ci	 * may happen.
26462306a36Sopenharmony_ci	 *
26562306a36Sopenharmony_ci	 */
26662306a36Sopenharmony_ci	ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
26762306a36Sopenharmony_ci	if (ret) {
26862306a36Sopenharmony_ci		mlog_errno(ret);
26962306a36Sopenharmony_ci		goto out_unlock_mutex;
27062306a36Sopenharmony_ci	}
27162306a36Sopenharmony_ci
27262306a36Sopenharmony_ci	handle = ocfs2_start_trans(osb, credits);
27362306a36Sopenharmony_ci	if (IS_ERR(handle)) {
27462306a36Sopenharmony_ci		ret = PTR_ERR(handle);
27562306a36Sopenharmony_ci		mlog_errno(ret);
27662306a36Sopenharmony_ci		goto out_unlock_mutex;
27762306a36Sopenharmony_ci	}
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
28062306a36Sopenharmony_ci				     &new_phys_cpos, &new_len);
28162306a36Sopenharmony_ci	if (ret) {
28262306a36Sopenharmony_ci		mlog_errno(ret);
28362306a36Sopenharmony_ci		goto out_commit;
28462306a36Sopenharmony_ci	}
28562306a36Sopenharmony_ci
28662306a36Sopenharmony_ci	/*
28762306a36Sopenharmony_ci	 * allowing partial extent moving is kind of 'pros and cons', it makes
28862306a36Sopenharmony_ci	 * whole defragmentation less likely to fail, on the contrary, the bad
28962306a36Sopenharmony_ci	 * thing is it may make the fs even more fragmented after moving, let
29062306a36Sopenharmony_ci	 * userspace make a good decision here.
29162306a36Sopenharmony_ci	 */
29262306a36Sopenharmony_ci	if (new_len != *len) {
29362306a36Sopenharmony_ci		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
29462306a36Sopenharmony_ci		if (!partial) {
29562306a36Sopenharmony_ci			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
29662306a36Sopenharmony_ci			ret = -ENOSPC;
29762306a36Sopenharmony_ci			need_free = 1;
29862306a36Sopenharmony_ci			goto out_commit;
29962306a36Sopenharmony_ci		}
30062306a36Sopenharmony_ci	}
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_ci	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
30362306a36Sopenharmony_ci	     phys_cpos, new_phys_cpos);
30462306a36Sopenharmony_ci
30562306a36Sopenharmony_ci	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
30662306a36Sopenharmony_ci				  new_phys_cpos, ext_flags);
30762306a36Sopenharmony_ci	if (ret)
30862306a36Sopenharmony_ci		mlog_errno(ret);
30962306a36Sopenharmony_ci
31062306a36Sopenharmony_ci	if (partial && (new_len != *len))
31162306a36Sopenharmony_ci		*len = new_len;
31262306a36Sopenharmony_ci
31362306a36Sopenharmony_ci	/*
31462306a36Sopenharmony_ci	 * Here we should write the new page out first if we are
31562306a36Sopenharmony_ci	 * in write-back mode.
31662306a36Sopenharmony_ci	 */
31762306a36Sopenharmony_ci	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
31862306a36Sopenharmony_ci	if (ret)
31962306a36Sopenharmony_ci		mlog_errno(ret);
32062306a36Sopenharmony_ci
32162306a36Sopenharmony_ciout_commit:
32262306a36Sopenharmony_ci	if (need_free && context->data_ac) {
32362306a36Sopenharmony_ci		struct ocfs2_alloc_context *data_ac = context->data_ac;
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_ci		if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
32662306a36Sopenharmony_ci			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
32762306a36Sopenharmony_ci					new_phys_cpos, new_len);
32862306a36Sopenharmony_ci		else
32962306a36Sopenharmony_ci			ocfs2_free_clusters(handle,
33062306a36Sopenharmony_ci					data_ac->ac_inode,
33162306a36Sopenharmony_ci					data_ac->ac_bh,
33262306a36Sopenharmony_ci					ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
33362306a36Sopenharmony_ci					new_len);
33462306a36Sopenharmony_ci	}
33562306a36Sopenharmony_ci
33662306a36Sopenharmony_ci	ocfs2_commit_trans(osb, handle);
33762306a36Sopenharmony_ci
33862306a36Sopenharmony_ciout_unlock_mutex:
33962306a36Sopenharmony_ci	inode_unlock(tl_inode);
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	if (context->data_ac) {
34262306a36Sopenharmony_ci		ocfs2_free_alloc_context(context->data_ac);
34362306a36Sopenharmony_ci		context->data_ac = NULL;
34462306a36Sopenharmony_ci	}
34562306a36Sopenharmony_ci
34662306a36Sopenharmony_ci	if (context->meta_ac) {
34762306a36Sopenharmony_ci		ocfs2_free_alloc_context(context->meta_ac);
34862306a36Sopenharmony_ci		context->meta_ac = NULL;
34962306a36Sopenharmony_ci	}
35062306a36Sopenharmony_ci
35162306a36Sopenharmony_ciout:
35262306a36Sopenharmony_ci	if (ref_tree)
35362306a36Sopenharmony_ci		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
35462306a36Sopenharmony_ci
35562306a36Sopenharmony_ci	return ret;
35662306a36Sopenharmony_ci}
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci/*
35962306a36Sopenharmony_ci * find the victim alloc group, where #blkno fits.
36062306a36Sopenharmony_ci */
36162306a36Sopenharmony_cistatic int ocfs2_find_victim_alloc_group(struct inode *inode,
36262306a36Sopenharmony_ci					 u64 vict_blkno,
36362306a36Sopenharmony_ci					 int type, int slot,
36462306a36Sopenharmony_ci					 int *vict_bit,
36562306a36Sopenharmony_ci					 struct buffer_head **ret_bh)
36662306a36Sopenharmony_ci{
36762306a36Sopenharmony_ci	int ret, i, bits_per_unit = 0;
36862306a36Sopenharmony_ci	u64 blkno;
36962306a36Sopenharmony_ci	char namebuf[40];
37062306a36Sopenharmony_ci
37162306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
37262306a36Sopenharmony_ci	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
37362306a36Sopenharmony_ci	struct ocfs2_chain_list *cl;
37462306a36Sopenharmony_ci	struct ocfs2_chain_rec *rec;
37562306a36Sopenharmony_ci	struct ocfs2_dinode *ac_dinode;
37662306a36Sopenharmony_ci	struct ocfs2_group_desc *bg;
37762306a36Sopenharmony_ci
37862306a36Sopenharmony_ci	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
37962306a36Sopenharmony_ci	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
38062306a36Sopenharmony_ci					 strlen(namebuf), &blkno);
38162306a36Sopenharmony_ci	if (ret) {
38262306a36Sopenharmony_ci		ret = -ENOENT;
38362306a36Sopenharmony_ci		goto out;
38462306a36Sopenharmony_ci	}
38562306a36Sopenharmony_ci
38662306a36Sopenharmony_ci	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
38762306a36Sopenharmony_ci	if (ret) {
38862306a36Sopenharmony_ci		mlog_errno(ret);
38962306a36Sopenharmony_ci		goto out;
39062306a36Sopenharmony_ci	}
39162306a36Sopenharmony_ci
39262306a36Sopenharmony_ci	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
39362306a36Sopenharmony_ci	cl = &(ac_dinode->id2.i_chain);
39462306a36Sopenharmony_ci	rec = &(cl->cl_recs[0]);
39562306a36Sopenharmony_ci
39662306a36Sopenharmony_ci	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
39762306a36Sopenharmony_ci		bits_per_unit = osb->s_clustersize_bits -
39862306a36Sopenharmony_ci					inode->i_sb->s_blocksize_bits;
39962306a36Sopenharmony_ci	/*
40062306a36Sopenharmony_ci	 * 'vict_blkno' was out of the valid range.
40162306a36Sopenharmony_ci	 */
40262306a36Sopenharmony_ci	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
40362306a36Sopenharmony_ci	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
40462306a36Sopenharmony_ci				bits_per_unit))) {
40562306a36Sopenharmony_ci		ret = -EINVAL;
40662306a36Sopenharmony_ci		goto out;
40762306a36Sopenharmony_ci	}
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
41062306a36Sopenharmony_ci
41162306a36Sopenharmony_ci		rec = &(cl->cl_recs[i]);
41262306a36Sopenharmony_ci		if (!rec)
41362306a36Sopenharmony_ci			continue;
41462306a36Sopenharmony_ci
41562306a36Sopenharmony_ci		bg = NULL;
41662306a36Sopenharmony_ci
41762306a36Sopenharmony_ci		do {
41862306a36Sopenharmony_ci			if (!bg)
41962306a36Sopenharmony_ci				blkno = le64_to_cpu(rec->c_blkno);
42062306a36Sopenharmony_ci			else
42162306a36Sopenharmony_ci				blkno = le64_to_cpu(bg->bg_next_group);
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci			if (gd_bh) {
42462306a36Sopenharmony_ci				brelse(gd_bh);
42562306a36Sopenharmony_ci				gd_bh = NULL;
42662306a36Sopenharmony_ci			}
42762306a36Sopenharmony_ci
42862306a36Sopenharmony_ci			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
42962306a36Sopenharmony_ci			if (ret) {
43062306a36Sopenharmony_ci				mlog_errno(ret);
43162306a36Sopenharmony_ci				goto out;
43262306a36Sopenharmony_ci			}
43362306a36Sopenharmony_ci
43462306a36Sopenharmony_ci			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
43562306a36Sopenharmony_ci
43662306a36Sopenharmony_ci			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
43762306a36Sopenharmony_ci						(le16_to_cpu(bg->bg_bits) << bits_per_unit))) {
43862306a36Sopenharmony_ci
43962306a36Sopenharmony_ci				*ret_bh = gd_bh;
44062306a36Sopenharmony_ci				*vict_bit = (vict_blkno - blkno) >>
44162306a36Sopenharmony_ci							bits_per_unit;
44262306a36Sopenharmony_ci				mlog(0, "find the victim group: #%llu, "
44362306a36Sopenharmony_ci				     "total_bits: %u, vict_bit: %u\n",
44462306a36Sopenharmony_ci				     blkno, le16_to_cpu(bg->bg_bits),
44562306a36Sopenharmony_ci				     *vict_bit);
44662306a36Sopenharmony_ci				goto out;
44762306a36Sopenharmony_ci			}
44862306a36Sopenharmony_ci
44962306a36Sopenharmony_ci		} while (le64_to_cpu(bg->bg_next_group));
45062306a36Sopenharmony_ci	}
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci	ret = -EINVAL;
45362306a36Sopenharmony_ciout:
45462306a36Sopenharmony_ci	brelse(ac_bh);
45562306a36Sopenharmony_ci
45662306a36Sopenharmony_ci	/*
45762306a36Sopenharmony_ci	 * caller has to release the gd_bh properly.
45862306a36Sopenharmony_ci	 */
45962306a36Sopenharmony_ci	return ret;
46062306a36Sopenharmony_ci}
46162306a36Sopenharmony_ci
46262306a36Sopenharmony_ci/*
46362306a36Sopenharmony_ci * XXX: helper to validate and adjust moving goal.
46462306a36Sopenharmony_ci */
46562306a36Sopenharmony_cistatic int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
46662306a36Sopenharmony_ci					       struct ocfs2_move_extents *range)
46762306a36Sopenharmony_ci{
46862306a36Sopenharmony_ci	int ret, goal_bit = 0;
46962306a36Sopenharmony_ci
47062306a36Sopenharmony_ci	struct buffer_head *gd_bh = NULL;
47162306a36Sopenharmony_ci	struct ocfs2_group_desc *bg;
47262306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
47362306a36Sopenharmony_ci	int c_to_b = 1 << (osb->s_clustersize_bits -
47462306a36Sopenharmony_ci					inode->i_sb->s_blocksize_bits);
47562306a36Sopenharmony_ci
47662306a36Sopenharmony_ci	/*
47762306a36Sopenharmony_ci	 * make goal become cluster aligned.
47862306a36Sopenharmony_ci	 */
47962306a36Sopenharmony_ci	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
48062306a36Sopenharmony_ci						      range->me_goal);
48162306a36Sopenharmony_ci	/*
48262306a36Sopenharmony_ci	 * validate goal sits within global_bitmap, and return the victim
48362306a36Sopenharmony_ci	 * group desc
48462306a36Sopenharmony_ci	 */
48562306a36Sopenharmony_ci	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
48662306a36Sopenharmony_ci					    GLOBAL_BITMAP_SYSTEM_INODE,
48762306a36Sopenharmony_ci					    OCFS2_INVALID_SLOT,
48862306a36Sopenharmony_ci					    &goal_bit, &gd_bh);
48962306a36Sopenharmony_ci	if (ret)
49062306a36Sopenharmony_ci		goto out;
49162306a36Sopenharmony_ci
49262306a36Sopenharmony_ci	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
49362306a36Sopenharmony_ci
49462306a36Sopenharmony_ci	/*
49562306a36Sopenharmony_ci	 * moving goal is not allowd to start with a group desc blok(#0 blk)
49662306a36Sopenharmony_ci	 * let's compromise to the latter cluster.
49762306a36Sopenharmony_ci	 */
49862306a36Sopenharmony_ci	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
49962306a36Sopenharmony_ci		range->me_goal += c_to_b;
50062306a36Sopenharmony_ci
50162306a36Sopenharmony_ci	/*
50262306a36Sopenharmony_ci	 * movement is not gonna cross two groups.
50362306a36Sopenharmony_ci	 */
50462306a36Sopenharmony_ci	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
50562306a36Sopenharmony_ci								range->me_len) {
50662306a36Sopenharmony_ci		ret = -EINVAL;
50762306a36Sopenharmony_ci		goto out;
50862306a36Sopenharmony_ci	}
50962306a36Sopenharmony_ci	/*
51062306a36Sopenharmony_ci	 * more exact validations/adjustments will be performed later during
51162306a36Sopenharmony_ci	 * moving operation for each extent range.
51262306a36Sopenharmony_ci	 */
51362306a36Sopenharmony_ci	mlog(0, "extents get ready to be moved to #%llu block\n",
51462306a36Sopenharmony_ci	     range->me_goal);
51562306a36Sopenharmony_ci
51662306a36Sopenharmony_ciout:
51762306a36Sopenharmony_ci	brelse(gd_bh);
51862306a36Sopenharmony_ci
51962306a36Sopenharmony_ci	return ret;
52062306a36Sopenharmony_ci}
52162306a36Sopenharmony_ci
52262306a36Sopenharmony_cistatic void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
52362306a36Sopenharmony_ci				    int *goal_bit, u32 move_len, u32 max_hop,
52462306a36Sopenharmony_ci				    u32 *phys_cpos)
52562306a36Sopenharmony_ci{
52662306a36Sopenharmony_ci	int i, used, last_free_bits = 0, base_bit = *goal_bit;
52762306a36Sopenharmony_ci	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
52862306a36Sopenharmony_ci	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
52962306a36Sopenharmony_ci						 le64_to_cpu(gd->bg_blkno));
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ci		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
53462306a36Sopenharmony_ci		if (used) {
53562306a36Sopenharmony_ci			/*
53662306a36Sopenharmony_ci			 * we even tried searching the free chunk by jumping
53762306a36Sopenharmony_ci			 * a 'max_hop' distance, but still failed.
53862306a36Sopenharmony_ci			 */
53962306a36Sopenharmony_ci			if ((i - base_bit) > max_hop) {
54062306a36Sopenharmony_ci				*phys_cpos = 0;
54162306a36Sopenharmony_ci				break;
54262306a36Sopenharmony_ci			}
54362306a36Sopenharmony_ci
54462306a36Sopenharmony_ci			if (last_free_bits)
54562306a36Sopenharmony_ci				last_free_bits = 0;
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_ci			continue;
54862306a36Sopenharmony_ci		} else
54962306a36Sopenharmony_ci			last_free_bits++;
55062306a36Sopenharmony_ci
55162306a36Sopenharmony_ci		if (last_free_bits == move_len) {
55262306a36Sopenharmony_ci			i -= move_len;
55362306a36Sopenharmony_ci			*goal_bit = i;
55462306a36Sopenharmony_ci			*phys_cpos = base_cpos + i;
55562306a36Sopenharmony_ci			break;
55662306a36Sopenharmony_ci		}
55762306a36Sopenharmony_ci	}
55862306a36Sopenharmony_ci
55962306a36Sopenharmony_ci	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
56062306a36Sopenharmony_ci}
56162306a36Sopenharmony_ci
56262306a36Sopenharmony_cistatic int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
56362306a36Sopenharmony_ci			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
56462306a36Sopenharmony_ci			     u32 len, int ext_flags)
56562306a36Sopenharmony_ci{
56662306a36Sopenharmony_ci	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
56762306a36Sopenharmony_ci	handle_t *handle;
56862306a36Sopenharmony_ci	struct inode *inode = context->inode;
56962306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
57062306a36Sopenharmony_ci	struct inode *tl_inode = osb->osb_tl_inode;
57162306a36Sopenharmony_ci	struct inode *gb_inode = NULL;
57262306a36Sopenharmony_ci	struct buffer_head *gb_bh = NULL;
57362306a36Sopenharmony_ci	struct buffer_head *gd_bh = NULL;
57462306a36Sopenharmony_ci	struct ocfs2_group_desc *gd;
57562306a36Sopenharmony_ci	struct ocfs2_refcount_tree *ref_tree = NULL;
57662306a36Sopenharmony_ci	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
57762306a36Sopenharmony_ci						    context->range->me_threshold);
57862306a36Sopenharmony_ci	u64 phys_blkno, new_phys_blkno;
57962306a36Sopenharmony_ci
58062306a36Sopenharmony_ci	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
58162306a36Sopenharmony_ci
58262306a36Sopenharmony_ci	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
58362306a36Sopenharmony_ci		BUG_ON(!ocfs2_is_refcount_inode(inode));
58462306a36Sopenharmony_ci		BUG_ON(!context->refcount_loc);
58562306a36Sopenharmony_ci
58662306a36Sopenharmony_ci		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
58762306a36Sopenharmony_ci					       &ref_tree, NULL);
58862306a36Sopenharmony_ci		if (ret) {
58962306a36Sopenharmony_ci			mlog_errno(ret);
59062306a36Sopenharmony_ci			return ret;
59162306a36Sopenharmony_ci		}
59262306a36Sopenharmony_ci
59362306a36Sopenharmony_ci		ret = ocfs2_prepare_refcount_change_for_del(inode,
59462306a36Sopenharmony_ci							context->refcount_loc,
59562306a36Sopenharmony_ci							phys_blkno,
59662306a36Sopenharmony_ci							len,
59762306a36Sopenharmony_ci							&credits,
59862306a36Sopenharmony_ci							&extra_blocks);
59962306a36Sopenharmony_ci		if (ret) {
60062306a36Sopenharmony_ci			mlog_errno(ret);
60162306a36Sopenharmony_ci			goto out;
60262306a36Sopenharmony_ci		}
60362306a36Sopenharmony_ci	}
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_ci	ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
60662306a36Sopenharmony_ci						len, 1,
60762306a36Sopenharmony_ci						&context->meta_ac,
60862306a36Sopenharmony_ci						extra_blocks, &credits);
60962306a36Sopenharmony_ci	if (ret) {
61062306a36Sopenharmony_ci		mlog_errno(ret);
61162306a36Sopenharmony_ci		goto out;
61262306a36Sopenharmony_ci	}
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci	/*
61562306a36Sopenharmony_ci	 * need to count 2 extra credits for global_bitmap inode and
61662306a36Sopenharmony_ci	 * group descriptor.
61762306a36Sopenharmony_ci	 */
61862306a36Sopenharmony_ci	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
61962306a36Sopenharmony_ci
62062306a36Sopenharmony_ci	/*
62162306a36Sopenharmony_ci	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
62262306a36Sopenharmony_ci	 * logic, while we still need to lock the global_bitmap.
62362306a36Sopenharmony_ci	 */
62462306a36Sopenharmony_ci	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
62562306a36Sopenharmony_ci					       OCFS2_INVALID_SLOT);
62662306a36Sopenharmony_ci	if (!gb_inode) {
62762306a36Sopenharmony_ci		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
62862306a36Sopenharmony_ci		ret = -EIO;
62962306a36Sopenharmony_ci		goto out;
63062306a36Sopenharmony_ci	}
63162306a36Sopenharmony_ci
63262306a36Sopenharmony_ci	inode_lock(gb_inode);
63362306a36Sopenharmony_ci
63462306a36Sopenharmony_ci	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
63562306a36Sopenharmony_ci	if (ret) {
63662306a36Sopenharmony_ci		mlog_errno(ret);
63762306a36Sopenharmony_ci		goto out_unlock_gb_mutex;
63862306a36Sopenharmony_ci	}
63962306a36Sopenharmony_ci
64062306a36Sopenharmony_ci	inode_lock(tl_inode);
64162306a36Sopenharmony_ci
64262306a36Sopenharmony_ci	handle = ocfs2_start_trans(osb, credits);
64362306a36Sopenharmony_ci	if (IS_ERR(handle)) {
64462306a36Sopenharmony_ci		ret = PTR_ERR(handle);
64562306a36Sopenharmony_ci		mlog_errno(ret);
64662306a36Sopenharmony_ci		goto out_unlock_tl_inode;
64762306a36Sopenharmony_ci	}
64862306a36Sopenharmony_ci
64962306a36Sopenharmony_ci	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
65062306a36Sopenharmony_ci	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
65162306a36Sopenharmony_ci					    GLOBAL_BITMAP_SYSTEM_INODE,
65262306a36Sopenharmony_ci					    OCFS2_INVALID_SLOT,
65362306a36Sopenharmony_ci					    &goal_bit, &gd_bh);
65462306a36Sopenharmony_ci	if (ret) {
65562306a36Sopenharmony_ci		mlog_errno(ret);
65662306a36Sopenharmony_ci		goto out_commit;
65762306a36Sopenharmony_ci	}
65862306a36Sopenharmony_ci
	/*
	 * probe the victim cluster group to find a proper
	 * region to fit the wanted movement; it will even perform
	 * a best-effort attempt by compromising to a threshold
	 * around the goal.
	 */
66562306a36Sopenharmony_ci	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
66662306a36Sopenharmony_ci				new_phys_cpos);
66762306a36Sopenharmony_ci	if (!*new_phys_cpos) {
66862306a36Sopenharmony_ci		ret = -ENOSPC;
66962306a36Sopenharmony_ci		goto out_commit;
67062306a36Sopenharmony_ci	}
67162306a36Sopenharmony_ci
67262306a36Sopenharmony_ci	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
67362306a36Sopenharmony_ci				  *new_phys_cpos, ext_flags);
67462306a36Sopenharmony_ci	if (ret) {
67562306a36Sopenharmony_ci		mlog_errno(ret);
67662306a36Sopenharmony_ci		goto out_commit;
67762306a36Sopenharmony_ci	}
67862306a36Sopenharmony_ci
67962306a36Sopenharmony_ci	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
68062306a36Sopenharmony_ci	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
68162306a36Sopenharmony_ci					       le16_to_cpu(gd->bg_chain));
68262306a36Sopenharmony_ci	if (ret) {
68362306a36Sopenharmony_ci		mlog_errno(ret);
68462306a36Sopenharmony_ci		goto out_commit;
68562306a36Sopenharmony_ci	}
68662306a36Sopenharmony_ci
68762306a36Sopenharmony_ci	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
68862306a36Sopenharmony_ci					 goal_bit, len);
68962306a36Sopenharmony_ci	if (ret) {
69062306a36Sopenharmony_ci		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
69162306a36Sopenharmony_ci					       le16_to_cpu(gd->bg_chain));
69262306a36Sopenharmony_ci		mlog_errno(ret);
69362306a36Sopenharmony_ci	}
69462306a36Sopenharmony_ci
69562306a36Sopenharmony_ci	/*
69662306a36Sopenharmony_ci	 * Here we should write the new page out first if we are
69762306a36Sopenharmony_ci	 * in write-back mode.
69862306a36Sopenharmony_ci	 */
69962306a36Sopenharmony_ci	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
70062306a36Sopenharmony_ci	if (ret)
70162306a36Sopenharmony_ci		mlog_errno(ret);
70262306a36Sopenharmony_ci
70362306a36Sopenharmony_ciout_commit:
70462306a36Sopenharmony_ci	ocfs2_commit_trans(osb, handle);
70562306a36Sopenharmony_ci	brelse(gd_bh);
70662306a36Sopenharmony_ci
70762306a36Sopenharmony_ciout_unlock_tl_inode:
70862306a36Sopenharmony_ci	inode_unlock(tl_inode);
70962306a36Sopenharmony_ci
71062306a36Sopenharmony_ci	ocfs2_inode_unlock(gb_inode, 1);
71162306a36Sopenharmony_ciout_unlock_gb_mutex:
71262306a36Sopenharmony_ci	inode_unlock(gb_inode);
71362306a36Sopenharmony_ci	brelse(gb_bh);
71462306a36Sopenharmony_ci	iput(gb_inode);
71562306a36Sopenharmony_ci
71662306a36Sopenharmony_ciout:
71762306a36Sopenharmony_ci	if (context->meta_ac) {
71862306a36Sopenharmony_ci		ocfs2_free_alloc_context(context->meta_ac);
71962306a36Sopenharmony_ci		context->meta_ac = NULL;
72062306a36Sopenharmony_ci	}
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	if (ref_tree)
72362306a36Sopenharmony_ci		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
72462306a36Sopenharmony_ci
72562306a36Sopenharmony_ci	return ret;
72662306a36Sopenharmony_ci}
72762306a36Sopenharmony_ci
/*
 * Work out how much of the current extent belongs to the defragmentation
 * cycle in progress, given the per-cycle threshold.
 *
 * @alloc_size:   in: clusters in the current extent; out: trimmed to the
 *                remainder of the cycle when the extent has to be split.
 * @len_defraged: clusters already gathered in this cycle; grows while the
 *                cycle is below the threshold and is reset to 0 once the
 *                cycle is completed by a split.
 * @threshold:    size of one defragmentation cycle, in clusters.
 * @skip:         set to 1 when no cycle is in progress and the extent on
 *                its own reaches the threshold; left untouched otherwise.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;

	/* still short of the threshold: take the whole extent and carry on */
	if (*alloc_size + gathered < threshold) {
		*len_defraged = gathered + *alloc_size;
		return;
	}

	/* XXX: nothing gathered yet and the extent is large - skip it */
	if (gathered == 0) {
		*skip = 1;
		return;
	}

	/*
	 * Split this extent so that, together with the pieces gathered
	 * before it, exactly 'threshold' clusters are coalesced.  That
	 * finishes one cycle; zeroing 'len_defraged' makes the caller
	 * start a new one.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
75762306a36Sopenharmony_ci
75862306a36Sopenharmony_cistatic int __ocfs2_move_extents_range(struct buffer_head *di_bh,
75962306a36Sopenharmony_ci				struct ocfs2_move_extents_context *context)
76062306a36Sopenharmony_ci{
76162306a36Sopenharmony_ci	int ret = 0, flags, do_defrag, skip = 0;
76262306a36Sopenharmony_ci	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
76362306a36Sopenharmony_ci	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
76462306a36Sopenharmony_ci
76562306a36Sopenharmony_ci	struct inode *inode = context->inode;
76662306a36Sopenharmony_ci	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
76762306a36Sopenharmony_ci	struct ocfs2_move_extents *range = context->range;
76862306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
76962306a36Sopenharmony_ci
77062306a36Sopenharmony_ci	if ((i_size_read(inode) == 0) || (range->me_len == 0))
77162306a36Sopenharmony_ci		return 0;
77262306a36Sopenharmony_ci
77362306a36Sopenharmony_ci	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
77462306a36Sopenharmony_ci		return 0;
77562306a36Sopenharmony_ci
77662306a36Sopenharmony_ci	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
77762306a36Sopenharmony_ci
77862306a36Sopenharmony_ci	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
77962306a36Sopenharmony_ci	ocfs2_init_dealloc_ctxt(&context->dealloc);
78062306a36Sopenharmony_ci
78162306a36Sopenharmony_ci	/*
78262306a36Sopenharmony_ci	 * TO-DO XXX:
78362306a36Sopenharmony_ci	 *
78462306a36Sopenharmony_ci	 * - xattr extents.
78562306a36Sopenharmony_ci	 */
78662306a36Sopenharmony_ci
78762306a36Sopenharmony_ci	do_defrag = context->auto_defrag;
78862306a36Sopenharmony_ci
78962306a36Sopenharmony_ci	/*
79062306a36Sopenharmony_ci	 * extents moving happens in unit of clusters, for the sake
79162306a36Sopenharmony_ci	 * of simplicity, we may ignore two clusters where 'byte_start'
79262306a36Sopenharmony_ci	 * and 'byte_start + len' were within.
79362306a36Sopenharmony_ci	 */
79462306a36Sopenharmony_ci	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
79562306a36Sopenharmony_ci	len_to_move = (range->me_start + range->me_len) >>
79662306a36Sopenharmony_ci						osb->s_clustersize_bits;
79762306a36Sopenharmony_ci	if (len_to_move >= move_start)
79862306a36Sopenharmony_ci		len_to_move -= move_start;
79962306a36Sopenharmony_ci	else
80062306a36Sopenharmony_ci		len_to_move = 0;
80162306a36Sopenharmony_ci
80262306a36Sopenharmony_ci	if (do_defrag) {
80362306a36Sopenharmony_ci		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
80462306a36Sopenharmony_ci		if (defrag_thresh <= 1)
80562306a36Sopenharmony_ci			goto done;
80662306a36Sopenharmony_ci	} else
80762306a36Sopenharmony_ci		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
80862306a36Sopenharmony_ci							 range->me_goal);
80962306a36Sopenharmony_ci
81062306a36Sopenharmony_ci	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
81162306a36Sopenharmony_ci	     "thresh: %u\n",
81262306a36Sopenharmony_ci	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
81362306a36Sopenharmony_ci	     (unsigned long long)range->me_start,
81462306a36Sopenharmony_ci	     (unsigned long long)range->me_len,
81562306a36Sopenharmony_ci	     move_start, len_to_move, defrag_thresh);
81662306a36Sopenharmony_ci
81762306a36Sopenharmony_ci	cpos = move_start;
81862306a36Sopenharmony_ci	while (len_to_move) {
81962306a36Sopenharmony_ci		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
82062306a36Sopenharmony_ci					 &flags);
82162306a36Sopenharmony_ci		if (ret) {
82262306a36Sopenharmony_ci			mlog_errno(ret);
82362306a36Sopenharmony_ci			goto out;
82462306a36Sopenharmony_ci		}
82562306a36Sopenharmony_ci
82662306a36Sopenharmony_ci		if (alloc_size > len_to_move)
82762306a36Sopenharmony_ci			alloc_size = len_to_move;
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci		/*
83062306a36Sopenharmony_ci		 * XXX: how to deal with a hole:
83162306a36Sopenharmony_ci		 *
83262306a36Sopenharmony_ci		 * - skip the hole of course
83362306a36Sopenharmony_ci		 * - force a new defragmentation
83462306a36Sopenharmony_ci		 */
83562306a36Sopenharmony_ci		if (!phys_cpos) {
83662306a36Sopenharmony_ci			if (do_defrag)
83762306a36Sopenharmony_ci				len_defraged = 0;
83862306a36Sopenharmony_ci
83962306a36Sopenharmony_ci			goto next;
84062306a36Sopenharmony_ci		}
84162306a36Sopenharmony_ci
84262306a36Sopenharmony_ci		if (do_defrag) {
84362306a36Sopenharmony_ci			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
84462306a36Sopenharmony_ci						     defrag_thresh, &skip);
84562306a36Sopenharmony_ci			/*
84662306a36Sopenharmony_ci			 * skip large extents
84762306a36Sopenharmony_ci			 */
84862306a36Sopenharmony_ci			if (skip) {
84962306a36Sopenharmony_ci				skip = 0;
85062306a36Sopenharmony_ci				goto next;
85162306a36Sopenharmony_ci			}
85262306a36Sopenharmony_ci
85362306a36Sopenharmony_ci			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
85462306a36Sopenharmony_ci			     "alloc_size: %u, len_defraged: %u\n",
85562306a36Sopenharmony_ci			     cpos, phys_cpos, alloc_size, len_defraged);
85662306a36Sopenharmony_ci
85762306a36Sopenharmony_ci			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
85862306a36Sopenharmony_ci						  &alloc_size, flags);
85962306a36Sopenharmony_ci		} else {
86062306a36Sopenharmony_ci			ret = ocfs2_move_extent(context, cpos, phys_cpos,
86162306a36Sopenharmony_ci						&new_phys_cpos, alloc_size,
86262306a36Sopenharmony_ci						flags);
86362306a36Sopenharmony_ci
86462306a36Sopenharmony_ci			new_phys_cpos += alloc_size;
86562306a36Sopenharmony_ci		}
86662306a36Sopenharmony_ci
86762306a36Sopenharmony_ci		if (ret < 0) {
86862306a36Sopenharmony_ci			mlog_errno(ret);
86962306a36Sopenharmony_ci			goto out;
87062306a36Sopenharmony_ci		}
87162306a36Sopenharmony_ci
87262306a36Sopenharmony_ci		context->clusters_moved += alloc_size;
87362306a36Sopenharmony_cinext:
87462306a36Sopenharmony_ci		cpos += alloc_size;
87562306a36Sopenharmony_ci		len_to_move -= alloc_size;
87662306a36Sopenharmony_ci	}
87762306a36Sopenharmony_ci
87862306a36Sopenharmony_cidone:
87962306a36Sopenharmony_ci	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
88062306a36Sopenharmony_ci
88162306a36Sopenharmony_ciout:
88262306a36Sopenharmony_ci	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
88362306a36Sopenharmony_ci						      context->clusters_moved);
88462306a36Sopenharmony_ci	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
88562306a36Sopenharmony_ci						       context->new_phys_cpos);
88662306a36Sopenharmony_ci
88762306a36Sopenharmony_ci	ocfs2_schedule_truncate_log_flush(osb, 1);
88862306a36Sopenharmony_ci	ocfs2_run_deallocs(osb, &context->dealloc);
88962306a36Sopenharmony_ci
89062306a36Sopenharmony_ci	return ret;
89162306a36Sopenharmony_ci}
89262306a36Sopenharmony_ci
89362306a36Sopenharmony_cistatic int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
89462306a36Sopenharmony_ci{
89562306a36Sopenharmony_ci	int status;
89662306a36Sopenharmony_ci	handle_t *handle;
89762306a36Sopenharmony_ci	struct inode *inode = context->inode;
89862306a36Sopenharmony_ci	struct ocfs2_dinode *di;
89962306a36Sopenharmony_ci	struct buffer_head *di_bh = NULL;
90062306a36Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
90162306a36Sopenharmony_ci
90262306a36Sopenharmony_ci	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
90362306a36Sopenharmony_ci		return -EROFS;
90462306a36Sopenharmony_ci
90562306a36Sopenharmony_ci	inode_lock(inode);
90662306a36Sopenharmony_ci
90762306a36Sopenharmony_ci	/*
90862306a36Sopenharmony_ci	 * This prevents concurrent writes from other nodes
90962306a36Sopenharmony_ci	 */
91062306a36Sopenharmony_ci	status = ocfs2_rw_lock(inode, 1);
91162306a36Sopenharmony_ci	if (status) {
91262306a36Sopenharmony_ci		mlog_errno(status);
91362306a36Sopenharmony_ci		goto out;
91462306a36Sopenharmony_ci	}
91562306a36Sopenharmony_ci
91662306a36Sopenharmony_ci	status = ocfs2_inode_lock(inode, &di_bh, 1);
91762306a36Sopenharmony_ci	if (status) {
91862306a36Sopenharmony_ci		mlog_errno(status);
91962306a36Sopenharmony_ci		goto out_rw_unlock;
92062306a36Sopenharmony_ci	}
92162306a36Sopenharmony_ci
92262306a36Sopenharmony_ci	/*
92362306a36Sopenharmony_ci	 * rememer ip_xattr_sem also needs to be held if necessary
92462306a36Sopenharmony_ci	 */
92562306a36Sopenharmony_ci	down_write(&OCFS2_I(inode)->ip_alloc_sem);
92662306a36Sopenharmony_ci
92762306a36Sopenharmony_ci	status = __ocfs2_move_extents_range(di_bh, context);
92862306a36Sopenharmony_ci
92962306a36Sopenharmony_ci	up_write(&OCFS2_I(inode)->ip_alloc_sem);
93062306a36Sopenharmony_ci	if (status) {
93162306a36Sopenharmony_ci		mlog_errno(status);
93262306a36Sopenharmony_ci		goto out_inode_unlock;
93362306a36Sopenharmony_ci	}
93462306a36Sopenharmony_ci
93562306a36Sopenharmony_ci	/*
93662306a36Sopenharmony_ci	 * We update ctime for these changes
93762306a36Sopenharmony_ci	 */
93862306a36Sopenharmony_ci	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
93962306a36Sopenharmony_ci	if (IS_ERR(handle)) {
94062306a36Sopenharmony_ci		status = PTR_ERR(handle);
94162306a36Sopenharmony_ci		mlog_errno(status);
94262306a36Sopenharmony_ci		goto out_inode_unlock;
94362306a36Sopenharmony_ci	}
94462306a36Sopenharmony_ci
94562306a36Sopenharmony_ci	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
94662306a36Sopenharmony_ci					 OCFS2_JOURNAL_ACCESS_WRITE);
94762306a36Sopenharmony_ci	if (status) {
94862306a36Sopenharmony_ci		mlog_errno(status);
94962306a36Sopenharmony_ci		goto out_commit;
95062306a36Sopenharmony_ci	}
95162306a36Sopenharmony_ci
95262306a36Sopenharmony_ci	di = (struct ocfs2_dinode *)di_bh->b_data;
95362306a36Sopenharmony_ci	inode_set_ctime_current(inode);
95462306a36Sopenharmony_ci	di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
95562306a36Sopenharmony_ci	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
95662306a36Sopenharmony_ci	ocfs2_update_inode_fsync_trans(handle, inode, 0);
95762306a36Sopenharmony_ci
95862306a36Sopenharmony_ci	ocfs2_journal_dirty(handle, di_bh);
95962306a36Sopenharmony_ci
96062306a36Sopenharmony_ciout_commit:
96162306a36Sopenharmony_ci	ocfs2_commit_trans(osb, handle);
96262306a36Sopenharmony_ci
96362306a36Sopenharmony_ciout_inode_unlock:
96462306a36Sopenharmony_ci	brelse(di_bh);
96562306a36Sopenharmony_ci	ocfs2_inode_unlock(inode, 1);
96662306a36Sopenharmony_ciout_rw_unlock:
96762306a36Sopenharmony_ci	ocfs2_rw_unlock(inode, 1);
96862306a36Sopenharmony_ciout:
96962306a36Sopenharmony_ci	inode_unlock(inode);
97062306a36Sopenharmony_ci
97162306a36Sopenharmony_ci	return status;
97262306a36Sopenharmony_ci}
97362306a36Sopenharmony_ci
97462306a36Sopenharmony_ciint ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
97562306a36Sopenharmony_ci{
97662306a36Sopenharmony_ci	int status;
97762306a36Sopenharmony_ci
97862306a36Sopenharmony_ci	struct inode *inode = file_inode(filp);
97962306a36Sopenharmony_ci	struct ocfs2_move_extents range;
98062306a36Sopenharmony_ci	struct ocfs2_move_extents_context *context;
98162306a36Sopenharmony_ci
98262306a36Sopenharmony_ci	if (!argp)
98362306a36Sopenharmony_ci		return -EINVAL;
98462306a36Sopenharmony_ci
98562306a36Sopenharmony_ci	status = mnt_want_write_file(filp);
98662306a36Sopenharmony_ci	if (status)
98762306a36Sopenharmony_ci		return status;
98862306a36Sopenharmony_ci
98962306a36Sopenharmony_ci	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
99062306a36Sopenharmony_ci		status = -EPERM;
99162306a36Sopenharmony_ci		goto out_drop;
99262306a36Sopenharmony_ci	}
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
99562306a36Sopenharmony_ci		status = -EPERM;
99662306a36Sopenharmony_ci		goto out_drop;
99762306a36Sopenharmony_ci	}
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_ci	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
100062306a36Sopenharmony_ci	if (!context) {
100162306a36Sopenharmony_ci		status = -ENOMEM;
100262306a36Sopenharmony_ci		mlog_errno(status);
100362306a36Sopenharmony_ci		goto out_drop;
100462306a36Sopenharmony_ci	}
100562306a36Sopenharmony_ci
100662306a36Sopenharmony_ci	context->inode = inode;
100762306a36Sopenharmony_ci	context->file = filp;
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci	if (copy_from_user(&range, argp, sizeof(range))) {
101062306a36Sopenharmony_ci		status = -EFAULT;
101162306a36Sopenharmony_ci		goto out_free;
101262306a36Sopenharmony_ci	}
101362306a36Sopenharmony_ci
101462306a36Sopenharmony_ci	if (range.me_start > i_size_read(inode)) {
101562306a36Sopenharmony_ci		status = -EINVAL;
101662306a36Sopenharmony_ci		goto out_free;
101762306a36Sopenharmony_ci	}
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci	if (range.me_start + range.me_len > i_size_read(inode))
102062306a36Sopenharmony_ci			range.me_len = i_size_read(inode) - range.me_start;
102162306a36Sopenharmony_ci
102262306a36Sopenharmony_ci	context->range = &range;
102362306a36Sopenharmony_ci
102462306a36Sopenharmony_ci	/*
102562306a36Sopenharmony_ci	 * ok, the default theshold for the defragmentation
102662306a36Sopenharmony_ci	 * is 1M, since our maximum clustersize was 1M also.
102762306a36Sopenharmony_ci	 * any thought?
102862306a36Sopenharmony_ci	 */
102962306a36Sopenharmony_ci	if (!range.me_threshold)
103062306a36Sopenharmony_ci		range.me_threshold = 1024 * 1024;
103162306a36Sopenharmony_ci
103262306a36Sopenharmony_ci	if (range.me_threshold > i_size_read(inode))
103362306a36Sopenharmony_ci		range.me_threshold = i_size_read(inode);
103462306a36Sopenharmony_ci
103562306a36Sopenharmony_ci	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
103662306a36Sopenharmony_ci		context->auto_defrag = 1;
103762306a36Sopenharmony_ci
103862306a36Sopenharmony_ci		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
103962306a36Sopenharmony_ci			context->partial = 1;
104062306a36Sopenharmony_ci	} else {
104162306a36Sopenharmony_ci		/*
104262306a36Sopenharmony_ci		 * first best-effort attempt to validate and adjust the goal
104362306a36Sopenharmony_ci		 * (physical address in block), while it can't guarantee later
104462306a36Sopenharmony_ci		 * operation can succeed all the time since global_bitmap may
104562306a36Sopenharmony_ci		 * change a bit over time.
104662306a36Sopenharmony_ci		 */
104762306a36Sopenharmony_ci
104862306a36Sopenharmony_ci		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
104962306a36Sopenharmony_ci		if (status)
105062306a36Sopenharmony_ci			goto out_copy;
105162306a36Sopenharmony_ci	}
105262306a36Sopenharmony_ci
105362306a36Sopenharmony_ci	status = ocfs2_move_extents(context);
105462306a36Sopenharmony_ci	if (status)
105562306a36Sopenharmony_ci		mlog_errno(status);
105662306a36Sopenharmony_ciout_copy:
105762306a36Sopenharmony_ci	/*
105862306a36Sopenharmony_ci	 * movement/defragmentation may end up being partially completed,
105962306a36Sopenharmony_ci	 * that's the reason why we need to return userspace the finished
106062306a36Sopenharmony_ci	 * length and new_offset even if failure happens somewhere.
106162306a36Sopenharmony_ci	 */
106262306a36Sopenharmony_ci	if (copy_to_user(argp, &range, sizeof(range)))
106362306a36Sopenharmony_ci		status = -EFAULT;
106462306a36Sopenharmony_ci
106562306a36Sopenharmony_ciout_free:
106662306a36Sopenharmony_ci	kfree(context);
106762306a36Sopenharmony_ciout_drop:
106862306a36Sopenharmony_ci	mnt_drop_write_file(filp);
106962306a36Sopenharmony_ci
107062306a36Sopenharmony_ci	return status;
107162306a36Sopenharmony_ci}
1072