xref: /kernel/linux/linux-5.10/fs/ext4/extents.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
4 * Written by Alex Tomas <alex@clusterfs.com>
5 *
6 * Architecture independence:
7 *   Copyright (c) 2005, Bull S.A.
8 *   Written by Pierre Peiffer <pierre.peiffer@bull.net>
9 */
10
11/*
12 * Extents support for EXT4
13 *
14 * TODO:
15 *   - ext4*_error() should be used in some situations
16 *   - analyze all BUG()/BUG_ON(), use -EIO where appropriate
17 *   - smart tree reduction
18 */
19
20#include <linux/fs.h>
21#include <linux/time.h>
22#include <linux/jbd2.h>
23#include <linux/highuid.h>
24#include <linux/pagemap.h>
25#include <linux/quotaops.h>
26#include <linux/string.h>
27#include <linux/slab.h>
28#include <linux/uaccess.h>
29#include <linux/fiemap.h>
30#include <linux/backing-dev.h>
31#include <linux/iomap.h>
32#include "ext4_jbd2.h"
33#include "ext4_extents.h"
34#include "xattr.h"
35
36#include <trace/events/ext4.h>
37
38/*
39 * used by extent splitting.
40 */
41#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
42					due to ENOSPC */
43#define EXT4_EXT_MARK_UNWRIT1	0x2  /* mark first half unwritten */
44#define EXT4_EXT_MARK_UNWRIT2	0x4  /* mark second half unwritten */
45
46#define EXT4_EXT_DATA_VALID1	0x8  /* first half contains valid data */
47#define EXT4_EXT_DATA_VALID2	0x10 /* second half contains valid data */
48
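/*
 * Compute the checksum of an extent tree block: the block contents from the
 * header up to the extent tail, seeded with the inode's checksum seed.
 */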
49static __le32 ext4_extent_block_csum(struct inode *inode,
50				     struct ext4_extent_header *eh)
51{
52	struct ext4_inode_info *ei = EXT4_I(inode);
53	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
54	__u32 csum;
55
56	csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
57			   EXT4_EXTENT_TAIL_OFFSET(eh));
58	return cpu_to_le32(csum);
59}
60
61static int ext4_extent_block_csum_verify(struct inode *inode,
62					 struct ext4_extent_header *eh)
63{
64	struct ext4_extent_tail *et;
65
66	if (!ext4_has_metadata_csum(inode->i_sb))
67		return 1;
68
69	et = find_ext4_extent_tail(eh);
70	if (et->et_checksum != ext4_extent_block_csum(inode, eh))
71		return 0;
72	return 1;
73}
74
75static void ext4_extent_block_csum_set(struct inode *inode,
76				       struct ext4_extent_header *eh)
77{
78	struct ext4_extent_tail *et;
79
80	if (!ext4_has_metadata_csum(inode->i_sb))
81		return;
82
83	et = find_ext4_extent_tail(eh);
84	et->et_checksum = ext4_extent_block_csum(inode, eh);
85}
86
87static int ext4_split_extent_at(handle_t *handle,
88			     struct inode *inode,
89			     struct ext4_ext_path **ppath,
90			     ext4_lblk_t split,
91			     int split_flag,
92			     int flags);
93
94static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
95{
96	/*
97	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
98	 * moment, get_block can be called only for blocks inside i_size since
99	 * the page cache has already been dropped and writes are blocked by
100	 * i_mutex. So we can safely drop the i_data_sem here.
101	 */
102	BUG_ON(EXT4_JOURNAL(inode) == NULL);
103	ext4_discard_preallocations(inode, 0);
104	up_write(&EXT4_I(inode)->i_data_sem);
105	*dropped = 1;
106	return 0;
107}
108
109/*
110 * Make sure 'handle' has at least 'check_cred' credits. If not, restart
111 * transaction with 'restart_cred' credits. The function drops i_data_sem
112 * when restarting transaction and gets it after transaction is restarted.
113 *
114 * The function returns 0 on success, 1 if transaction had to be restarted,
115 * and < 0 in case of fatal error.
116 */
117int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
118				int check_cred, int restart_cred,
119				int revoke_cred)
120{
121	int ret;
122	int dropped = 0;
123
124	ret = ext4_journal_ensure_credits_fn(handle, check_cred, restart_cred,
125		revoke_cred, ext4_ext_trunc_restart_fn(inode, &dropped));
126	if (dropped)
127		down_write(&EXT4_I(inode)->i_data_sem);
128	return ret;
129}
130
131/*
132 * could return:
133 *  - EROFS
134 *  - ENOMEM
135 */
136static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
137				struct ext4_ext_path *path)
138{
139	int err = 0;
140
141	if (path->p_bh) {
142		/* path points to block */
143		BUFFER_TRACE(path->p_bh, "get_write_access");
144		err = ext4_journal_get_write_access(handle, path->p_bh);
145		/*
146		 * The extent buffer's verified bit will be set again in
147		 * __ext4_ext_dirty(). We could be left with an inconsistent
148		 * buffer if the extent updating procedure breaks off due
149		 * to an error, so force it to be checked again.
150		 */
151		if (!err)
152			clear_buffer_verified(path->p_bh);
153	}
154	/* path points to leaf/index in inode body */
155	/* we use in-core data, no need to protect them */
156	return err;
157}
158
159/*
160 * could return:
161 *  - EROFS
162 *  - ENOMEM
163 *  - EIO
164 */
165static int __ext4_ext_dirty(const char *where, unsigned int line,
166			    handle_t *handle, struct inode *inode,
167			    struct ext4_ext_path *path)
168{
169	int err;
170
171	WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
172	if (path->p_bh) {
173		ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
174		/* path points to block */
175		err = __ext4_handle_dirty_metadata(where, line, handle,
176						   inode, path->p_bh);
177		/* Extents updating done, re-set verified flag */
178		if (!err)
179			set_buffer_verified(path->p_bh);
180	} else {
181		/* path points to leaf/index in inode body */
182		err = ext4_mark_inode_dirty(handle, inode);
183	}
184	return err;
185}
186
187#define ext4_ext_dirty(handle, inode, path) \
188		__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
189
190static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
191			      struct ext4_ext_path *path,
192			      ext4_lblk_t block)
193{
194	if (path) {
195		int depth = path->p_depth;
196		struct ext4_extent *ex;
197
198		/*
199		 * Try to predict block placement assuming that we are
200		 * filling in a file which will eventually be
201		 * non-sparse --- i.e., in the case of libbfd writing
202		 * an ELF object's sections out-of-order but in a way
203		 * that eventually results in a contiguous object or
204		 * executable file, or some database extending a table
205		 * space file.  However, this is actually somewhat
206		 * non-ideal if we are writing a sparse file such as
207		 * qemu or KVM writing a raw image file that is going
208		 * to stay fairly sparse, since it will end up
209		 * fragmenting the file system's free space.  Maybe we
210		 * should have some heuristics or some way to allow
211		 * userspace to pass a hint to the file system,
212		 * especially if the latter case turns out to be
213		 * common.
214		 */
215		ex = path[depth].p_ext;
216		if (ex) {
217			ext4_fsblk_t ext_pblk = ext4_ext_pblock(ex);
218			ext4_lblk_t ext_block = le32_to_cpu(ex->ee_block);
219
220			if (block > ext_block)
221				return ext_pblk + (block - ext_block);
222			else
223				return ext_pblk - (ext_block - block);
224		}
225
226		/* it looks like the index is empty;
227		 * try to find a starting block from the index itself */
228		if (path[depth].p_bh)
229			return path[depth].p_bh->b_blocknr;
230	}
231
232	/* OK. use inode's group */
233	return ext4_inode_to_goal_block(inode);
234}
235
236/*
237 * Allocation for a metadata block
238 */
239static ext4_fsblk_t
240ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
241			struct ext4_ext_path *path,
242			struct ext4_extent *ex, int *err, unsigned int flags)
243{
244	ext4_fsblk_t goal, newblock;
245
246	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
247	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
248					NULL, err);
249	return newblock;
250}
251
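/*
 * The ext4_ext_space_*() helpers return how many extents or index entries
 * fit in an on-disk extent block or in the inode's i_data area.  With
 * AGGRESSIVE_TEST the capacity is artificially capped so that tree growth
 * and node splitting are exercised even for small files.
 */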
252static inline int ext4_ext_space_block(struct inode *inode, int check)
253{
254	int size;
255
256	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
257			/ sizeof(struct ext4_extent);
258#ifdef AGGRESSIVE_TEST
259	if (!check && size > 6)
260		size = 6;
261#endif
262	return size;
263}
264
265static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
266{
267	int size;
268
269	size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
270			/ sizeof(struct ext4_extent_idx);
271#ifdef AGGRESSIVE_TEST
272	if (!check && size > 5)
273		size = 5;
274#endif
275	return size;
276}
277
278static inline int ext4_ext_space_root(struct inode *inode, int check)
279{
280	int size;
281
282	size = sizeof(EXT4_I(inode)->i_data);
283	size -= sizeof(struct ext4_extent_header);
284	size /= sizeof(struct ext4_extent);
285#ifdef AGGRESSIVE_TEST
286	if (!check && size > 3)
287		size = 3;
288#endif
289	return size;
290}
291
292static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
293{
294	int size;
295
296	size = sizeof(EXT4_I(inode)->i_data);
297	size -= sizeof(struct ext4_extent_header);
298	size /= sizeof(struct ext4_extent_idx);
299#ifdef AGGRESSIVE_TEST
300	if (!check && size > 4)
301		size = 4;
302#endif
303	return size;
304}
305
306static inline int
307ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
308			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
309			   int nofail)
310{
311	struct ext4_ext_path *path = *ppath;
312	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
313	int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
314
315	if (nofail)
316		flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
317
318	return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
319			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
320			flags);
321}
322
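/*
 * Return the maximum number of entries a node at the given depth may
 * advertise in eh_max, depending on whether it is the inode-resident root
 * or an on-disk block, and whether it holds leaf extents or indexes.
 */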
323static int
324ext4_ext_max_entries(struct inode *inode, int depth)
325{
326	int max;
327
328	if (depth == ext_depth(inode)) {
329		if (depth == 0)
330			max = ext4_ext_space_root(inode, 1);
331		else
332			max = ext4_ext_space_root_idx(inode, 1);
333	} else {
334		if (depth == 0)
335			max = ext4_ext_space_block(inode, 1);
336		else
337			max = ext4_ext_space_block_idx(inode, 1);
338	}
339
340	return max;
341}
342
343static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
344{
345	ext4_fsblk_t block = ext4_ext_pblock(ext);
346	int len = ext4_ext_get_actual_len(ext);
347	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
348
349	/*
350	 * We allow neither:
351	 *  - zero length
352	 *  - overflow/wrap-around
353	 */
354	if (lblock + len <= lblock)
355		return 0;
356	return ext4_inode_block_valid(inode, block, len);
357}
358
359static int ext4_valid_extent_idx(struct inode *inode,
360				struct ext4_extent_idx *ext_idx)
361{
362	ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
363
364	return ext4_inode_block_valid(inode, block, 1);
365}
366
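/*
 * Validate the entries of one extent tree node: for a non-root node the
 * first entry must match the logical block recorded in the parent index,
 * every extent or index entry must reference valid blocks, and entries must
 * be sorted and non-overlapping.  When an overlap is found, the offending
 * physical block is returned in *pblk.
 */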
367static int ext4_valid_extent_entries(struct inode *inode,
368				     struct ext4_extent_header *eh,
369				     ext4_lblk_t lblk, ext4_fsblk_t *pblk,
370				     int depth)
371{
372	unsigned short entries;
373	ext4_lblk_t lblock = 0;
374	ext4_lblk_t cur = 0;
375
376	if (eh->eh_entries == 0)
377		return 1;
378
379	entries = le16_to_cpu(eh->eh_entries);
380
381	if (depth == 0) {
382		/* leaf entries */
383		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
384
385		/*
386		 * The logical block in the first entry should be equal to
387		 * the number in the index block.
388		 */
389		if (depth != ext_depth(inode) &&
390		    lblk != le32_to_cpu(ext->ee_block))
391			return 0;
392		while (entries) {
393			if (!ext4_valid_extent(inode, ext))
394				return 0;
395
396			/* Check for overlapping extents */
397			lblock = le32_to_cpu(ext->ee_block);
398			if (lblock < cur) {
399				*pblk = ext4_ext_pblock(ext);
400				return 0;
401			}
402			cur = lblock + ext4_ext_get_actual_len(ext);
403			ext++;
404			entries--;
405		}
406	} else {
407		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
408
409		/*
410		 * The logical block in the first entry should be equal to
411		 * the number in the parent index block.
412		 */
413		if (depth != ext_depth(inode) &&
414		    lblk != le32_to_cpu(ext_idx->ei_block))
415			return 0;
416		while (entries) {
417			if (!ext4_valid_extent_idx(inode, ext_idx))
418				return 0;
419
420			/* Check for overlapping index extents */
421			lblock = le32_to_cpu(ext_idx->ei_block);
422			if (lblock < cur) {
423				*pblk = ext4_idx_pblock(ext_idx);
424				return 0;
425			}
426			ext_idx++;
427			entries--;
428			cur = lblock + 1;
429		}
430	}
431	return 1;
432}
433
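/*
 * Sanity-check an extent tree node before it is used: header magic, depth,
 * entry counts against the per-node maximum, the entries themselves and,
 * for non-root nodes, the block checksum.  Corruption is reported through
 * ext4_error_inode_err() and -EFSCORRUPTED or -EFSBADCRC is returned.
 */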
434static int __ext4_ext_check(const char *function, unsigned int line,
435			    struct inode *inode, struct ext4_extent_header *eh,
436			    int depth, ext4_fsblk_t pblk, ext4_lblk_t lblk)
437{
438	const char *error_msg;
439	int max = 0, err = -EFSCORRUPTED;
440
441	if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
442		error_msg = "invalid magic";
443		goto corrupted;
444	}
445	if (unlikely(le16_to_cpu(eh->eh_depth) != depth)) {
446		error_msg = "unexpected eh_depth";
447		goto corrupted;
448	}
449	if (unlikely(eh->eh_max == 0)) {
450		error_msg = "invalid eh_max";
451		goto corrupted;
452	}
453	max = ext4_ext_max_entries(inode, depth);
454	if (unlikely(le16_to_cpu(eh->eh_max) > max)) {
455		error_msg = "too large eh_max";
456		goto corrupted;
457	}
458	if (unlikely(le16_to_cpu(eh->eh_entries) > le16_to_cpu(eh->eh_max))) {
459		error_msg = "invalid eh_entries";
460		goto corrupted;
461	}
462	if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
463		error_msg = "eh_entries is 0 but eh_depth is > 0";
464		goto corrupted;
465	}
466	if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
467		error_msg = "invalid extent entries";
468		goto corrupted;
469	}
470	if (unlikely(depth > 32)) {
471		error_msg = "too large eh_depth";
472		goto corrupted;
473	}
474	/* Verify checksum on non-root extent tree nodes */
475	if (ext_depth(inode) != depth &&
476	    !ext4_extent_block_csum_verify(inode, eh)) {
477		error_msg = "extent tree corrupted";
478		err = -EFSBADCRC;
479		goto corrupted;
480	}
481	return 0;
482
483corrupted:
484	ext4_error_inode_err(inode, function, line, 0, -err,
485			     "pblk %llu bad header/extent: %s - magic %x, "
486			     "entries %u, max %u(%u), depth %u(%u)",
487			     (unsigned long long) pblk, error_msg,
488			     le16_to_cpu(eh->eh_magic),
489			     le16_to_cpu(eh->eh_entries),
490			     le16_to_cpu(eh->eh_max),
491			     max, le16_to_cpu(eh->eh_depth), depth);
492	return err;
493}
494
495#define ext4_ext_check(inode, eh, depth, pblk)			\
496	__ext4_ext_check(__func__, __LINE__, (inode), (eh), (depth), (pblk), 0)
497
498int ext4_ext_check_inode(struct inode *inode)
499{
500	return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
501}
502
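/*
 * Pre-load the extent status tree from a validated leaf: each extent is
 * cached as written or unwritten, and the gaps between consecutive extents
 * are cached as holes.
 */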
503static void ext4_cache_extents(struct inode *inode,
504			       struct ext4_extent_header *eh)
505{
506	struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
507	ext4_lblk_t prev = 0;
508	int i;
509
510	for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
511		unsigned int status = EXTENT_STATUS_WRITTEN;
512		ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
513		int len = ext4_ext_get_actual_len(ex);
514
515		if (prev && (prev != lblk))
516			ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
517					     EXTENT_STATUS_HOLE);
518
519		if (ext4_ext_is_unwritten(ex))
520			status = EXTENT_STATUS_UNWRITTEN;
521		ext4_es_cache_extent(inode, lblk, len,
522				     ext4_ext_pblock(ex), status);
523		prev = lblk + len;
524	}
525}
526
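/*
 * Read the extent tree block referenced by @idx at depth @depth.  The block
 * is verified, and if it is a leaf its extents are cached unless
 * EXT4_EX_NOCACHE is set; on failure an ERR_PTR is returned instead of the
 * buffer head.
 */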
527static struct buffer_head *
528__read_extent_tree_block(const char *function, unsigned int line,
529			 struct inode *inode, struct ext4_extent_idx *idx,
530			 int depth, int flags)
531{
532	struct buffer_head		*bh;
533	int				err;
534	gfp_t				gfp_flags = __GFP_MOVABLE | GFP_NOFS;
535	ext4_fsblk_t			pblk;
536
537	if (flags & EXT4_EX_NOFAIL)
538		gfp_flags |= __GFP_NOFAIL;
539
540	pblk = ext4_idx_pblock(idx);
541	bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags);
542	if (unlikely(!bh))
543		return ERR_PTR(-ENOMEM);
544
545	if (!bh_uptodate_or_lock(bh)) {
546		trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
547		err = ext4_read_bh(bh, 0, NULL);
548		if (err < 0)
549			goto errout;
550	}
551	if (buffer_verified(bh)) {
552		if (unlikely(ext_block_hdr(bh)->eh_magic != EXT4_EXT_MAGIC)) {
553			err = -EFSCORRUPTED;
554			ext4_error_inode(inode, function, line, 0,
555				"invalid magic for verified extent block %llu",
556				(unsigned long long)bh->b_blocknr);
557			goto errout;
558		}
559
560		if (!(flags & EXT4_EX_FORCE_CACHE))
561			return bh;
562	} else {
563		err = __ext4_ext_check(function, line, inode, ext_block_hdr(bh),
564				       depth, pblk, le32_to_cpu(idx->ei_block));
565		if (err)
566			goto errout;
567		set_buffer_verified(bh);
568	}
569	/*
570	 * If this is a leaf block, cache all of its entries
571	 */
572	if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
573		struct ext4_extent_header *eh = ext_block_hdr(bh);
574		ext4_cache_extents(inode, eh);
575	}
576	return bh;
577errout:
578	put_bh(bh);
579	return ERR_PTR(err);
580
581}
582
583#define read_extent_tree_block(inode, idx, depth, flags)		\
584	__read_extent_tree_block(__func__, __LINE__, (inode), (idx),	\
585				 (depth), (flags))
586
587/*
588 * This function is called to cache a file's extent information in the
589 * extent status tree
590 */
591int ext4_ext_precache(struct inode *inode)
592{
593	struct ext4_inode_info *ei = EXT4_I(inode);
594	struct ext4_ext_path *path = NULL;
595	struct buffer_head *bh;
596	int i = 0, depth, ret = 0;
597
598	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
599		return 0;	/* not an extent-mapped inode */
600
601	down_read(&ei->i_data_sem);
602	depth = ext_depth(inode);
603
604	/* Don't cache anything if there are no external extent blocks */
605	if (!depth) {
606		up_read(&ei->i_data_sem);
607		return ret;
608	}
609
610	path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
611		       GFP_NOFS);
612	if (path == NULL) {
613		up_read(&ei->i_data_sem);
614		return -ENOMEM;
615	}
616
617	path[0].p_hdr = ext_inode_hdr(inode);
618	ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
619	if (ret)
620		goto out;
621	path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
622	while (i >= 0) {
623		/*
624		 * If this is a leaf block or we've reached the end of
625		 * the index block, go up
626		 */
627		if ((i == depth) ||
628		    path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
629			brelse(path[i].p_bh);
630			path[i].p_bh = NULL;
631			i--;
632			continue;
633		}
634		bh = read_extent_tree_block(inode, path[i].p_idx++,
635					    depth - i - 1,
636					    EXT4_EX_FORCE_CACHE);
637		if (IS_ERR(bh)) {
638			ret = PTR_ERR(bh);
639			break;
640		}
641		i++;
642		path[i].p_bh = bh;
643		path[i].p_hdr = ext_block_hdr(bh);
644		path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
645	}
646	ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
647out:
648	up_read(&ei->i_data_sem);
649	ext4_ext_drop_refs(path);
650	kfree(path);
651	return ret;
652}
653
654#ifdef EXT_DEBUG
655static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
656{
657	int k, l = path->p_depth;
658
659	ext_debug(inode, "path:");
660	for (k = 0; k <= l; k++, path++) {
661		if (path->p_idx) {
662			ext_debug(inode, "  %d->%llu",
663				  le32_to_cpu(path->p_idx->ei_block),
664				  ext4_idx_pblock(path->p_idx));
665		} else if (path->p_ext) {
666			ext_debug(inode, "  %d:[%d]%d:%llu ",
667				  le32_to_cpu(path->p_ext->ee_block),
668				  ext4_ext_is_unwritten(path->p_ext),
669				  ext4_ext_get_actual_len(path->p_ext),
670				  ext4_ext_pblock(path->p_ext));
671		} else
672			ext_debug(inode, "  []");
673	}
674	ext_debug(inode, "\n");
675}
676
677static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
678{
679	int depth = ext_depth(inode);
680	struct ext4_extent_header *eh;
681	struct ext4_extent *ex;
682	int i;
683
684	if (!path)
685		return;
686
687	eh = path[depth].p_hdr;
688	ex = EXT_FIRST_EXTENT(eh);
689
690	ext_debug(inode, "Displaying leaf extents\n");
691
692	for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
693		ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
694			  ext4_ext_is_unwritten(ex),
695			  ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
696	}
697	ext_debug(inode, "\n");
698}
699
700static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
701			ext4_fsblk_t newblock, int level)
702{
703	int depth = ext_depth(inode);
704	struct ext4_extent *ex;
705
706	if (depth != level) {
707		struct ext4_extent_idx *idx;
708		idx = path[level].p_idx;
709		while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
710			ext_debug(inode, "%d: move %d:%llu in new index %llu\n",
711				  level, le32_to_cpu(idx->ei_block),
712				  ext4_idx_pblock(idx), newblock);
713			idx++;
714		}
715
716		return;
717	}
718
719	ex = path[depth].p_ext;
720	while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
721		ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n",
722				le32_to_cpu(ex->ee_block),
723				ext4_ext_pblock(ex),
724				ext4_ext_is_unwritten(ex),
725				ext4_ext_get_actual_len(ex),
726				newblock);
727		ex++;
728	}
729}
730
731#else
732#define ext4_ext_show_path(inode, path)
733#define ext4_ext_show_leaf(inode, path)
734#define ext4_ext_show_move(inode, path, newblock, level)
735#endif
736
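/*
 * Release the buffer heads pinned by every level of @path.
 */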
737void ext4_ext_drop_refs(struct ext4_ext_path *path)
738{
739	int depth, i;
740
741	if (!path)
742		return;
743	depth = path->p_depth;
744	for (i = 0; i <= depth; i++, path++) {
745		brelse(path->p_bh);
746		path->p_bh = NULL;
747	}
748}
749
750/*
751 * ext4_ext_binsearch_idx:
752 * binary search for the closest index of the given block
753 * the header must be checked before calling this
754 */
755static void
756ext4_ext_binsearch_idx(struct inode *inode,
757			struct ext4_ext_path *path, ext4_lblk_t block)
758{
759	struct ext4_extent_header *eh = path->p_hdr;
760	struct ext4_extent_idx *r, *l, *m;
761
762
763	ext_debug(inode, "binsearch for %u(idx):  ", block);
764
765	l = EXT_FIRST_INDEX(eh) + 1;
766	r = EXT_LAST_INDEX(eh);
767	while (l <= r) {
768		m = l + (r - l) / 2;
769		if (block < le32_to_cpu(m->ei_block))
770			r = m - 1;
771		else
772			l = m + 1;
773		ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
774			  le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block),
775			  r, le32_to_cpu(r->ei_block));
776	}
777
778	path->p_idx = l - 1;
779	ext_debug(inode, "  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
780		  ext4_idx_pblock(path->p_idx));
781
782#ifdef CHECK_BINSEARCH
783	{
784		struct ext4_extent_idx *chix, *ix;
785		int k;
786
787		chix = ix = EXT_FIRST_INDEX(eh);
788		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
789			if (k != 0 && le32_to_cpu(ix->ei_block) <=
790			    le32_to_cpu(ix[-1].ei_block)) {
791				printk(KERN_DEBUG "k=%d, ix=0x%p, "
792				       "first=0x%p\n", k,
793				       ix, EXT_FIRST_INDEX(eh));
794				printk(KERN_DEBUG "%u <= %u\n",
795				       le32_to_cpu(ix->ei_block),
796				       le32_to_cpu(ix[-1].ei_block));
797			}
798			BUG_ON(k && le32_to_cpu(ix->ei_block)
799					   <= le32_to_cpu(ix[-1].ei_block));
800			if (block < le32_to_cpu(ix->ei_block))
801				break;
802			chix = ix;
803		}
804		BUG_ON(chix != path->p_idx);
805	}
806#endif
807
808}
809
810/*
811 * ext4_ext_binsearch:
812 * binary search for closest extent of the given block
813 * the header must be checked before calling this
814 */
815static void
816ext4_ext_binsearch(struct inode *inode,
817		struct ext4_ext_path *path, ext4_lblk_t block)
818{
819	struct ext4_extent_header *eh = path->p_hdr;
820	struct ext4_extent *r, *l, *m;
821
822	if (eh->eh_entries == 0) {
823		/*
824		 * this leaf is empty:
825		 * we get such a leaf in split/add case
826		 */
827		return;
828	}
829
830	ext_debug(inode, "binsearch for %u:  ", block);
831
832	l = EXT_FIRST_EXTENT(eh) + 1;
833	r = EXT_LAST_EXTENT(eh);
834
835	while (l <= r) {
836		m = l + (r - l) / 2;
837		if (block < le32_to_cpu(m->ee_block))
838			r = m - 1;
839		else
840			l = m + 1;
841		ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l,
842			  le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block),
843			  r, le32_to_cpu(r->ee_block));
844	}
845
846	path->p_ext = l - 1;
847	ext_debug(inode, "  -> %d:%llu:[%d]%d ",
848			le32_to_cpu(path->p_ext->ee_block),
849			ext4_ext_pblock(path->p_ext),
850			ext4_ext_is_unwritten(path->p_ext),
851			ext4_ext_get_actual_len(path->p_ext));
852
853#ifdef CHECK_BINSEARCH
854	{
855		struct ext4_extent *chex, *ex;
856		int k;
857
858		chex = ex = EXT_FIRST_EXTENT(eh);
859		for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ex++) {
860			BUG_ON(k && le32_to_cpu(ex->ee_block)
861					  <= le32_to_cpu(ex[-1].ee_block));
862			if (block < le32_to_cpu(ex->ee_block))
863				break;
864			chex = ex;
865		}
866		BUG_ON(chex != path->p_ext);
867	}
868#endif
869
870}
871
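/*
 * Initialize an empty extent tree whose root lives in the inode's i_data.
 */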
872void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
873{
874	struct ext4_extent_header *eh;
875
876	eh = ext_inode_hdr(inode);
877	eh->eh_depth = 0;
878	eh->eh_entries = 0;
879	eh->eh_magic = EXT4_EXT_MAGIC;
880	eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
881	eh->eh_generation = 0;
882	ext4_mark_inode_dirty(handle, inode);
883}
884
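/*
 * Walk the extent tree from the root down to the leaf covering @block and
 * return an array of ext4_ext_path entries, one per level, recording the
 * header, buffer and binary search result at each level.  An existing
 * *orig_path is reused when it is deep enough; on error the path is freed
 * and an ERR_PTR is returned.
 */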
885struct ext4_ext_path *
886ext4_find_extent(struct inode *inode, ext4_lblk_t block,
887		 struct ext4_ext_path **orig_path, int flags)
888{
889	struct ext4_extent_header *eh;
890	struct buffer_head *bh;
891	struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
892	short int depth, i, ppos = 0;
893	int ret;
894	gfp_t gfp_flags = GFP_NOFS;
895
896	if (flags & EXT4_EX_NOFAIL)
897		gfp_flags |= __GFP_NOFAIL;
898
899	eh = ext_inode_hdr(inode);
900	depth = ext_depth(inode);
901	if (depth < 0 || depth > EXT4_MAX_EXTENT_DEPTH) {
902		EXT4_ERROR_INODE(inode, "inode has invalid extent depth: %d",
903				 depth);
904		ret = -EFSCORRUPTED;
905		goto err;
906	}
907
908	if (path) {
909		ext4_ext_drop_refs(path);
910		if (depth > path[0].p_maxdepth) {
911			kfree(path);
912			*orig_path = path = NULL;
913		}
914	}
915	if (!path) {
916		/* account possible depth increase */
917		path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
918				gfp_flags);
919		if (unlikely(!path))
920			return ERR_PTR(-ENOMEM);
921		path[0].p_maxdepth = depth + 1;
922	}
923	path[0].p_hdr = eh;
924	path[0].p_bh = NULL;
925
926	i = depth;
927	if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
928		ext4_cache_extents(inode, eh);
929	/* walk through the tree */
930	while (i) {
931		ext_debug(inode, "depth %d: num %d, max %d\n",
932			  ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
933
934		ext4_ext_binsearch_idx(inode, path + ppos, block);
935		path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
936		path[ppos].p_depth = i;
937		path[ppos].p_ext = NULL;
938
939		bh = read_extent_tree_block(inode, path[ppos].p_idx, --i, flags);
940		if (IS_ERR(bh)) {
941			ret = PTR_ERR(bh);
942			goto err;
943		}
944
945		eh = ext_block_hdr(bh);
946		ppos++;
947		path[ppos].p_bh = bh;
948		path[ppos].p_hdr = eh;
949	}
950
951	path[ppos].p_depth = i;
952	path[ppos].p_ext = NULL;
953	path[ppos].p_idx = NULL;
954
955	/* find extent */
956	ext4_ext_binsearch(inode, path + ppos, block);
957	/* if not an empty leaf */
958	if (path[ppos].p_ext)
959		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
960
961	ext4_ext_show_path(inode, path);
962
963	return path;
964
965err:
966	ext4_ext_drop_refs(path);
967	kfree(path);
968	if (orig_path)
969		*orig_path = NULL;
970	return ERR_PTR(ret);
971}
972
973/*
974 * ext4_ext_insert_index:
975 * insert new index [@logical;@ptr] into the block at @curp;
976 * check where to insert: before @curp or after @curp
977 */
978static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
979				 struct ext4_ext_path *curp,
980				 int logical, ext4_fsblk_t ptr)
981{
982	struct ext4_extent_idx *ix;
983	int len, err;
984
985	err = ext4_ext_get_access(handle, inode, curp);
986	if (err)
987		return err;
988
989	if (unlikely(logical == le32_to_cpu(curp->p_idx->ei_block))) {
990		EXT4_ERROR_INODE(inode,
991				 "logical %d == ei_block %d!",
992				 logical, le32_to_cpu(curp->p_idx->ei_block));
993		return -EFSCORRUPTED;
994	}
995
996	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
997			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
998		EXT4_ERROR_INODE(inode,
999				 "eh_entries %d >= eh_max %d!",
1000				 le16_to_cpu(curp->p_hdr->eh_entries),
1001				 le16_to_cpu(curp->p_hdr->eh_max));
1002		return -EFSCORRUPTED;
1003	}
1004
1005	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
1006		/* insert after */
1007		ext_debug(inode, "insert new index %d after: %llu\n",
1008			  logical, ptr);
1009		ix = curp->p_idx + 1;
1010	} else {
1011		/* insert before */
1012		ext_debug(inode, "insert new index %d before: %llu\n",
1013			  logical, ptr);
1014		ix = curp->p_idx;
1015	}
1016
1017	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
1018		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
1019		return -EFSCORRUPTED;
1020	}
1021
1022	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
1023	BUG_ON(len < 0);
1024	if (len > 0) {
1025		ext_debug(inode, "insert new index %d: "
1026				"move %d indices from 0x%p to 0x%p\n",
1027				logical, len, ix, ix + 1);
1028		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
1029	}
1030
1031	ix->ei_block = cpu_to_le32(logical);
1032	ext4_idx_store_pblock(ix, ptr);
1033	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
1034
1035	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
1036		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
1037		return -EFSCORRUPTED;
1038	}
1039
1040	err = ext4_ext_dirty(handle, inode, curp);
1041	ext4_std_error(inode->i_sb, err);
1042
1043	return err;
1044}
1045
1046/*
1047 * ext4_ext_split:
1048 * inserts new subtree into the path, using free index entry
1049 * at depth @at:
1050 * - allocates all needed blocks (new leaf and all intermediate index blocks)
1051 * - makes decision where to split
1052 * - moves remaining extents and index entries (right to the split point)
1053 *   into the newly allocated blocks
1054 * - initializes subtree
1055 */
1056static int ext4_ext_split(handle_t *handle, struct inode *inode,
1057			  unsigned int flags,
1058			  struct ext4_ext_path *path,
1059			  struct ext4_extent *newext, int at)
1060{
1061	struct buffer_head *bh = NULL;
1062	int depth = ext_depth(inode);
1063	struct ext4_extent_header *neh;
1064	struct ext4_extent_idx *fidx;
1065	int i = at, k, m, a;
1066	ext4_fsblk_t newblock, oldblock;
1067	__le32 border;
1068	ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */
1069	gfp_t gfp_flags = GFP_NOFS;
1070	int err = 0;
1071	size_t ext_size = 0;
1072
1073	if (flags & EXT4_EX_NOFAIL)
1074		gfp_flags |= __GFP_NOFAIL;
1075
1076	/* make decision: where to split? */
1077	/* FIXME: now decision is simplest: at current extent */
1078
1079	/* if current leaf will be split, then we should use
1080	 * border from split point */
1081	if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
1082		EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
1083		return -EFSCORRUPTED;
1084	}
1085	if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
1086		border = path[depth].p_ext[1].ee_block;
1087		ext_debug(inode, "leaf will be split."
1088				" next leaf starts at %d\n",
1089				  le32_to_cpu(border));
1090	} else {
1091		border = newext->ee_block;
1092		ext_debug(inode, "leaf will be added."
1093				" next leaf starts at %d\n",
1094				le32_to_cpu(border));
1095	}
1096
1097	/*
1098	 * If an error occurs, then we stop processing
1099	 * and mark the filesystem read-only. The index won't
1100	 * be inserted and the tree will remain in a consistent
1101	 * state. The next mount will repair the buffers too.
1102	 */
1103
1104	/*
1105	 * Get an array to track all allocated blocks.
1106	 * We need this to handle errors and free the
1107	 * blocks upon failure.
1108	 */
1109	ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags);
1110	if (!ablocks)
1111		return -ENOMEM;
1112
1113	/* allocate all needed blocks */
1114	ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at);
1115	for (a = 0; a < depth - at; a++) {
1116		newblock = ext4_ext_new_meta_block(handle, inode, path,
1117						   newext, &err, flags);
1118		if (newblock == 0)
1119			goto cleanup;
1120		ablocks[a] = newblock;
1121	}
1122
1123	/* initialize new leaf */
1124	newblock = ablocks[--a];
1125	if (unlikely(newblock == 0)) {
1126		EXT4_ERROR_INODE(inode, "newblock == 0!");
1127		err = -EFSCORRUPTED;
1128		goto cleanup;
1129	}
1130	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1131	if (unlikely(!bh)) {
1132		err = -ENOMEM;
1133		goto cleanup;
1134	}
1135	lock_buffer(bh);
1136
1137	err = ext4_journal_get_create_access(handle, bh);
1138	if (err)
1139		goto cleanup;
1140
1141	neh = ext_block_hdr(bh);
1142	neh->eh_entries = 0;
1143	neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1144	neh->eh_magic = EXT4_EXT_MAGIC;
1145	neh->eh_depth = 0;
1146	neh->eh_generation = 0;
1147
1148	/* move remainder of path[depth] to the new leaf */
1149	if (unlikely(path[depth].p_hdr->eh_entries !=
1150		     path[depth].p_hdr->eh_max)) {
1151		EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
1152				 path[depth].p_hdr->eh_entries,
1153				 path[depth].p_hdr->eh_max);
1154		err = -EFSCORRUPTED;
1155		goto cleanup;
1156	}
1157	/* start copy from next extent */
1158	m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
1159	ext4_ext_show_move(inode, path, newblock, depth);
1160	if (m) {
1161		struct ext4_extent *ex;
1162		ex = EXT_FIRST_EXTENT(neh);
1163		memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
1164		le16_add_cpu(&neh->eh_entries, m);
1165	}
1166
1167	/* zero out unused area in the extent block */
1168	ext_size = sizeof(struct ext4_extent_header) +
1169		sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries);
1170	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1171	ext4_extent_block_csum_set(inode, neh);
1172	set_buffer_uptodate(bh);
1173	unlock_buffer(bh);
1174
1175	err = ext4_handle_dirty_metadata(handle, inode, bh);
1176	if (err)
1177		goto cleanup;
1178	brelse(bh);
1179	bh = NULL;
1180
1181	/* correct old leaf */
1182	if (m) {
1183		err = ext4_ext_get_access(handle, inode, path + depth);
1184		if (err)
1185			goto cleanup;
1186		le16_add_cpu(&path[depth].p_hdr->eh_entries, -m);
1187		err = ext4_ext_dirty(handle, inode, path + depth);
1188		if (err)
1189			goto cleanup;
1190
1191	}
1192
1193	/* create intermediate indexes */
1194	k = depth - at - 1;
1195	if (unlikely(k < 0)) {
1196		EXT4_ERROR_INODE(inode, "k %d < 0!", k);
1197		err = -EFSCORRUPTED;
1198		goto cleanup;
1199	}
1200	if (k)
1201		ext_debug(inode, "create %d intermediate indices\n", k);
1202	/* insert new index into current index block */
1203	/* current depth stored in i var */
1204	i = depth - 1;
1205	while (k--) {
1206		oldblock = newblock;
1207		newblock = ablocks[--a];
1208		bh = sb_getblk(inode->i_sb, newblock);
1209		if (unlikely(!bh)) {
1210			err = -ENOMEM;
1211			goto cleanup;
1212		}
1213		lock_buffer(bh);
1214
1215		err = ext4_journal_get_create_access(handle, bh);
1216		if (err)
1217			goto cleanup;
1218
1219		neh = ext_block_hdr(bh);
1220		neh->eh_entries = cpu_to_le16(1);
1221		neh->eh_magic = EXT4_EXT_MAGIC;
1222		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1223		neh->eh_depth = cpu_to_le16(depth - i);
1224		neh->eh_generation = 0;
1225		fidx = EXT_FIRST_INDEX(neh);
1226		fidx->ei_block = border;
1227		ext4_idx_store_pblock(fidx, oldblock);
1228
1229		ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n",
1230				i, newblock, le32_to_cpu(border), oldblock);
1231
1232		/* move remainder of path[i] to the new index block */
1233		if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
1234					EXT_LAST_INDEX(path[i].p_hdr))) {
1235			EXT4_ERROR_INODE(inode,
1236					 "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
1237					 le32_to_cpu(path[i].p_ext->ee_block));
1238			err = -EFSCORRUPTED;
1239			goto cleanup;
1240		}
1241		/* start copy indexes */
1242		m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
1243		ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx,
1244				EXT_MAX_INDEX(path[i].p_hdr));
1245		ext4_ext_show_move(inode, path, newblock, i);
1246		if (m) {
1247			memmove(++fidx, path[i].p_idx,
1248				sizeof(struct ext4_extent_idx) * m);
1249			le16_add_cpu(&neh->eh_entries, m);
1250		}
1251		/* zero out unused area in the extent block */
1252		ext_size = sizeof(struct ext4_extent_header) +
1253		   (sizeof(struct ext4_extent) * le16_to_cpu(neh->eh_entries));
1254		memset(bh->b_data + ext_size, 0,
1255			inode->i_sb->s_blocksize - ext_size);
1256		ext4_extent_block_csum_set(inode, neh);
1257		set_buffer_uptodate(bh);
1258		unlock_buffer(bh);
1259
1260		err = ext4_handle_dirty_metadata(handle, inode, bh);
1261		if (err)
1262			goto cleanup;
1263		brelse(bh);
1264		bh = NULL;
1265
1266		/* correct old index */
1267		if (m) {
1268			err = ext4_ext_get_access(handle, inode, path + i);
1269			if (err)
1270				goto cleanup;
1271			le16_add_cpu(&path[i].p_hdr->eh_entries, -m);
1272			err = ext4_ext_dirty(handle, inode, path + i);
1273			if (err)
1274				goto cleanup;
1275		}
1276
1277		i--;
1278	}
1279
1280	/* insert new index */
1281	err = ext4_ext_insert_index(handle, inode, path + at,
1282				    le32_to_cpu(border), newblock);
1283
1284cleanup:
1285	if (bh) {
1286		if (buffer_locked(bh))
1287			unlock_buffer(bh);
1288		brelse(bh);
1289	}
1290
1291	if (err) {
1292		/* free all allocated blocks in error case */
1293		for (i = 0; i < depth; i++) {
1294			if (!ablocks[i])
1295				continue;
1296			ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1297					 EXT4_FREE_BLOCKS_METADATA);
1298		}
1299	}
1300	kfree(ablocks);
1301
1302	return err;
1303}
1304
1305/*
1306 * ext4_ext_grow_indepth:
1307 * implements tree growing procedure:
1308 * - allocates new block
1309 * - moves top-level data (index block or leaf) into the new block
1310 * - initializes new top-level, creating index that points to the
1311 *   just created block
1312 */
1313static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1314				 unsigned int flags)
1315{
1316	struct ext4_extent_header *neh;
1317	struct buffer_head *bh;
1318	ext4_fsblk_t newblock, goal = 0;
1319	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1320	int err = 0;
1321	size_t ext_size = 0;
1322
1323	/* Try to prepend new index to old one */
1324	if (ext_depth(inode))
1325		goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1326	if (goal > le32_to_cpu(es->s_first_data_block)) {
1327		flags |= EXT4_MB_HINT_TRY_GOAL;
1328		goal--;
1329	} else
1330		goal = ext4_inode_to_goal_block(inode);
1331	newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1332					NULL, &err);
1333	if (newblock == 0)
1334		return err;
1335
1336	bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
1337	if (unlikely(!bh))
1338		return -ENOMEM;
1339	lock_buffer(bh);
1340
1341	err = ext4_journal_get_create_access(handle, bh);
1342	if (err) {
1343		unlock_buffer(bh);
1344		goto out;
1345	}
1346
1347	ext_size = sizeof(EXT4_I(inode)->i_data);
1348	/* move top-level index/leaf into new block */
1349	memmove(bh->b_data, EXT4_I(inode)->i_data, ext_size);
1350	/* zero out unused area in the extent block */
1351	memset(bh->b_data + ext_size, 0, inode->i_sb->s_blocksize - ext_size);
1352
1353	/* set size of new block */
1354	neh = ext_block_hdr(bh);
1355	/* old root could have indexes or leaves
1356	 * so calculate eh_max the right way */
1357	if (ext_depth(inode))
1358		neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1359	else
1360		neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1361	neh->eh_magic = EXT4_EXT_MAGIC;
1362	ext4_extent_block_csum_set(inode, neh);
1363	set_buffer_uptodate(bh);
1364	unlock_buffer(bh);
1365
1366	err = ext4_handle_dirty_metadata(handle, inode, bh);
1367	if (err)
1368		goto out;
1369
1370	/* Update top-level index: num,max,pointer */
1371	neh = ext_inode_hdr(inode);
1372	neh->eh_entries = cpu_to_le16(1);
1373	ext4_idx_store_pblock(EXT_FIRST_INDEX(neh), newblock);
1374	if (neh->eh_depth == 0) {
1375		/* Root extent block becomes index block */
1376		neh->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1377		EXT_FIRST_INDEX(neh)->ei_block =
1378			EXT_FIRST_EXTENT(neh)->ee_block;
1379	}
1380	ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n",
1381		  le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1382		  le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1383		  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
1384
1385	le16_add_cpu(&neh->eh_depth, 1);
1386	err = ext4_mark_inode_dirty(handle, inode);
1387out:
1388	brelse(bh);
1389
1390	return err;
1391}
1392
1393/*
1394 * ext4_ext_create_new_leaf:
1395 * finds empty index and adds new leaf.
1396 * if no free index is found, then it requests in-depth growing.
1397 */
1398static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1399				    unsigned int mb_flags,
1400				    unsigned int gb_flags,
1401				    struct ext4_ext_path **ppath,
1402				    struct ext4_extent *newext)
1403{
1404	struct ext4_ext_path *path = *ppath;
1405	struct ext4_ext_path *curp;
1406	int depth, i, err = 0;
1407
1408repeat:
1409	i = depth = ext_depth(inode);
1410
1411	/* walk up the tree and look for a free index entry */
1412	curp = path + depth;
1413	while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) {
1414		i--;
1415		curp--;
1416	}
1417
1418	/* we use already allocated block for index block,
1419	 * so subsequent data blocks should be contiguous */
1420	if (EXT_HAS_FREE_INDEX(curp)) {
1421		/* if we found index with free entry, then use that
1422		 * entry: create all needed subtree and add new leaf */
1423		err = ext4_ext_split(handle, inode, mb_flags, path, newext, i);
1424		if (err)
1425			goto out;
1426
1427		/* refill path */
1428		path = ext4_find_extent(inode,
1429				    (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1430				    ppath, gb_flags);
1431		if (IS_ERR(path))
1432			err = PTR_ERR(path);
1433	} else {
1434		/* tree is full, time to grow in depth */
1435		err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1436		if (err)
1437			goto out;
1438
1439		/* refill path */
1440		path = ext4_find_extent(inode,
1441				   (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1442				    ppath, gb_flags);
1443		if (IS_ERR(path)) {
1444			err = PTR_ERR(path);
1445			goto out;
1446		}
1447
1448		/*
1449		 * only first (depth 0 -> 1) produces free space;
1450		 * in all other cases we have to split the grown tree
1451		 */
1452		depth = ext_depth(inode);
1453		if (path[depth].p_hdr->eh_entries == path[depth].p_hdr->eh_max) {
1454			/* now we need to split */
1455			goto repeat;
1456		}
1457	}
1458
1459out:
1460	return err;
1461}
1462
1463/*
1464 * search the closest allocated block to the left for *logical
1465 * and return it at @logical + its physical address at @phys
1466 * if *logical is the smallest allocated block, the function
1467 * returns 0 at @phys
1468 * return value contains 0 (success) or error code
1469 */
1470static int ext4_ext_search_left(struct inode *inode,
1471				struct ext4_ext_path *path,
1472				ext4_lblk_t *logical, ext4_fsblk_t *phys)
1473{
1474	struct ext4_extent_idx *ix;
1475	struct ext4_extent *ex;
1476	int depth, ee_len;
1477
1478	if (unlikely(path == NULL)) {
1479		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1480		return -EFSCORRUPTED;
1481	}
1482	depth = path->p_depth;
1483	*phys = 0;
1484
1485	if (depth == 0 && path->p_ext == NULL)
1486		return 0;
1487
1488	/* usually the extent in the path covers blocks smaller
1489	 * than *logical, but it can be that the extent is the
1490	 * first one in the file */
1491
1492	ex = path[depth].p_ext;
1493	ee_len = ext4_ext_get_actual_len(ex);
1494	if (*logical < le32_to_cpu(ex->ee_block)) {
1495		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1496			EXT4_ERROR_INODE(inode,
1497					 "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
1498					 *logical, le32_to_cpu(ex->ee_block));
1499			return -EFSCORRUPTED;
1500		}
1501		while (--depth >= 0) {
1502			ix = path[depth].p_idx;
1503			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1504				EXT4_ERROR_INODE(inode,
1505				  "ix (%d) != EXT_FIRST_INDEX (%d) (depth %d)!",
1506				  ix != NULL ? le32_to_cpu(ix->ei_block) : 0,
1507				  EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
1508		le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
1509				  depth);
1510				return -EFSCORRUPTED;
1511			}
1512		}
1513		return 0;
1514	}
1515
1516	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1517		EXT4_ERROR_INODE(inode,
1518				 "logical %d < ee_block %d + ee_len %d!",
1519				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1520		return -EFSCORRUPTED;
1521	}
1522
1523	*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
1524	*phys = ext4_ext_pblock(ex) + ee_len - 1;
1525	return 0;
1526}
1527
1528/*
1529 * Search for the closest allocated block to the right of *logical
1530 * and return it at @logical, with its physical address at @phys.
1531 * If none exists, return 0 and set @phys to 0. Otherwise we return
1532 * 1 which means we found an allocated block and ret_ex is valid.
1533 * Or return a (< 0) error code.
1534 */
1535static int ext4_ext_search_right(struct inode *inode,
1536				 struct ext4_ext_path *path,
1537				 ext4_lblk_t *logical, ext4_fsblk_t *phys,
1538				 struct ext4_extent *ret_ex)
1539{
1540	struct buffer_head *bh = NULL;
1541	struct ext4_extent_header *eh;
1542	struct ext4_extent_idx *ix;
1543	struct ext4_extent *ex;
1544	int depth;	/* Note, NOT eh_depth; depth from top of tree */
1545	int ee_len;
1546
1547	if (unlikely(path == NULL)) {
1548		EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
1549		return -EFSCORRUPTED;
1550	}
1551	depth = path->p_depth;
1552	*phys = 0;
1553
1554	if (depth == 0 && path->p_ext == NULL)
1555		return 0;
1556
1557	/* usually the extent in the path covers blocks smaller
1558	 * than *logical, but it can be that the extent is the
1559	 * first one in the file */
1560
1561	ex = path[depth].p_ext;
1562	ee_len = ext4_ext_get_actual_len(ex);
1563	if (*logical < le32_to_cpu(ex->ee_block)) {
1564		if (unlikely(EXT_FIRST_EXTENT(path[depth].p_hdr) != ex)) {
1565			EXT4_ERROR_INODE(inode,
1566					 "first_extent(path[%d].p_hdr) != ex",
1567					 depth);
1568			return -EFSCORRUPTED;
1569		}
1570		while (--depth >= 0) {
1571			ix = path[depth].p_idx;
1572			if (unlikely(ix != EXT_FIRST_INDEX(path[depth].p_hdr))) {
1573				EXT4_ERROR_INODE(inode,
1574						 "ix != EXT_FIRST_INDEX *logical %d!",
1575						 *logical);
1576				return -EFSCORRUPTED;
1577			}
1578		}
1579		goto found_extent;
1580	}
1581
1582	if (unlikely(*logical < (le32_to_cpu(ex->ee_block) + ee_len))) {
1583		EXT4_ERROR_INODE(inode,
1584				 "logical %d < ee_block %d + ee_len %d!",
1585				 *logical, le32_to_cpu(ex->ee_block), ee_len);
1586		return -EFSCORRUPTED;
1587	}
1588
1589	if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
1590		/* next allocated block in this leaf */
1591		ex++;
1592		goto found_extent;
1593	}
1594
1595	/* go up and search for index to the right */
1596	while (--depth >= 0) {
1597		ix = path[depth].p_idx;
1598		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
1599			goto got_index;
1600	}
1601
1602	/* we've gone up to the root and found no index to the right */
1603	return 0;
1604
1605got_index:
1606	/* we've found index to the right, let's
1607	 * follow it and find the closest allocated
1608	 * block to the right */
1609	ix++;
1610	while (++depth < path->p_depth) {
1611		/* subtract from p_depth to get proper eh_depth */
1612		bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
1613		if (IS_ERR(bh))
1614			return PTR_ERR(bh);
1615		eh = ext_block_hdr(bh);
1616		ix = EXT_FIRST_INDEX(eh);
1617		put_bh(bh);
1618	}
1619
1620	bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
1621	if (IS_ERR(bh))
1622		return PTR_ERR(bh);
1623	eh = ext_block_hdr(bh);
1624	ex = EXT_FIRST_EXTENT(eh);
1625found_extent:
1626	*logical = le32_to_cpu(ex->ee_block);
1627	*phys = ext4_ext_pblock(ex);
1628	if (ret_ex)
1629		*ret_ex = *ex;
1630	if (bh)
1631		put_bh(bh);
1632	return 1;
1633}
1634
1635/*
1636 * ext4_ext_next_allocated_block:
1637 * returns allocated block in subsequent extent or EXT_MAX_BLOCKS.
1638 * NOTE: it considers block number from index entry as
1639 * allocated block. Thus, index entries have to be consistent
1640 * with leaves.
1641 */
1642ext4_lblk_t
1643ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1644{
1645	int depth;
1646
1647	BUG_ON(path == NULL);
1648	depth = path->p_depth;
1649
1650	if (depth == 0 && path->p_ext == NULL)
1651		return EXT_MAX_BLOCKS;
1652
1653	while (depth >= 0) {
1654		struct ext4_ext_path *p = &path[depth];
1655
1656		if (depth == path->p_depth) {
1657			/* leaf */
1658			if (p->p_ext && p->p_ext != EXT_LAST_EXTENT(p->p_hdr))
1659				return le32_to_cpu(p->p_ext[1].ee_block);
1660		} else {
1661			/* index */
1662			if (p->p_idx != EXT_LAST_INDEX(p->p_hdr))
1663				return le32_to_cpu(p->p_idx[1].ei_block);
1664		}
1665		depth--;
1666	}
1667
1668	return EXT_MAX_BLOCKS;
1669}
1670
1671/*
1672 * ext4_ext_next_leaf_block:
1673 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1674 */
1675static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1676{
1677	int depth;
1678
1679	BUG_ON(path == NULL);
1680	depth = path->p_depth;
1681
1682	/* zero-tree has no leaf blocks at all */
1683	if (depth == 0)
1684		return EXT_MAX_BLOCKS;
1685
1686	/* go to index block */
1687	depth--;
1688
1689	while (depth >= 0) {
1690		if (path[depth].p_idx !=
1691				EXT_LAST_INDEX(path[depth].p_hdr))
1692			return (ext4_lblk_t)
1693				le32_to_cpu(path[depth].p_idx[1].ei_block);
1694		depth--;
1695	}
1696
1697	return EXT_MAX_BLOCKS;
1698}
1699
1700/*
1701 * ext4_ext_correct_indexes:
1702 * if a leaf gets modified and the modified extent is first in the leaf,
1703 * then we have to correct all indexes above.
1704 * TODO: do we need to correct tree in all cases?
1705 */
1706static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
1707				struct ext4_ext_path *path)
1708{
1709	struct ext4_extent_header *eh;
1710	int depth = ext_depth(inode);
1711	struct ext4_extent *ex;
1712	__le32 border;
1713	int k, err = 0;
1714
1715	eh = path[depth].p_hdr;
1716	ex = path[depth].p_ext;
1717
1718	if (unlikely(ex == NULL || eh == NULL)) {
1719		EXT4_ERROR_INODE(inode,
1720				 "ex %p == NULL or eh %p == NULL", ex, eh);
1721		return -EFSCORRUPTED;
1722	}
1723
1724	if (depth == 0) {
1725		/* there is no tree at all */
1726		return 0;
1727	}
1728
1729	if (ex != EXT_FIRST_EXTENT(eh)) {
1730		/* we correct tree if first leaf got modified only */
1731		return 0;
1732	}
1733
1734	/*
1735	 * TODO: we need correction if border is smaller than current one
1736	 */
1737	k = depth - 1;
1738	border = path[depth].p_ext->ee_block;
1739	err = ext4_ext_get_access(handle, inode, path + k);
1740	if (err)
1741		return err;
1742	path[k].p_idx->ei_block = border;
1743	err = ext4_ext_dirty(handle, inode, path + k);
1744	if (err)
1745		return err;
1746
1747	while (k--) {
1748		/* change all left-side indexes */
1749		if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr))
1750			break;
1751		err = ext4_ext_get_access(handle, inode, path + k);
1752		if (err)
1753			break;
1754		path[k].p_idx->ei_block = border;
1755		err = ext4_ext_dirty(handle, inode, path + k);
1756		if (err)
1757			break;
1758	}
1759
1760	return err;
1761}
1762
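/*
 * Two extents can be merged only if they share the same written/unwritten
 * state, are logically and physically contiguous, and the combined length
 * does not exceed the on-disk limit for their type.
 */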
1763static int ext4_can_extents_be_merged(struct inode *inode,
1764				      struct ext4_extent *ex1,
1765				      struct ext4_extent *ex2)
1766{
1767	unsigned short ext1_ee_len, ext2_ee_len;
1768
1769	if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1770		return 0;
1771
1772	ext1_ee_len = ext4_ext_get_actual_len(ex1);
1773	ext2_ee_len = ext4_ext_get_actual_len(ex2);
1774
1775	if (le32_to_cpu(ex1->ee_block) + ext1_ee_len !=
1776			le32_to_cpu(ex2->ee_block))
1777		return 0;
1778
1779	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1780		return 0;
1781
1782	if (ext4_ext_is_unwritten(ex1) &&
1783	    ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)
1784		return 0;
1785#ifdef AGGRESSIVE_TEST
1786	if (ext1_ee_len >= 4)
1787		return 0;
1788#endif
1789
1790	if (ext4_ext_pblock(ex1) + ext1_ee_len == ext4_ext_pblock(ex2))
1791		return 1;
1792	return 0;
1793}
1794
1795/*
1796 * This function tries to merge the "ex" extent to the next extent in the tree.
1797 * It always tries to merge towards the right. If you want to merge towards
1798 * the left, pass "ex - 1" as the argument instead of "ex".
1799 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1800 * 1 if they got merged.
1801 */
1802static int ext4_ext_try_to_merge_right(struct inode *inode,
1803				 struct ext4_ext_path *path,
1804				 struct ext4_extent *ex)
1805{
1806	struct ext4_extent_header *eh;
1807	unsigned int depth, len;
1808	int merge_done = 0, unwritten;
1809
1810	depth = ext_depth(inode);
1811	BUG_ON(path[depth].p_hdr == NULL);
1812	eh = path[depth].p_hdr;
1813
1814	while (ex < EXT_LAST_EXTENT(eh)) {
1815		if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
1816			break;
1817		/* merge with next extent! */
1818		unwritten = ext4_ext_is_unwritten(ex);
1819		ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
1820				+ ext4_ext_get_actual_len(ex + 1));
1821		if (unwritten)
1822			ext4_ext_mark_unwritten(ex);
1823
1824		if (ex + 1 < EXT_LAST_EXTENT(eh)) {
1825			len = (EXT_LAST_EXTENT(eh) - ex - 1)
1826				* sizeof(struct ext4_extent);
1827			memmove(ex + 1, ex + 2, len);
1828		}
1829		le16_add_cpu(&eh->eh_entries, -1);
1830		merge_done = 1;
1831		WARN_ON(eh->eh_entries == 0);
1832		if (!eh->eh_entries)
1833			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1834	}
1835
1836	return merge_done;
1837}
1838
1839/*
1840 * This function does a very simple check to see if we can collapse
1841 * an extent tree with a single extent tree leaf block into the inode.
1842 */
1843static void ext4_ext_try_to_merge_up(handle_t *handle,
1844				     struct inode *inode,
1845				     struct ext4_ext_path *path)
1846{
1847	size_t s;
1848	unsigned max_root = ext4_ext_space_root(inode, 0);
1849	ext4_fsblk_t blk;
1850
1851	if ((path[0].p_depth != 1) ||
1852	    (le16_to_cpu(path[0].p_hdr->eh_entries) != 1) ||
1853	    (le16_to_cpu(path[1].p_hdr->eh_entries) > max_root))
1854		return;
1855
1856	/*
1857	 * We need to modify the block allocation bitmap and the block
1858	 * group descriptor to release the extent tree block.  If we
1859	 * can't get the journal credits, give up.
1860	 */
1861	if (ext4_journal_extend(handle, 2,
1862			ext4_free_metadata_revoke_credits(inode->i_sb, 1)))
1863		return;
1864
1865	/*
1866	 * Copy the extent data up to the inode
1867	 */
1868	blk = ext4_idx_pblock(path[0].p_idx);
1869	s = le16_to_cpu(path[1].p_hdr->eh_entries) *
1870		sizeof(struct ext4_extent_idx);
1871	s += sizeof(struct ext4_extent_header);
1872
1873	path[1].p_maxdepth = path[0].p_maxdepth;
1874	memcpy(path[0].p_hdr, path[1].p_hdr, s);
1875	path[0].p_depth = 0;
1876	path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
1877		(path[1].p_ext - EXT_FIRST_EXTENT(path[1].p_hdr));
1878	path[0].p_hdr->eh_max = cpu_to_le16(max_root);
1879
1880	brelse(path[1].p_bh);
1881	ext4_free_blocks(handle, inode, NULL, blk, 1,
1882			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1883}
1884
1885/*
1886 * This function tries to merge the @ex extent to neighbours in the tree, then
1887 * tries to collapse the extent tree into the inode.
1888 */
1889static void ext4_ext_try_to_merge(handle_t *handle,
1890				  struct inode *inode,
1891				  struct ext4_ext_path *path,
1892				  struct ext4_extent *ex)
1893{
1894	struct ext4_extent_header *eh;
1895	unsigned int depth;
1896	int merge_done = 0;
1897
1898	depth = ext_depth(inode);
1899	BUG_ON(path[depth].p_hdr == NULL);
1900	eh = path[depth].p_hdr;
1901
1902	if (ex > EXT_FIRST_EXTENT(eh))
1903		merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1904
1905	if (!merge_done)
1906		(void) ext4_ext_try_to_merge_right(inode, path, ex);
1907
1908	ext4_ext_try_to_merge_up(handle, inode, path);
1909}
1910
1911/*
1912 * check if a portion of the "newext" extent overlaps with an
1913 * existing extent.
1914 *
1915 * If there is an overlap discovered, it updates the length of the newext
1916 * such that there will be no overlap, and then returns 1.
1917 * If there is no overlap found, it returns 0.
1918 */
1919static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
1920					   struct inode *inode,
1921					   struct ext4_extent *newext,
1922					   struct ext4_ext_path *path)
1923{
1924	ext4_lblk_t b1, b2;
1925	unsigned int depth, len1;
1926	unsigned int ret = 0;
1927
1928	b1 = le32_to_cpu(newext->ee_block);
1929	len1 = ext4_ext_get_actual_len(newext);
1930	depth = ext_depth(inode);
1931	if (!path[depth].p_ext)
1932		goto out;
1933	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
1934
1935	/*
1936	 * get the next allocated block if the extent in the path
1937	 * is before the requested block(s)
1938	 */
1939	if (b2 < b1) {
1940		b2 = ext4_ext_next_allocated_block(path);
1941		if (b2 == EXT_MAX_BLOCKS)
1942			goto out;
1943		b2 = EXT4_LBLK_CMASK(sbi, b2);
1944	}
1945
1946	/* check for wrap through zero on extent logical start block*/
1947	if (b1 + len1 < b1) {
1948		len1 = EXT_MAX_BLOCKS - b1;
1949		newext->ee_len = cpu_to_le16(len1);
1950		ret = 1;
1951	}
1952
1953	/* check for overlap */
1954	if (b1 + len1 > b2) {
1955		newext->ee_len = cpu_to_le16(b2 - b1);
1956		ret = 1;
1957	}
1958out:
1959	return ret;
1960}
1961
1962/*
1963 * ext4_ext_insert_extent:
1964 * tries to merge the requested extent into an existing extent or
1965 * inserts the requested extent as a new one into the tree,
1966 * creating a new leaf in the no-space case.
1967 */
1968int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1969				struct ext4_ext_path **ppath,
1970				struct ext4_extent *newext, int gb_flags)
1971{
1972	struct ext4_ext_path *path = *ppath;
1973	struct ext4_extent_header *eh;
1974	struct ext4_extent *ex, *fex;
1975	struct ext4_extent *nearex; /* nearest extent */
1976	struct ext4_ext_path *npath = NULL;
1977	int depth, len, err;
1978	ext4_lblk_t next;
1979	int mb_flags = 0, unwritten;
1980
1981	if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1982		mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1983	if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1984		EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1985		return -EFSCORRUPTED;
1986	}
1987	depth = ext_depth(inode);
1988	ex = path[depth].p_ext;
1989	eh = path[depth].p_hdr;
1990	if (unlikely(path[depth].p_hdr == NULL)) {
1991		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
1992		return -EFSCORRUPTED;
1993	}
1994
1995	/* try to insert block into found extent and return */
1996	if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) {
1997
1998		/*
1999		 * Decide whether we should test the extent to the right of
2000		 * ex or to the left of ex. ext4_find_extent() can return
2001		 * either the extent on the left or on the right of the
2002		 * searched position, so moving towards the neighbour that
2003		 * abuts newext makes merging more effective.
2004		 */
2005		if (ex < EXT_LAST_EXTENT(eh) &&
2006		    (le32_to_cpu(ex->ee_block) +
2007		    ext4_ext_get_actual_len(ex) <
2008		    le32_to_cpu(newext->ee_block))) {
2009			ex += 1;
2010			goto prepend;
2011		} else if ((ex > EXT_FIRST_EXTENT(eh)) &&
2012			   (le32_to_cpu(newext->ee_block) +
2013			   ext4_ext_get_actual_len(newext) <
2014			   le32_to_cpu(ex->ee_block)))
2015			ex -= 1;
2016
2017		/* Try to append newex to the ex */
2018		if (ext4_can_extents_be_merged(inode, ex, newext)) {
2019			ext_debug(inode, "append [%d]%d block to %u:[%d]%d"
2020				  "(from %llu)\n",
2021				  ext4_ext_is_unwritten(newext),
2022				  ext4_ext_get_actual_len(newext),
2023				  le32_to_cpu(ex->ee_block),
2024				  ext4_ext_is_unwritten(ex),
2025				  ext4_ext_get_actual_len(ex),
2026				  ext4_ext_pblock(ex));
2027			err = ext4_ext_get_access(handle, inode,
2028						  path + depth);
2029			if (err)
2030				return err;
2031			unwritten = ext4_ext_is_unwritten(ex);
2032			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2033					+ ext4_ext_get_actual_len(newext));
2034			if (unwritten)
2035				ext4_ext_mark_unwritten(ex);
2036			eh = path[depth].p_hdr;
2037			nearex = ex;
2038			goto merge;
2039		}
2040
2041prepend:
2042		/* Try to prepend newex to the ex */
2043		if (ext4_can_extents_be_merged(inode, newext, ex)) {
2044			ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d"
2045				  "(from %llu)\n",
2046				  le32_to_cpu(newext->ee_block),
2047				  ext4_ext_is_unwritten(newext),
2048				  ext4_ext_get_actual_len(newext),
2049				  le32_to_cpu(ex->ee_block),
2050				  ext4_ext_is_unwritten(ex),
2051				  ext4_ext_get_actual_len(ex),
2052				  ext4_ext_pblock(ex));
2053			err = ext4_ext_get_access(handle, inode,
2054						  path + depth);
2055			if (err)
2056				return err;
2057
2058			unwritten = ext4_ext_is_unwritten(ex);
2059			ex->ee_block = newext->ee_block;
2060			ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
2061			ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
2062					+ ext4_ext_get_actual_len(newext));
2063			if (unwritten)
2064				ext4_ext_mark_unwritten(ex);
2065			eh = path[depth].p_hdr;
2066			nearex = ex;
2067			goto merge;
2068		}
2069	}
2070
2071	depth = ext_depth(inode);
2072	eh = path[depth].p_hdr;
2073	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
2074		goto has_space;
2075
2076	/* probably next leaf has space for us? */
2077	fex = EXT_LAST_EXTENT(eh);
2078	next = EXT_MAX_BLOCKS;
2079	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
2080		next = ext4_ext_next_leaf_block(path);
2081	if (next != EXT_MAX_BLOCKS) {
2082		ext_debug(inode, "next leaf block - %u\n", next);
2083		BUG_ON(npath != NULL);
2084		npath = ext4_find_extent(inode, next, NULL, gb_flags);
2085		if (IS_ERR(npath))
2086			return PTR_ERR(npath);
2087		BUG_ON(npath->p_depth != path->p_depth);
2088		eh = npath[depth].p_hdr;
2089		if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
2090			ext_debug(inode, "next leaf isn't full(%d)\n",
2091				  le16_to_cpu(eh->eh_entries));
2092			path = npath;
2093			goto has_space;
2094		}
2095		ext_debug(inode, "next leaf has no free space(%d,%d)\n",
2096			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
2097	}
2098
2099	/*
2100	 * There is no free space in the found leaf.
2101	 * We're going to add a new leaf to the tree.
2102	 */
2103	if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2104		mb_flags |= EXT4_MB_USE_RESERVED;
2105	err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2106				       ppath, newext);
2107	if (err)
2108		goto cleanup;
2109	depth = ext_depth(inode);
2110	eh = path[depth].p_hdr;
2111
2112has_space:
2113	nearex = path[depth].p_ext;
2114
2115	err = ext4_ext_get_access(handle, inode, path + depth);
2116	if (err)
2117		goto cleanup;
2118
2119	if (!nearex) {
2120		/* there is no extent in this leaf, create first one */
2121		ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n",
2122				le32_to_cpu(newext->ee_block),
2123				ext4_ext_pblock(newext),
2124				ext4_ext_is_unwritten(newext),
2125				ext4_ext_get_actual_len(newext));
2126		nearex = EXT_FIRST_EXTENT(eh);
2127	} else {
2128		if (le32_to_cpu(newext->ee_block)
2129			   > le32_to_cpu(nearex->ee_block)) {
2130			/* Insert after */
2131			ext_debug(inode, "insert %u:%llu:[%d]%d before: "
2132					"nearest %p\n",
2133					le32_to_cpu(newext->ee_block),
2134					ext4_ext_pblock(newext),
2135					ext4_ext_is_unwritten(newext),
2136					ext4_ext_get_actual_len(newext),
2137					nearex);
2138			nearex++;
2139		} else {
2140			/* Insert before */
2141			BUG_ON(newext->ee_block == nearex->ee_block);
2142			ext_debug(inode, "insert %u:%llu:[%d]%d after: "
2143					"nearest %p\n",
2144					le32_to_cpu(newext->ee_block),
2145					ext4_ext_pblock(newext),
2146					ext4_ext_is_unwritten(newext),
2147					ext4_ext_get_actual_len(newext),
2148					nearex);
2149		}
2150		len = EXT_LAST_EXTENT(eh) - nearex + 1;
2151		if (len > 0) {
2152			ext_debug(inode, "insert %u:%llu:[%d]%d: "
2153					"move %d extents from 0x%p to 0x%p\n",
2154					le32_to_cpu(newext->ee_block),
2155					ext4_ext_pblock(newext),
2156					ext4_ext_is_unwritten(newext),
2157					ext4_ext_get_actual_len(newext),
2158					len, nearex, nearex + 1);
2159			memmove(nearex + 1, nearex,
2160				len * sizeof(struct ext4_extent));
2161		}
2162	}
2163
2164	le16_add_cpu(&eh->eh_entries, 1);
2165	path[depth].p_ext = nearex;
2166	nearex->ee_block = newext->ee_block;
2167	ext4_ext_store_pblock(nearex, ext4_ext_pblock(newext));
2168	nearex->ee_len = newext->ee_len;
2169
2170merge:
2171	/* try to merge extents */
2172	if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO))
2173		ext4_ext_try_to_merge(handle, inode, path, nearex);
2174
2175
2176	/* time to correct all indexes above */
2177	err = ext4_ext_correct_indexes(handle, inode, path);
2178	if (err)
2179		goto cleanup;
2180
2181	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2182
2183cleanup:
2184	ext4_ext_drop_refs(npath);
2185	kfree(npath);
2186	return err;
2187}
2188
2189static int ext4_fill_es_cache_info(struct inode *inode,
2190				   ext4_lblk_t block, ext4_lblk_t num,
2191				   struct fiemap_extent_info *fieinfo)
2192{
2193	ext4_lblk_t next, end = block + num - 1;
2194	struct extent_status es;
2195	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
2196	unsigned int flags;
2197	int err;
2198
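	/*
	 * Walk the cached extent status records from 'block' to 'end' and
	 * report each one as a fiemap extent.
	 */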
2199	while (block <= end) {
2200		next = 0;
2201		flags = 0;
2202		if (!ext4_es_lookup_extent(inode, block, &next, &es))
2203			break;
2204		if (ext4_es_is_unwritten(&es))
2205			flags |= FIEMAP_EXTENT_UNWRITTEN;
2206		if (ext4_es_is_delayed(&es))
2207			flags |= (FIEMAP_EXTENT_DELALLOC |
2208				  FIEMAP_EXTENT_UNKNOWN);
2209		if (ext4_es_is_hole(&es))
2210			flags |= EXT4_FIEMAP_EXTENT_HOLE;
2211		if (next == 0)
2212			flags |= FIEMAP_EXTENT_LAST;
2213		if (flags & (FIEMAP_EXTENT_DELALLOC|
2214			     EXT4_FIEMAP_EXTENT_HOLE))
2215			es.es_pblk = 0;
2216		else
2217			es.es_pblk = ext4_es_pblock(&es);
2218		err = fiemap_fill_next_extent(fieinfo,
2219				(__u64)es.es_lblk << blksize_bits,
2220				(__u64)es.es_pblk << blksize_bits,
2221				(__u64)es.es_len << blksize_bits,
2222				flags);
2223		if (next == 0)
2224			break;
2225		block = next;
2226		if (err < 0)
2227			return err;
2228		if (err == 1)
2229			return 0;
2230	}
2231	return 0;
2232}
2233
2234
2235/*
2236 * ext4_ext_determine_hole - determine hole around given block
2237 * @inode:	inode we lookup in
2238 * @path:	path in extent tree to @lblk
2239 * @lblk:	pointer to logical block around which we want to determine hole
2240 *
2241 * Determine the hole length (and start, if easily possible) around the given
2242 * logical block. We don't try too hard to find the beginning of the hole, but
2243 * if @path actually points to the extent before @lblk, we provide it.
2244 *
2245 * The function returns the length of a hole starting at @lblk. We update @lblk
2246 * to the beginning of the hole if we managed to find it.
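 *
 * For example, if the extent preceding @lblk covers blocks 100..149 and
 * the next allocated block is 300, a lookup at block 200 sets *lblk to
 * 150 and returns a hole length of 150 blocks.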
2247 */
2248static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2249					   struct ext4_ext_path *path,
2250					   ext4_lblk_t *lblk)
2251{
2252	int depth = ext_depth(inode);
2253	struct ext4_extent *ex;
2254	ext4_lblk_t len;
2255
2256	ex = path[depth].p_ext;
2257	if (ex == NULL) {
2258		/* there is no extent yet, so gap is [0;-] */
2259		*lblk = 0;
2260		len = EXT_MAX_BLOCKS;
2261	} else if (*lblk < le32_to_cpu(ex->ee_block)) {
2262		len = le32_to_cpu(ex->ee_block) - *lblk;
2263	} else if (*lblk >= le32_to_cpu(ex->ee_block)
2264			+ ext4_ext_get_actual_len(ex)) {
2265		ext4_lblk_t next;
2266
2267		*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2268		next = ext4_ext_next_allocated_block(path);
2269		BUG_ON(next == *lblk);
2270		len = next - *lblk;
2271	} else {
2272		BUG();
2273	}
2274	return len;
2275}
2276
2277/*
2278 * ext4_ext_put_gap_in_cache:
2279 * calculate boundaries of the gap that the requested block fits into
2280 * and cache this gap
2281 */
2282static void
2283ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2284			  ext4_lblk_t hole_len)
2285{
2286	struct extent_status es;
2287
2288	ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
2289				  hole_start + hole_len - 1, &es);
2290	if (es.es_len) {
2291		/* Is there a delayed extent containing lblock? */
2292		if (es.es_lblk <= hole_start)
2293			return;
2294		hole_len = min(es.es_lblk - hole_start, hole_len);
2295	}
2296	ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
2297	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2298			      EXTENT_STATUS_HOLE);
2299}
2300
2301/*
2302 * ext4_ext_rm_idx:
2303 * removes index from the index block.
2304 */
2305static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2306			struct ext4_ext_path *path, int depth)
2307{
2308	int err;
2309	ext4_fsblk_t leaf;
2310
2311	/* free index block */
2312	depth--;
2313	path = path + depth;
2314	leaf = ext4_idx_pblock(path->p_idx);
2315	if (unlikely(path->p_hdr->eh_entries == 0)) {
2316		EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
2317		return -EFSCORRUPTED;
2318	}
2319	err = ext4_ext_get_access(handle, inode, path);
2320	if (err)
2321		return err;
2322
2323	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2324		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2325		len *= sizeof(struct ext4_extent_idx);
2326		memmove(path->p_idx, path->p_idx + 1, len);
2327	}
2328
2329	le16_add_cpu(&path->p_hdr->eh_entries, -1);
2330	err = ext4_ext_dirty(handle, inode, path);
2331	if (err)
2332		return err;
2333	ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf);
2334	trace_ext4_ext_rm_idx(inode, leaf);
2335
2336	ext4_free_blocks(handle, inode, NULL, leaf, 1,
2337			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2338
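	/*
	 * If the index we now point at is the first one in its block, the
	 * parent's key must match it; propagate the updated key upward for
	 * as long as that condition holds.
	 */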
2339	while (--depth >= 0) {
2340		if (path->p_idx != EXT_FIRST_INDEX(path->p_hdr))
2341			break;
2342		path--;
2343		err = ext4_ext_get_access(handle, inode, path);
2344		if (err)
2345			break;
2346		path->p_idx->ei_block = (path+1)->p_idx->ei_block;
2347		err = ext4_ext_dirty(handle, inode, path);
2348		if (err)
2349			break;
2350	}
2351	return err;
2352}
2353
2354/*
2355 * ext4_ext_calc_credits_for_single_extent:
2356 * This routine returns the maximum number of credits needed to insert an
2357 * extent into the extent tree.
2358 * When passing the actual path, the caller should calculate the credits
2359 * under i_data_sem.
2360 */
2361int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
2362						struct ext4_ext_path *path)
2363{
2364	if (path) {
2365		int depth = ext_depth(inode);
2366		int ret = 0;
2367
2368		/* probably there is space in leaf? */
2369		if (le16_to_cpu(path[depth].p_hdr->eh_entries)
2370				< le16_to_cpu(path[depth].p_hdr->eh_max)) {
2371
2372			/*
2373			 *  There is some space in the leaf, so no need
2374			 *  to account for a leaf block credit.
2375			 *
2376			 *  Bitmaps, block group descriptor blocks and
2377			 *  other metadata blocks still need to be
2378			 *  accounted for.
2379			 */
2380			/* 1 bitmap, 1 block group descriptor */
2381			ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
2382			return ret;
2383		}
2384	}
2385
2386	return ext4_chunk_trans_blocks(inode, nrblocks);
2387}
2388
2389/*
2390 * How many index/leaf blocks need to be changed/allocated to add @extents extents?
2391 *
2392 * If we add a single extent, then in the worst case, each tree level's
2393 * index/leaf needs to be changed in case the tree splits.
2394 *
2395 * If more extents are inserted, they could cause the whole tree to split more
2396 * than once, but this is really rare.
2397 */
2398int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
2399{
2400	int index;
2401	int depth;
2402
2403	/* If we are converting the inline data, only one block is needed here. */
2404	if (ext4_has_inline_data(inode))
2405		return 1;
2406
2407	depth = ext_depth(inode);
2408
2409	if (extents <= 1)
2410		index = depth * 2;
2411	else
2412		index = depth * 3;
2413
2414	return index;
2415}
2416
2417static inline int get_default_free_blocks_flags(struct inode *inode)
2418{
2419	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
2420	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
2421		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
2422	else if (ext4_should_journal_data(inode))
2423		return EXT4_FREE_BLOCKS_FORGET;
2424	return 0;
2425}
2426
2427/*
2428 * ext4_rereserve_cluster - increment the reserved cluster count when
2429 *                          freeing a cluster with a pending reservation
2430 *
2431 * @inode - file containing the cluster
2432 * @lblk - logical block in cluster to be reserved
2433 *
2434 * Increments the reserved cluster count and adjusts quota in a bigalloc
2435 * file system when freeing a partial cluster containing at least one
2436 * delayed and unwritten block.  A partial cluster meeting that
2437 * requirement will have a pending reservation.  If so, the
2438 * RERESERVE_CLUSTER flag is used when calling ext4_free_blocks() to
2439 * defer reserved and allocated space accounting to a subsequent call
2440 * to this function.
2441 */
2442static void ext4_rereserve_cluster(struct inode *inode, ext4_lblk_t lblk)
2443{
2444	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2445	struct ext4_inode_info *ei = EXT4_I(inode);
2446
2447	dquot_reclaim_block(inode, EXT4_C2B(sbi, 1));
2448
2449	spin_lock(&ei->i_block_reservation_lock);
2450	ei->i_reserved_data_blocks++;
2451	percpu_counter_add(&sbi->s_dirtyclusters_counter, 1);
2452	spin_unlock(&ei->i_block_reservation_lock);
2453
2454	percpu_counter_add(&sbi->s_freeclusters_counter, 1);
2455	ext4_remove_pending(inode, lblk);
2456}
2457
2458static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2459			      struct ext4_extent *ex,
2460			      struct partial_cluster *partial,
2461			      ext4_lblk_t from, ext4_lblk_t to)
2462{
2463	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2464	unsigned short ee_len = ext4_ext_get_actual_len(ex);
2465	ext4_fsblk_t last_pblk, pblk;
2466	ext4_lblk_t num;
2467	int flags;
2468
2469	/* only extent tail removal is allowed */
2470	if (from < le32_to_cpu(ex->ee_block) ||
2471	    to != le32_to_cpu(ex->ee_block) + ee_len - 1) {
2472		ext4_error(sbi->s_sb,
2473			   "strange request: removal(2) %u-%u from %u:%u",
2474			   from, to, le32_to_cpu(ex->ee_block), ee_len);
2475		return 0;
2476	}
2477
2478#ifdef EXTENTS_STATS
2479	spin_lock(&sbi->s_ext_stats_lock);
2480	sbi->s_ext_blocks += ee_len;
2481	sbi->s_ext_extents++;
2482	if (ee_len < sbi->s_ext_min)
2483		sbi->s_ext_min = ee_len;
2484	if (ee_len > sbi->s_ext_max)
2485		sbi->s_ext_max = ee_len;
2486	if (ext_depth(inode) > sbi->s_depth_max)
2487		sbi->s_depth_max = ext_depth(inode);
2488	spin_unlock(&sbi->s_ext_stats_lock);
2489#endif
2490
2491	trace_ext4_remove_blocks(inode, ex, from, to, partial);
2492
2493	/*
2494	 * if we have a partial cluster, and it's different from the
2495	 * cluster of the last block in the extent, we free it
2496	 */
2497	last_pblk = ext4_ext_pblock(ex) + ee_len - 1;
2498
2499	if (partial->state != initial &&
2500	    partial->pclu != EXT4_B2C(sbi, last_pblk)) {
2501		if (partial->state == tofree) {
2502			flags = get_default_free_blocks_flags(inode);
2503			if (ext4_is_pending(inode, partial->lblk))
2504				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2505			ext4_free_blocks(handle, inode, NULL,
2506					 EXT4_C2B(sbi, partial->pclu),
2507					 sbi->s_cluster_ratio, flags);
2508			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2509				ext4_rereserve_cluster(inode, partial->lblk);
2510		}
2511		partial->state = initial;
2512	}
2513
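	/*
	 * num is the number of blocks being removed, from 'from' to the end
	 * of the extent; pblk is the physical block where that tail begins.
	 */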
2514	num = le32_to_cpu(ex->ee_block) + ee_len - from;
2515	pblk = ext4_ext_pblock(ex) + ee_len - num;
2516
2517	/*
2518	 * We free the partial cluster at the end of the extent (if any),
2519	 * unless the cluster is used by another extent (partial_cluster
2520	 * state is nofree).  If a partial cluster exists here, it must be
2521	 * shared with the last block in the extent.
2522	 */
2523	flags = get_default_free_blocks_flags(inode);
2524
2525	/* partial, left end cluster aligned, right end unaligned */
2526	if ((EXT4_LBLK_COFF(sbi, to) != sbi->s_cluster_ratio - 1) &&
2527	    (EXT4_LBLK_CMASK(sbi, to) >= from) &&
2528	    (partial->state != nofree)) {
2529		if (ext4_is_pending(inode, to))
2530			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2531		ext4_free_blocks(handle, inode, NULL,
2532				 EXT4_PBLK_CMASK(sbi, last_pblk),
2533				 sbi->s_cluster_ratio, flags);
2534		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2535			ext4_rereserve_cluster(inode, to);
2536		partial->state = initial;
2537		flags = get_default_free_blocks_flags(inode);
2538	}
2539
2540	flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
2541
2542	/*
2543	 * For bigalloc file systems, we never free a partial cluster
2544	 * at the beginning of the extent.  Instead, we check to see if we
2545	 * need to free it on a subsequent call to ext4_remove_blocks,
2546	 * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space.
2547	 */
2548	flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER;
2549	ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
2550
2551	/* reset the partial cluster if we've freed past it */
2552	if (partial->state != initial && partial->pclu != EXT4_B2C(sbi, pblk))
2553		partial->state = initial;
2554
2555	/*
2556	 * If we've freed the entire extent but the beginning is not left
2557	 * cluster aligned and is not marked as ineligible for freeing, we
2558	 * record the partial cluster at the beginning of the extent.  It
2559	 * wasn't freed by the preceding ext4_free_blocks() call, and we
2560	 * need to look farther to the left to determine if it's to be freed
2561	 * (not shared with another extent). Else, reset the partial
2562	 * cluster - we're either done freeing or the beginning of the
2563	 * extent is left cluster aligned.
2564	 */
2565	if (EXT4_LBLK_COFF(sbi, from) && num == ee_len) {
2566		if (partial->state == initial) {
2567			partial->pclu = EXT4_B2C(sbi, pblk);
2568			partial->lblk = from;
2569			partial->state = tofree;
2570		}
2571	} else {
2572		partial->state = initial;
2573	}
2574
2575	return 0;
2576}
2577
2578/*
2579 * ext4_ext_rm_leaf() Removes the extents associated with the
2580 * blocks appearing between "start" and "end".  Both "start"
2581 * and "end" must appear in the same extent or EIO is returned.
2582 *
2583 * @handle: The journal handle
2584 * @inode:  The file's inode
2585 * @path:   The path to the leaf
2586 * @partial: The cluster which we'll have to free if all extents
2587 *           have been released from it.  However, if its state is
2588 *           nofree, it's a cluster just to the right of the
2589 *           punched region and it must not be freed.
2590 * @start:  The first block to remove
2591 * @end:    The last block to remove
2592 */
2593static int
2594ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2595		 struct ext4_ext_path *path,
2596		 struct partial_cluster *partial,
2597		 ext4_lblk_t start, ext4_lblk_t end)
2598{
2599	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2600	int err = 0, correct_index = 0;
2601	int depth = ext_depth(inode), credits, revoke_credits;
2602	struct ext4_extent_header *eh;
2603	ext4_lblk_t a, b;
2604	unsigned num;
2605	ext4_lblk_t ex_ee_block;
2606	unsigned short ex_ee_len;
2607	unsigned unwritten = 0;
2608	struct ext4_extent *ex;
2609	ext4_fsblk_t pblk;
2610
2611	/* the header must be checked already in ext4_ext_remove_space() */
2612	ext_debug(inode, "truncate since %u in leaf to %u\n", start, end);
2613	if (!path[depth].p_hdr)
2614		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
2615	eh = path[depth].p_hdr;
2616	if (unlikely(path[depth].p_hdr == NULL)) {
2617		EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
2618		return -EFSCORRUPTED;
2619	}
2620	/* find where to start removing */
2621	ex = path[depth].p_ext;
2622	if (!ex)
2623		ex = EXT_LAST_EXTENT(eh);
2624
2625	ex_ee_block = le32_to_cpu(ex->ee_block);
2626	ex_ee_len = ext4_ext_get_actual_len(ex);
2627
2628	trace_ext4_ext_rm_leaf(inode, start, ex, partial);
2629
2630	while (ex >= EXT_FIRST_EXTENT(eh) &&
2631			ex_ee_block + ex_ee_len > start) {
2632
2633		if (ext4_ext_is_unwritten(ex))
2634			unwritten = 1;
2635		else
2636			unwritten = 0;
2637
2638		ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block,
2639			  unwritten, ex_ee_len);
2640		path[depth].p_ext = ex;
2641
2642		a = ex_ee_block > start ? ex_ee_block : start;
2643		b = ex_ee_block+ex_ee_len - 1 < end ?
2644			ex_ee_block+ex_ee_len - 1 : end;
2645
2646		ext_debug(inode, "  border %u:%u\n", a, b);
2647
2648		/* If this extent is beyond the end of the hole, skip it */
2649		if (end < ex_ee_block) {
2650			/*
2651			 * We're going to skip this extent and move to another,
2652			 * so note that its first cluster is in use to avoid
2653			 * freeing it when removing blocks.  Eventually, the
2654			 * right edge of the truncated/punched region will
2655			 * be just to the left.
2656			 */
2657			if (sbi->s_cluster_ratio > 1) {
2658				pblk = ext4_ext_pblock(ex);
2659				partial->pclu = EXT4_B2C(sbi, pblk);
2660				partial->state = nofree;
2661			}
2662			ex--;
2663			ex_ee_block = le32_to_cpu(ex->ee_block);
2664			ex_ee_len = ext4_ext_get_actual_len(ex);
2665			continue;
2666		} else if (b != ex_ee_block + ex_ee_len - 1) {
2667			EXT4_ERROR_INODE(inode,
2668					 "can not handle truncate %u:%u "
2669					 "on extent %u:%u",
2670					 start, end, ex_ee_block,
2671					 ex_ee_block + ex_ee_len - 1);
2672			err = -EFSCORRUPTED;
2673			goto out;
2674		} else if (a != ex_ee_block) {
2675			/* remove tail of the extent */
2676			num = a - ex_ee_block;
2677		} else {
2678			/* remove whole extent: excellent! */
2679			num = 0;
2680		}
2681		/*
2682		 * 3 for leaf, sb, and inode plus 2 (bmap and group
2683		 * descriptor) for each block group; assume two block
2684		 * groups plus ex_ee_len/blocks_per_block_group for
2685		 * the worst case
2686		 */
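		/*
		 * For example, with 32768 blocks per group, removing a
		 * 32768-block extent gives 7 + 2*1 = 9 credits here, before
		 * the index and quota credits added below.
		 */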
2687		credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
2688		if (ex == EXT_FIRST_EXTENT(eh)) {
2689			correct_index = 1;
2690			credits += (ext_depth(inode)) + 1;
2691		}
2692		credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2693		/*
2694		 * We may end up freeing some index blocks and data from the
2695		 * punched range. Note that partial clusters are accounted for
2696		 * by ext4_free_data_revoke_credits().
2697		 */
2698		revoke_credits =
2699			ext4_free_metadata_revoke_credits(inode->i_sb,
2700							  ext_depth(inode)) +
2701			ext4_free_data_revoke_credits(inode, b - a + 1);
2702
2703		err = ext4_datasem_ensure_credits(handle, inode, credits,
2704						  credits, revoke_credits);
2705		if (err) {
2706			if (err > 0)
2707				err = -EAGAIN;
2708			goto out;
2709		}
2710
2711		err = ext4_ext_get_access(handle, inode, path + depth);
2712		if (err)
2713			goto out;
2714
2715		err = ext4_remove_blocks(handle, inode, ex, partial, a, b);
2716		if (err)
2717			goto out;
2718
2719		if (num == 0)
2720			/* this extent is removed; mark slot entirely unused */
2721			ext4_ext_store_pblock(ex, 0);
2722
2723		ex->ee_len = cpu_to_le16(num);
2724		/*
2725		 * Do not mark unwritten if all the blocks in the
2726		 * extent have been removed.
2727		 */
2728		if (unwritten && num)
2729			ext4_ext_mark_unwritten(ex);
2730		/*
2731		 * If the extent was completely released,
2732		 * we need to remove it from the leaf
2733		 */
2734		if (num == 0) {
2735			if (end != EXT_MAX_BLOCKS - 1) {
2736				/*
2737				 * For hole punching, we need to scoot all the
2738				 * extents up when an extent is removed so that
2739				 * we don't have blank extents in the middle
2740				 */
2741				memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2742					sizeof(struct ext4_extent));
2743
2744				/* Now get rid of the one at the end */
2745				memset(EXT_LAST_EXTENT(eh), 0,
2746					sizeof(struct ext4_extent));
2747			}
2748			le16_add_cpu(&eh->eh_entries, -1);
2749		}
2750
2751		err = ext4_ext_dirty(handle, inode, path + depth);
2752		if (err)
2753			goto out;
2754
2755		ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num,
2756				ext4_ext_pblock(ex));
2757		ex--;
2758		ex_ee_block = le32_to_cpu(ex->ee_block);
2759		ex_ee_len = ext4_ext_get_actual_len(ex);
2760	}
2761
2762	if (correct_index && eh->eh_entries)
2763		err = ext4_ext_correct_indexes(handle, inode, path);
2764
2765	/*
2766	 * If there's a partial cluster and at least one extent remains in
2767	 * the leaf, free the partial cluster if it isn't shared with the
2768	 * current extent.  If it is shared with the current extent
2769	 * we reset the partial cluster because we've reached the start of the
2770	 * truncated/punched region and we're done removing blocks.
2771	 */
2772	if (partial->state == tofree && ex >= EXT_FIRST_EXTENT(eh)) {
2773		pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
2774		if (partial->pclu != EXT4_B2C(sbi, pblk)) {
2775			int flags = get_default_free_blocks_flags(inode);
2776
2777			if (ext4_is_pending(inode, partial->lblk))
2778				flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
2779			ext4_free_blocks(handle, inode, NULL,
2780					 EXT4_C2B(sbi, partial->pclu),
2781					 sbi->s_cluster_ratio, flags);
2782			if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
2783				ext4_rereserve_cluster(inode, partial->lblk);
2784		}
2785		partial->state = initial;
2786	}
2787
2788	/* if this leaf is now empty, we should
2789	 * remove it from the index block above */
2790	if (err == 0 && eh->eh_entries == 0 && path[depth].p_bh != NULL)
2791		err = ext4_ext_rm_idx(handle, inode, path, depth);
2792
2793out:
2794	return err;
2795}
2796
2797/*
2798 * ext4_ext_more_to_rm:
2799 * returns 1 if current index has to be freed (even partial)
2800 */
2801static int
2802ext4_ext_more_to_rm(struct ext4_ext_path *path)
2803{
2804	BUG_ON(path->p_idx == NULL);
2805
2806	if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr))
2807		return 0;
2808
2809	/*
2810	 * if a truncate at a deeper level happened, it wasn't partial,
2811	 * so we have to consider the current index for truncation
2812	 */
2813	if (le16_to_cpu(path->p_hdr->eh_entries) == path->p_block)
2814		return 0;
2815	return 1;
2816}
2817
2818int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2819			  ext4_lblk_t end)
2820{
2821	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2822	int depth = ext_depth(inode);
2823	struct ext4_ext_path *path = NULL;
2824	struct partial_cluster partial;
2825	handle_t *handle;
2826	int i = 0, err = 0;
2827
2828	partial.pclu = 0;
2829	partial.lblk = 0;
2830	partial.state = initial;
2831
2832	ext_debug(inode, "truncate since %u to %u\n", start, end);
2833
2834	/* probably the first extent we're going to free will be the last in the block */
2835	handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE,
2836			depth + 1,
2837			ext4_free_metadata_revoke_credits(inode->i_sb, depth));
2838	if (IS_ERR(handle))
2839		return PTR_ERR(handle);
2840
2841again:
2842	trace_ext4_ext_remove_space(inode, start, end, depth);
2843
2844	/*
2845	 * Check if we are removing extents inside the extent tree. If that
2846	 * is the case, we are going to punch a hole inside the extent tree
2847	 * so we have to check whether we need to split the extent covering
2848	 * the last block to remove so we can easily remove the part of it
2849	 * in ext4_ext_rm_leaf().
2850	 */
2851	if (end < EXT_MAX_BLOCKS - 1) {
2852		struct ext4_extent *ex;
2853		ext4_lblk_t ee_block, ex_end, lblk;
2854		ext4_fsblk_t pblk;
2855
2856		/* find extent for or closest extent to this block */
2857		path = ext4_find_extent(inode, end, NULL,
2858					EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
2859		if (IS_ERR(path)) {
2860			ext4_journal_stop(handle);
2861			return PTR_ERR(path);
2862		}
2863		depth = ext_depth(inode);
2864		/* A leaf may not exist only if the inode has no blocks at all */
2865		ex = path[depth].p_ext;
2866		if (!ex) {
2867			if (depth) {
2868				EXT4_ERROR_INODE(inode,
2869						 "path[%d].p_hdr == NULL",
2870						 depth);
2871				err = -EFSCORRUPTED;
2872			}
2873			goto out;
2874		}
2875
2876		ee_block = le32_to_cpu(ex->ee_block);
2877		ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1;
2878
2879		/*
2880		 * See if the last block is inside the extent, if so split
2881		 * the extent at 'end' block so we can easily remove the
2882		 * tail of the first part of the split extent in
2883		 * ext4_ext_rm_leaf().
2884		 */
2885		if (end >= ee_block && end < ex_end) {
2886
2887			/*
2888			 * If we're going to split the extent, note that
2889			 * the cluster containing the block after 'end' is
2890			 * in use to avoid freeing it when removing blocks.
2891			 */
2892			if (sbi->s_cluster_ratio > 1) {
2893				pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
2894				partial.pclu = EXT4_B2C(sbi, pblk);
2895				partial.state = nofree;
2896			}
2897
2898			/*
2899			 * Split the extent in two so that 'end' is the last
2900			 * block in the first new extent. Also we should not
2901			 * fail removing space due to ENOSPC so try to use
2902			 * reserved block if that happens.
2903			 */
2904			err = ext4_force_split_extent_at(handle, inode, &path,
2905							 end + 1, 1);
2906			if (err < 0)
2907				goto out;
2908
2909		} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
2910			   partial.state == initial) {
2911			/*
2912			 * If we're punching, there's an extent to the right.
2913			 * If the partial cluster hasn't been set, set it to
2914			 * that extent's first cluster and its state to nofree
2915			 * so it won't be freed should it contain blocks to be
2916			 * removed. If it's already set (tofree/nofree), we're
2917			 * retrying and keep the original partial cluster info
2918			 * so a cluster marked tofree as a result of earlier
2919			 * extent removal is not lost.
2920			 */
2921			lblk = ex_end + 1;
2922			err = ext4_ext_search_right(inode, path, &lblk, &pblk,
2923						    NULL);
2924			if (err < 0)
2925				goto out;
2926			if (pblk) {
2927				partial.pclu = EXT4_B2C(sbi, pblk);
2928				partial.state = nofree;
2929			}
2930		}
2931	}
2932	/*
2933	 * We start scanning from the right side, freeing all the blocks
2934	 * after i_size and walking into the tree depth-wise.
2935	 */
2936	depth = ext_depth(inode);
2937	if (path) {
2938		int k = i = depth;
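		/*
		 * We already hold a path from the lookup above; flag each
		 * intermediate level as not yet fully processed so that
		 * ext4_ext_more_to_rm() keeps descending through it.
		 */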
2939		while (--k > 0)
2940			path[k].p_block =
2941				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
2942	} else {
2943		path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
2944			       GFP_NOFS | __GFP_NOFAIL);
2945		if (path == NULL) {
2946			ext4_journal_stop(handle);
2947			return -ENOMEM;
2948		}
2949		path[0].p_maxdepth = path[0].p_depth = depth;
2950		path[0].p_hdr = ext_inode_hdr(inode);
2951		i = 0;
2952
2953		if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
2954			err = -EFSCORRUPTED;
2955			goto out;
2956		}
2957	}
2958	err = 0;
2959
2960	while (i >= 0 && err == 0) {
2961		if (i == depth) {
2962			/* this is leaf block */
2963			err = ext4_ext_rm_leaf(handle, inode, path,
2964					       &partial, start, end);
2965			/* root level has p_bh == NULL, brelse() eats this */
2966			brelse(path[i].p_bh);
2967			path[i].p_bh = NULL;
2968			i--;
2969			continue;
2970		}
2971
2972		/* this is index block */
2973		if (!path[i].p_hdr) {
2974			ext_debug(inode, "initialize header\n");
2975			path[i].p_hdr = ext_block_hdr(path[i].p_bh);
2976		}
2977
2978		if (!path[i].p_idx) {
2979			/* this level hasn't been touched yet */
2980			path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr);
2981			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1;
2982			ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n",
2983				  path[i].p_hdr,
2984				  le16_to_cpu(path[i].p_hdr->eh_entries));
2985		} else {
2986			/* we were already here, see at next index */
2987			path[i].p_idx--;
2988		}
2989
2990		ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n",
2991				i, EXT_FIRST_INDEX(path[i].p_hdr),
2992				path[i].p_idx);
2993		if (ext4_ext_more_to_rm(path + i)) {
2994			struct buffer_head *bh;
2995			/* go to the next level */
2996			ext_debug(inode, "move to level %d (block %llu)\n",
2997				  i + 1, ext4_idx_pblock(path[i].p_idx));
2998			memset(path + i + 1, 0, sizeof(*path));
2999			bh = read_extent_tree_block(inode, path[i].p_idx,
3000						    depth - i - 1,
3001						    EXT4_EX_NOCACHE);
3002			if (IS_ERR(bh)) {
3003				/* should we reset i_size? */
3004				err = PTR_ERR(bh);
3005				break;
3006			}
3007			/* Yield here to deal with large extent trees.
3008			 * Should be a no-op if we did IO above. */
3009			cond_resched();
3010			if (WARN_ON(i + 1 > depth)) {
3011				err = -EFSCORRUPTED;
3012				break;
3013			}
3014			path[i + 1].p_bh = bh;
3015
3016			/* save actual number of indexes since this
3017			 * number is changed at the next iteration */
3018			path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries);
3019			i++;
3020		} else {
3021			/* we finished processing this index, go up */
3022			if (path[i].p_hdr->eh_entries == 0 && i > 0) {
3023				/* index is empty, remove it;
3024				 * the handle must already be prepared by
3025				 * the leaf removal above */
3026				err = ext4_ext_rm_idx(handle, inode, path, i);
3027			}
3028			/* root level has p_bh == NULL, brelse() eats this */
3029			brelse(path[i].p_bh);
3030			path[i].p_bh = NULL;
3031			i--;
3032			ext_debug(inode, "return to level %d\n", i);
3033		}
3034	}
3035
3036	trace_ext4_ext_remove_space_done(inode, start, end, depth, &partial,
3037					 path->p_hdr->eh_entries);
3038
3039	/*
3040	 * if there's a partial cluster and we have removed the first extent
3041	 * in the file, then we also free the partial cluster, if any
3042	 */
3043	if (partial.state == tofree && err == 0) {
3044		int flags = get_default_free_blocks_flags(inode);
3045
3046		if (ext4_is_pending(inode, partial.lblk))
3047			flags |= EXT4_FREE_BLOCKS_RERESERVE_CLUSTER;
3048		ext4_free_blocks(handle, inode, NULL,
3049				 EXT4_C2B(sbi, partial.pclu),
3050				 sbi->s_cluster_ratio, flags);
3051		if (flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)
3052			ext4_rereserve_cluster(inode, partial.lblk);
3053		partial.state = initial;
3054	}
3055
3056	/* TODO: flexible tree reduction should be here */
3057	if (path->p_hdr->eh_entries == 0) {
3058		/*
3059		 * truncate to zero has freed the whole tree,
3060		 * so we need to correct eh_depth
3061		 */
3062		err = ext4_ext_get_access(handle, inode, path);
3063		if (err == 0) {
3064			ext_inode_hdr(inode)->eh_depth = 0;
3065			ext_inode_hdr(inode)->eh_max =
3066				cpu_to_le16(ext4_ext_space_root(inode, 0));
3067			err = ext4_ext_dirty(handle, inode, path);
3068		}
3069	}
3070out:
3071	ext4_ext_drop_refs(path);
3072	kfree(path);
3073	path = NULL;
3074	if (err == -EAGAIN)
3075		goto again;
3076	ext4_journal_stop(handle);
3077
3078	return err;
3079}
3080
3081/*
3082 * called at mount time
3083 */
3084void ext4_ext_init(struct super_block *sb)
3085{
3086	/*
3087	 * possible initialization would be here
3088	 */
3089
3090	if (ext4_has_feature_extents(sb)) {
3091#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
3092		printk(KERN_INFO "EXT4-fs: file extents enabled"
3093#ifdef AGGRESSIVE_TEST
3094		       ", aggressive tests"
3095#endif
3096#ifdef CHECK_BINSEARCH
3097		       ", check binsearch"
3098#endif
3099#ifdef EXTENTS_STATS
3100		       ", stats"
3101#endif
3102		       "\n");
3103#endif
3104#ifdef EXTENTS_STATS
3105		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
3106		EXT4_SB(sb)->s_ext_min = 1 << 30;
3107		EXT4_SB(sb)->s_ext_max = 0;
3108#endif
3109	}
3110}
3111
3112/*
3113 * called at umount time
3114 */
3115void ext4_ext_release(struct super_block *sb)
3116{
3117	if (!ext4_has_feature_extents(sb))
3118		return;
3119
3120#ifdef EXTENTS_STATS
3121	if (EXT4_SB(sb)->s_ext_blocks && EXT4_SB(sb)->s_ext_extents) {
3122		struct ext4_sb_info *sbi = EXT4_SB(sb);
3123		printk(KERN_ERR "EXT4-fs: %lu blocks in %lu extents (%lu ave)\n",
3124			sbi->s_ext_blocks, sbi->s_ext_extents,
3125			sbi->s_ext_blocks / sbi->s_ext_extents);
3126		printk(KERN_ERR "EXT4-fs: extents: %lu min, %lu max, max depth %lu\n",
3127			sbi->s_ext_min, sbi->s_ext_max, sbi->s_depth_max);
3128	}
3129#endif
3130}
3131
3132static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
3133{
3134	ext4_lblk_t  ee_block;
3135	ext4_fsblk_t ee_pblock;
3136	unsigned int ee_len;
3137
3138	ee_block  = le32_to_cpu(ex->ee_block);
3139	ee_len    = ext4_ext_get_actual_len(ex);
3140	ee_pblock = ext4_ext_pblock(ex);
3141
3142	if (ee_len == 0)
3143		return 0;
3144
3145	return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
3146				     EXTENT_STATUS_WRITTEN);
3147}
3148
3149/* FIXME!! we need to try to merge to left or right after zero-out  */
3150static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3151{
3152	ext4_fsblk_t ee_pblock;
3153	unsigned int ee_len;
3154
3155	ee_len    = ext4_ext_get_actual_len(ex);
3156	ee_pblock = ext4_ext_pblock(ex);
3157	return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
3158				  ee_len);
3159}
3160
3161/*
3162 * ext4_split_extent_at() splits an extent at given block.
3163 *
3164 * @handle: the journal handle
3165 * @inode: the file inode
3166 * @path: the path to the extent
3167 * @split: the logical block where the extent is split.
3168 * @split_flag: indicates whether the extent may be zeroed out if the split
3169 *		 fails, and the states (initialized or unwritten) of the new extents.
3170 * @flags: flags used to insert new extent to extent tree.
3171 *
3172 *
3173 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
3174 * states of which are determined by split_flag.
3175 *
3176 * There are two cases:
3177 *  a> the extent is split into two extents.
3178 *  b> no split is needed, and we just mark the extent.
3179 *
3180 * Returns 0 on success.
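 *
 * For example, splitting the extent covering blocks [100, 200) at block
 * 150 leaves [100, 150) in place and inserts a new extent for [150, 200),
 * each half marked initialized or unwritten according to split_flag.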
3181 */
3182static int ext4_split_extent_at(handle_t *handle,
3183			     struct inode *inode,
3184			     struct ext4_ext_path **ppath,
3185			     ext4_lblk_t split,
3186			     int split_flag,
3187			     int flags)
3188{
3189	struct ext4_ext_path *path = *ppath;
3190	ext4_fsblk_t newblock;
3191	ext4_lblk_t ee_block;
3192	struct ext4_extent *ex, newex, orig_ex, zero_ex;
3193	struct ext4_extent *ex2 = NULL;
3194	unsigned int ee_len, depth;
3195	int err = 0;
3196
3197	BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
3198	       (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
3199
3200	ext_debug(inode, "logical block %llu\n", (unsigned long long)split);
3201
3202	ext4_ext_show_leaf(inode, path);
3203
3204	depth = ext_depth(inode);
3205	ex = path[depth].p_ext;
3206	ee_block = le32_to_cpu(ex->ee_block);
3207	ee_len = ext4_ext_get_actual_len(ex);
3208	newblock = split - ee_block + ext4_ext_pblock(ex);
3209
3210	BUG_ON(split < ee_block || split >= (ee_block + ee_len));
3211	BUG_ON(!ext4_ext_is_unwritten(ex) &&
3212	       split_flag & (EXT4_EXT_MAY_ZEROOUT |
3213			     EXT4_EXT_MARK_UNWRIT1 |
3214			     EXT4_EXT_MARK_UNWRIT2));
3215
3216	err = ext4_ext_get_access(handle, inode, path + depth);
3217	if (err)
3218		goto out;
3219
3220	if (split == ee_block) {
3221		/*
3222		 * case b: block @split is the block that the extent begins with
3223		 * then we just change the state of the extent, and splitting
3224		 * is not needed.
3225		 */
3226		if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3227			ext4_ext_mark_unwritten(ex);
3228		else
3229			ext4_ext_mark_initialized(ex);
3230
3231		if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
3232			ext4_ext_try_to_merge(handle, inode, path, ex);
3233
3234		err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3235		goto out;
3236	}
3237
3238	/* case a */
3239	memcpy(&orig_ex, ex, sizeof(orig_ex));
3240	ex->ee_len = cpu_to_le16(split - ee_block);
3241	if (split_flag & EXT4_EXT_MARK_UNWRIT1)
3242		ext4_ext_mark_unwritten(ex);
3243
3244	/*
3245	 * path may lead to new leaf, not to original leaf any more
3246	 * after ext4_ext_insert_extent() returns,
3247	 */
3248	err = ext4_ext_dirty(handle, inode, path + depth);
3249	if (err)
3250		goto fix_extent_len;
3251
3252	ex2 = &newex;
3253	ex2->ee_block = cpu_to_le32(split);
3254	ex2->ee_len   = cpu_to_le16(ee_len - (split - ee_block));
3255	ext4_ext_store_pblock(ex2, newblock);
3256	if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3257		ext4_ext_mark_unwritten(ex2);
3258
3259	err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3260	if (err != -ENOSPC && err != -EDQUOT)
3261		goto out;
3262
3263	if (EXT4_EXT_MAY_ZEROOUT & split_flag) {
3264		if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3265			if (split_flag & EXT4_EXT_DATA_VALID1) {
3266				err = ext4_ext_zeroout(inode, ex2);
3267				zero_ex.ee_block = ex2->ee_block;
3268				zero_ex.ee_len = cpu_to_le16(
3269						ext4_ext_get_actual_len(ex2));
3270				ext4_ext_store_pblock(&zero_ex,
3271						      ext4_ext_pblock(ex2));
3272			} else {
3273				err = ext4_ext_zeroout(inode, ex);
3274				zero_ex.ee_block = ex->ee_block;
3275				zero_ex.ee_len = cpu_to_le16(
3276						ext4_ext_get_actual_len(ex));
3277				ext4_ext_store_pblock(&zero_ex,
3278						      ext4_ext_pblock(ex));
3279			}
3280		} else {
3281			err = ext4_ext_zeroout(inode, &orig_ex);
3282			zero_ex.ee_block = orig_ex.ee_block;
3283			zero_ex.ee_len = cpu_to_le16(
3284						ext4_ext_get_actual_len(&orig_ex));
3285			ext4_ext_store_pblock(&zero_ex,
3286					      ext4_ext_pblock(&orig_ex));
3287		}
3288
3289		if (!err) {
3290			/* update the extent length and mark as initialized */
3291			ex->ee_len = cpu_to_le16(ee_len);
3292			ext4_ext_try_to_merge(handle, inode, path, ex);
3293			err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3294			if (!err)
3295				/* update extent status tree */
3296				err = ext4_zeroout_es(inode, &zero_ex);
3297			/* If we failed at this point, we don't know in which
3298			 * state the extent tree exactly is so don't try to fix
3299			 * length of the original extent as it may do even more
3300			 * damage.
3301			 */
3302			goto out;
3303		}
3304	}
3305
3306fix_extent_len:
3307	ex->ee_len = orig_ex.ee_len;
3308	/*
3309	 * Ignore ext4_ext_dirty return value since we are already in error path
3310	 * and err is a non-zero error code.
3311	 */
3312	ext4_ext_dirty(handle, inode, path + path->p_depth);
3313	return err;
3314out:
3315	ext4_ext_show_leaf(inode, path);
3316	return err;
3317}
3318
3319/*
3320 * ext4_split_extent() splits an extent and marks the extent which is covered
3321 * by @map as split_flag indicates.
3322 *
3323 * It may result in splitting the extent into multiple extents (up to three)
3324 * There are three possibilities:
3325 *   a> There is no split required
3326 *   b> Splits in two extents: Split is happening at either end of the extent
3327 *   c> Splits in three extents: Someone is splitting in the middle of the extent
3328 *
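 * For example, mapping blocks [120, 140) inside an extent covering
 * [100, 200) first splits off [140, 200) and then [100, 120), leaving
 * [120, 140) as a separate extent.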
3329 */
3330static int ext4_split_extent(handle_t *handle,
3331			      struct inode *inode,
3332			      struct ext4_ext_path **ppath,
3333			      struct ext4_map_blocks *map,
3334			      int split_flag,
3335			      int flags)
3336{
3337	struct ext4_ext_path *path = *ppath;
3338	ext4_lblk_t ee_block;
3339	struct ext4_extent *ex;
3340	unsigned int ee_len, depth;
3341	int err = 0;
3342	int unwritten;
3343	int split_flag1, flags1;
3344	int allocated = map->m_len;
3345
3346	depth = ext_depth(inode);
3347	ex = path[depth].p_ext;
3348	ee_block = le32_to_cpu(ex->ee_block);
3349	ee_len = ext4_ext_get_actual_len(ex);
3350	unwritten = ext4_ext_is_unwritten(ex);
3351
3352	if (map->m_lblk + map->m_len < ee_block + ee_len) {
3353		split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
3354		flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
3355		if (unwritten)
3356			split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
3357				       EXT4_EXT_MARK_UNWRIT2;
3358		if (split_flag & EXT4_EXT_DATA_VALID2)
3359			split_flag1 |= EXT4_EXT_DATA_VALID1;
3360		err = ext4_split_extent_at(handle, inode, ppath,
3361				map->m_lblk + map->m_len, split_flag1, flags1);
3362		if (err)
3363			goto out;
3364	} else {
3365		allocated = ee_len - (map->m_lblk - ee_block);
3366	}
3367	/*
3368	 * An updated path is required because the previous ext4_split_extent_at()
3369	 * may have split the original leaf or zeroed out the extent.
3370	 */
3371	path = ext4_find_extent(inode, map->m_lblk, ppath, flags);
3372	if (IS_ERR(path))
3373		return PTR_ERR(path);
3374	depth = ext_depth(inode);
3375	ex = path[depth].p_ext;
3376	if (!ex) {
3377		EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3378				 (unsigned long) map->m_lblk);
3379		return -EFSCORRUPTED;
3380	}
3381	unwritten = ext4_ext_is_unwritten(ex);
3382	split_flag1 = 0;
3383
3384	if (map->m_lblk >= ee_block) {
3385		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
3386		if (unwritten) {
3387			split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
3388			split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3389						     EXT4_EXT_MARK_UNWRIT2);
3390		}
3391		err = ext4_split_extent_at(handle, inode, ppath,
3392				map->m_lblk, split_flag1, flags);
3393		if (err)
3394			goto out;
3395	}
3396
3397	ext4_ext_show_leaf(inode, path);
3398out:
3399	return err ? err : allocated;
3400}
3401
3402/*
3403 * This function is called by ext4_ext_map_blocks() if someone tries to write
3404 * to an unwritten extent. It may result in splitting the unwritten
3405 * extent into multiple extents (up to three - one initialized and two
3406 * unwritten).
3407 * There are three possibilities:
3408 *   a> There is no split required: Entire extent should be initialized
3409 *   b> Splits in two extents: Write is happening at either end of the extent
3410 *   c> Splits in three extents: Someone is writing in the middle of the extent
3411 *
3412 * Pre-conditions:
3413 *  - The extent pointed to by 'path' is unwritten.
3414 *  - The extent pointed to by 'path' contains a superset
3415 *    of the logical span [map->m_lblk, map->m_lblk + map->m_len).
3416 *
3417 * Post-conditions on success:
3418 *  - the returned value is the number of blocks beyond map->m_lblk
3419 *    that are allocated and initialized.
3420 *    It is guaranteed to be >= map->m_len.
3421 */
3422static int ext4_ext_convert_to_initialized(handle_t *handle,
3423					   struct inode *inode,
3424					   struct ext4_map_blocks *map,
3425					   struct ext4_ext_path **ppath,
3426					   int flags)
3427{
3428	struct ext4_ext_path *path = *ppath;
3429	struct ext4_sb_info *sbi;
3430	struct ext4_extent_header *eh;
3431	struct ext4_map_blocks split_map;
3432	struct ext4_extent zero_ex1, zero_ex2;
3433	struct ext4_extent *ex, *abut_ex;
3434	ext4_lblk_t ee_block, eof_block;
3435	unsigned int ee_len, depth, map_len = map->m_len;
3436	int allocated = 0, max_zeroout = 0;
3437	int err = 0;
3438	int split_flag = EXT4_EXT_DATA_VALID2;
3439
3440	ext_debug(inode, "logical block %llu, max_blocks %u\n",
3441		  (unsigned long long)map->m_lblk, map_len);
3442
3443	sbi = EXT4_SB(inode->i_sb);
3444	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3445			>> inode->i_sb->s_blocksize_bits;
3446	if (eof_block < map->m_lblk + map_len)
3447		eof_block = map->m_lblk + map_len;
3448
3449	depth = ext_depth(inode);
3450	eh = path[depth].p_hdr;
3451	ex = path[depth].p_ext;
3452	ee_block = le32_to_cpu(ex->ee_block);
3453	ee_len = ext4_ext_get_actual_len(ex);
3454	zero_ex1.ee_len = 0;
3455	zero_ex2.ee_len = 0;
3456
3457	trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
3458
3459	/* Pre-conditions */
3460	BUG_ON(!ext4_ext_is_unwritten(ex));
3461	BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
3462
3463	/*
3464	 * Attempt to transfer newly initialized blocks from the currently
3465	 * unwritten extent to its neighbor. This is much cheaper
3466	 * than an insertion followed by a merge as those involve costly
3467	 * memmove() calls. Transferring to the left is the common case in
3468	 * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
3469	 * followed by append writes.
3470	 *
3471	 * Limitations of the current logic:
3472	 *  - L1: we do not deal with writes covering the whole extent.
3473	 *    This would require removing the extent if the transfer
3474	 *    is possible.
3475	 *  - L2: we only attempt to merge with an extent stored in the
3476	 *    same extent tree node.
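	 *
	 * For example, if abut_ex is an initialized extent covering logical
	 * blocks 90..99 at physical blocks 1090..1099, and ex is a longer
	 * unwritten extent starting at logical block 100 / physical block
	 * 1100, then a 10-block write at block 100 simply grows abut_ex to
	 * cover 90..109 and shrinks ex, with no insertion or memmove needed.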
3477	 */
3478	if ((map->m_lblk == ee_block) &&
3479		/* See if we can merge left */
3480		(map_len < ee_len) &&		/*L1*/
3481		(ex > EXT_FIRST_EXTENT(eh))) {	/*L2*/
3482		ext4_lblk_t prev_lblk;
3483		ext4_fsblk_t prev_pblk, ee_pblk;
3484		unsigned int prev_len;
3485
3486		abut_ex = ex - 1;
3487		prev_lblk = le32_to_cpu(abut_ex->ee_block);
3488		prev_len = ext4_ext_get_actual_len(abut_ex);
3489		prev_pblk = ext4_ext_pblock(abut_ex);
3490		ee_pblk = ext4_ext_pblock(ex);
3491
3492		/*
3493		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3494		 * upon those conditions:
3495		 * - C1: abut_ex is initialized,
3496		 * - C2: abut_ex is logically abutting ex,
3497		 * - C3: abut_ex is physically abutting ex,
3498		 * - C4: abut_ex can receive the additional blocks without
3499		 *   overflowing the (initialized) length limit.
3500		 */
3501		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
3502			((prev_lblk + prev_len) == ee_block) &&		/*C2*/
3503			((prev_pblk + prev_len) == ee_pblk) &&		/*C3*/
3504			(prev_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
3505			err = ext4_ext_get_access(handle, inode, path + depth);
3506			if (err)
3507				goto out;
3508
3509			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3510				map, ex, abut_ex);
3511
3512			/* Shift the start of ex by 'map_len' blocks */
3513			ex->ee_block = cpu_to_le32(ee_block + map_len);
3514			ext4_ext_store_pblock(ex, ee_pblk + map_len);
3515			ex->ee_len = cpu_to_le16(ee_len - map_len);
3516			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3517
3518			/* Extend abut_ex by 'map_len' blocks */
3519			abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
3520
3521			/* Result: number of initialized blocks past m_lblk */
3522			allocated = map_len;
3523		}
3524	} else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
3525		   (map_len < ee_len) &&	/*L1*/
3526		   ex < EXT_LAST_EXTENT(eh)) {	/*L2*/
3527		/* See if we can merge right */
3528		ext4_lblk_t next_lblk;
3529		ext4_fsblk_t next_pblk, ee_pblk;
3530		unsigned int next_len;
3531
3532		abut_ex = ex + 1;
3533		next_lblk = le32_to_cpu(abut_ex->ee_block);
3534		next_len = ext4_ext_get_actual_len(abut_ex);
3535		next_pblk = ext4_ext_pblock(abut_ex);
3536		ee_pblk = ext4_ext_pblock(ex);
3537
3538		/*
3539		 * A transfer of blocks from 'ex' to 'abut_ex' is allowed
3540		 * upon those conditions:
3541		 * - C1: abut_ex is initialized,
3542		 * - C2: abut_ex is logically abutting ex,
3543		 * - C3: abut_ex is physically abutting ex,
3544		 * - C4: abut_ex can receive the additional blocks without
3545		 *   overflowing the (initialized) length limit.
3546		 */
3547		if ((!ext4_ext_is_unwritten(abut_ex)) &&		/*C1*/
3548		    ((map->m_lblk + map_len) == next_lblk) &&		/*C2*/
3549		    ((ee_pblk + ee_len) == next_pblk) &&		/*C3*/
3550		    (next_len < (EXT_INIT_MAX_LEN - map_len))) {	/*C4*/
3551			err = ext4_ext_get_access(handle, inode, path + depth);
3552			if (err)
3553				goto out;
3554
3555			trace_ext4_ext_convert_to_initialized_fastpath(inode,
3556				map, ex, abut_ex);
3557
3558			/* Shift the start of abut_ex by 'map_len' blocks */
3559			abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
3560			ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
3561			ex->ee_len = cpu_to_le16(ee_len - map_len);
3562			ext4_ext_mark_unwritten(ex); /* Restore the flag */
3563
3564			/* Extend abut_ex by 'map_len' blocks */
3565			abut_ex->ee_len = cpu_to_le16(next_len + map_len);
3566
3567			/* Result: number of initialized blocks past m_lblk */
3568			allocated = map_len;
3569		}
3570	}
3571	if (allocated) {
3572		/* Mark the block containing both extents as dirty */
3573		err = ext4_ext_dirty(handle, inode, path + depth);
3574
3575		/* Update path to point to the right extent */
3576		path[depth].p_ext = abut_ex;
3577		goto out;
3578	} else
3579		allocated = ee_len - (map->m_lblk - ee_block);
3580
3581	WARN_ON(map->m_lblk < ee_block);
3582	/*
3583	 * It is safe to convert extent to initialized via explicit
3584	 * zeroout only if extent is fully inside i_size or new_size.
3585	 */
3586	split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
3587
3588	if (EXT4_EXT_MAY_ZEROOUT & split_flag)
3589		max_zeroout = sbi->s_extent_max_zeroout_kb >>
3590			(inode->i_sb->s_blocksize_bits - 10);
3591
3592	/*
3593	 * five cases:
3594	 * 1. split the extent into three extents.
3595	 * 2. split the extent into two extents, zeroout the head of the first
3596	 *    extent.
3597	 * 3. split the extent into two extents, zeroout the tail of the second
3598	 *    extent.
3599	 * 4. split the extent into two extents without zeroout.
3600	 * 5. no splitting needed, just possibly zeroout the head and / or the
3601	 *    tail of the extent.
3602	 */
3603	split_map.m_lblk = map->m_lblk;
3604	split_map.m_len = map->m_len;
3605
3606	if (max_zeroout && (allocated > split_map.m_len)) {
3607		if (allocated <= max_zeroout) {
3608			/* case 3 or 5 */
3609			zero_ex1.ee_block =
3610				 cpu_to_le32(split_map.m_lblk +
3611					     split_map.m_len);
3612			zero_ex1.ee_len =
3613				cpu_to_le16(allocated - split_map.m_len);
3614			ext4_ext_store_pblock(&zero_ex1,
3615				ext4_ext_pblock(ex) + split_map.m_lblk +
3616				split_map.m_len - ee_block);
3617			err = ext4_ext_zeroout(inode, &zero_ex1);
3618			if (err)
3619				goto fallback;
3620			split_map.m_len = allocated;
3621		}
3622		if (split_map.m_lblk - ee_block + split_map.m_len <
3623								max_zeroout) {
3624			/* case 2 or 5 */
3625			if (split_map.m_lblk != ee_block) {
3626				zero_ex2.ee_block = ex->ee_block;
3627				zero_ex2.ee_len = cpu_to_le16(split_map.m_lblk -
3628							ee_block);
3629				ext4_ext_store_pblock(&zero_ex2,
3630						      ext4_ext_pblock(ex));
3631				err = ext4_ext_zeroout(inode, &zero_ex2);
3632				if (err)
3633					goto fallback;
3634			}
3635
3636			split_map.m_len += split_map.m_lblk - ee_block;
3637			split_map.m_lblk = ee_block;
3638			allocated = map->m_len;
3639		}
3640	}
3641
3642fallback:
3643	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3644				flags);
3645	if (err > 0)
3646		err = 0;
3647out:
3648	/* If we have gotten a failure, don't zero out status tree */
3649	if (!err) {
3650		err = ext4_zeroout_es(inode, &zero_ex1);
3651		if (!err)
3652			err = ext4_zeroout_es(inode, &zero_ex2);
3653	}
3654	return err ? err : allocated;
3655}
3656
3657/*
3658 * This function is called by ext4_ext_map_blocks() from
3659 * ext4_get_blocks_dio_write() when a DIO write targets
3660 * an unwritten extent.
3661 *
3662 * Writing to an unwritten extent may result in splitting the unwritten
3663 * extent into multiple initialized/unwritten extents (up to three).
3664 * There are three possibilities:
3665 *   a> There is no split required: the entire extent should be unwritten
3666 *   b> Splits into two extents: the write happens at either end of the extent
3667 *   c> Splits into three extents: someone is writing in the middle of the extent
3668 *
3669 * This works the same way in the case of initialized -> unwritten conversion.
3670 *
3671 * One or more index blocks may be needed if the extent tree grows after
3672 * the unwritten extent is split. To prevent ENOSPC from occurring at IO
3673 * completion, we need to split the unwritten extent before DIO submits
3674 * the IO. The unwritten extent handled here will be split into (at most)
3675 * three unwritten extents. After the IO completes, the part being filled
3676 * will be converted to initialized by the end_io callback function
3677 * via ext4_convert_unwritten_extents().
3678 *
3679 * Returns the size of the unwritten extent to be written on success.
3680 */
3681static int ext4_split_convert_extents(handle_t *handle,
3682					struct inode *inode,
3683					struct ext4_map_blocks *map,
3684					struct ext4_ext_path **ppath,
3685					int flags)
3686{
3687	struct ext4_ext_path *path = *ppath;
3688	ext4_lblk_t eof_block;
3689	ext4_lblk_t ee_block;
3690	struct ext4_extent *ex;
3691	unsigned int ee_len;
3692	int split_flag = 0, depth;
3693
3694	ext_debug(inode, "logical block %llu, max_blocks %u\n",
3695		  (unsigned long long)map->m_lblk, map->m_len);
3696
3697	eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
3698			>> inode->i_sb->s_blocksize_bits;
3699	if (eof_block < map->m_lblk + map->m_len)
3700		eof_block = map->m_lblk + map->m_len;
3701	/*
3702	 * It is safe to convert extent to initialized via explicit
3703	 * zeroout only if extent is fully inside i_size or new_size.
3704	 */
3705	depth = ext_depth(inode);
3706	ex = path[depth].p_ext;
3707	ee_block = le32_to_cpu(ex->ee_block);
3708	ee_len = ext4_ext_get_actual_len(ex);
3709
3710	/* Convert to unwritten */
3711	if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
3712		split_flag |= EXT4_EXT_DATA_VALID1;
3713	/* Convert to initialized */
3714	} else if (flags & EXT4_GET_BLOCKS_CONVERT) {
3715		split_flag |= ee_block + ee_len <= eof_block ?
3716			      EXT4_EXT_MAY_ZEROOUT : 0;
3717		split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3718	}
3719	flags |= EXT4_GET_BLOCKS_PRE_IO;
3720	return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3721}
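
/*
 * Example of the three possibilities above (hedged illustration, not part
 * of the original source): for an unwritten extent covering logical blocks
 * [100, 120), a DIO write over the whole of [100, 120) needs no split
 * (case a); a write over [100, 108) splits it into [100, 108) and
 * [108, 120) (case b); a write over [108, 112) splits it into [100, 108),
 * [108, 112) and [112, 120) (case c).  Only the part actually written is
 * later converted to initialized by the end_io callback.
 */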
3722
3723static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3724						struct inode *inode,
3725						struct ext4_map_blocks *map,
3726						struct ext4_ext_path **ppath)
3727{
3728	struct ext4_ext_path *path = *ppath;
3729	struct ext4_extent *ex;
3730	ext4_lblk_t ee_block;
3731	unsigned int ee_len;
3732	int depth;
3733	int err = 0;
3734
3735	depth = ext_depth(inode);
3736	ex = path[depth].p_ext;
3737	ee_block = le32_to_cpu(ex->ee_block);
3738	ee_len = ext4_ext_get_actual_len(ex);
3739
3740	ext_debug(inode, "logical block %llu, max_blocks %u\n",
3741		  (unsigned long long)ee_block, ee_len);
3742
3743	/* If the extent is larger than requested, it is a clear sign that we
3744	 * still have some extent state machine issues left, so an extent
3745	 * split is still required.
3746	 * TODO: Once all related issues are fixed, this situation should be
3747	 * illegal.
3748	 */
3749	if (ee_block != map->m_lblk || ee_len > map->m_len) {
3750#ifdef CONFIG_EXT4_DEBUG
3751		ext4_warning(inode->i_sb, "Inode (%ld) finished: extent logical block %llu,"
3752			     " len %u; IO logical block %llu, len %u",
3753			     inode->i_ino, (unsigned long long)ee_block, ee_len,
3754			     (unsigned long long)map->m_lblk, map->m_len);
3755#endif
3756		err = ext4_split_convert_extents(handle, inode, map, ppath,
3757						 EXT4_GET_BLOCKS_CONVERT);
3758		if (err < 0)
3759			return err;
3760		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3761		if (IS_ERR(path))
3762			return PTR_ERR(path);
3763		depth = ext_depth(inode);
3764		ex = path[depth].p_ext;
3765	}
3766
3767	err = ext4_ext_get_access(handle, inode, path + depth);
3768	if (err)
3769		goto out;
3770	/* first mark the extent as initialized */
3771	ext4_ext_mark_initialized(ex);
3772
3773	/* note: ext4_ext_correct_indexes() isn't needed here because
3774	 * borders are not changed
3775	 */
3776	ext4_ext_try_to_merge(handle, inode, path, ex);
3777
3778	/* Mark modified extent as dirty */
3779	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3780out:
3781	ext4_ext_show_leaf(inode, path);
3782	return err;
3783}
3784
3785static int
3786convert_initialized_extent(handle_t *handle, struct inode *inode,
3787			   struct ext4_map_blocks *map,
3788			   struct ext4_ext_path **ppath,
3789			   unsigned int *allocated)
3790{
3791	struct ext4_ext_path *path = *ppath;
3792	struct ext4_extent *ex;
3793	ext4_lblk_t ee_block;
3794	unsigned int ee_len;
3795	int depth;
3796	int err = 0;
3797
3798	/*
3799	 * Make sure that the extent is no bigger than we support with
3800	 * unwritten extents.
3801	 */
3802	if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3803		map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3804
3805	depth = ext_depth(inode);
3806	ex = path[depth].p_ext;
3807	ee_block = le32_to_cpu(ex->ee_block);
3808	ee_len = ext4_ext_get_actual_len(ex);
3809
3810	ext_debug(inode, "logical block %llu, max_blocks %u\n",
3811		  (unsigned long long)ee_block, ee_len);
3812
3813	if (ee_block != map->m_lblk || ee_len > map->m_len) {
3814		err = ext4_split_convert_extents(handle, inode, map, ppath,
3815				EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3816		if (err < 0)
3817			return err;
3818		path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3819		if (IS_ERR(path))
3820			return PTR_ERR(path);
3821		depth = ext_depth(inode);
3822		ex = path[depth].p_ext;
3823		if (!ex) {
3824			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3825					 (unsigned long) map->m_lblk);
3826			return -EFSCORRUPTED;
3827		}
3828	}
3829
3830	err = ext4_ext_get_access(handle, inode, path + depth);
3831	if (err)
3832		return err;
3833	/* first mark the extent as unwritten */
3834	ext4_ext_mark_unwritten(ex);
3835
3836	/* note: ext4_ext_correct_indexes() isn't needed here because
3837	 * borders are not changed
3838	 */
3839	ext4_ext_try_to_merge(handle, inode, path, ex);
3840
3841	/* Mark modified extent as dirty */
3842	err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3843	if (err)
3844		return err;
3845	ext4_ext_show_leaf(inode, path);
3846
3847	ext4_update_inode_fsync_trans(handle, inode, 1);
3848
3849	map->m_flags |= EXT4_MAP_UNWRITTEN;
3850	if (*allocated > map->m_len)
3851		*allocated = map->m_len;
3852	map->m_len = *allocated;
3853	return 0;
3854}
3855
3856static int
3857ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3858			struct ext4_map_blocks *map,
3859			struct ext4_ext_path **ppath, int flags,
3860			unsigned int allocated, ext4_fsblk_t newblock)
3861{
3862	struct ext4_ext_path __maybe_unused *path = *ppath;
3863	int ret = 0;
3864	int err = 0;
3865
3866	ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n",
3867		  (unsigned long long)map->m_lblk, map->m_len, flags,
3868		  allocated);
3869	ext4_ext_show_leaf(inode, path);
3870
3871	/*
3872	 * When writing into unwritten space, we should not fail to
3873	 * allocate metadata blocks for the new extent block if needed.
3874	 */
3875	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
3876
3877	trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
3878						    allocated, newblock);
3879
3880	/* get_block() before submitting IO, split the extent */
3881	if (flags & EXT4_GET_BLOCKS_PRE_IO) {
3882		ret = ext4_split_convert_extents(handle, inode, map, ppath,
3883					 flags | EXT4_GET_BLOCKS_CONVERT);
3884		if (ret < 0) {
3885			err = ret;
3886			goto out2;
3887		}
3888		/*
3889		 * shouldn't get a 0 return when splitting an extent unless
3890		 * m_len is 0 (bug) or extent has been corrupted
3891		 */
3892		if (unlikely(ret == 0)) {
3893			EXT4_ERROR_INODE(inode,
3894					 "unexpected ret == 0, m_len = %u",
3895					 map->m_len);
3896			err = -EFSCORRUPTED;
3897			goto out2;
3898		}
3899		map->m_flags |= EXT4_MAP_UNWRITTEN;
3900		goto out;
3901	}
3902	/* IO end_io complete, convert the filled extent to written */
3903	if (flags & EXT4_GET_BLOCKS_CONVERT) {
3904		err = ext4_convert_unwritten_extents_endio(handle, inode, map,
3905							   ppath);
3906		if (err < 0)
3907			goto out2;
3908		ext4_update_inode_fsync_trans(handle, inode, 1);
3909		goto map_out;
3910	}
3911	/* buffered IO cases */
3912	/*
3913	 * repeat fallocate creation request
3914	 * we already have an unwritten extent
3915	 */
3916	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
3917		map->m_flags |= EXT4_MAP_UNWRITTEN;
3918		goto map_out;
3919	}
3920
3921	/* buffered READ or buffered write_begin() lookup */
3922	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3923		/*
3924		 * We have blocks reserved already.  We
3925		 * return allocated blocks so that delalloc
3926		 * won't do block reservation for us.  But
3927		 * the buffer head will be unmapped so that
3928		 * a read from the block returns 0s.
3929		 */
3930		map->m_flags |= EXT4_MAP_UNWRITTEN;
3931		goto out1;
3932	}
3933
3934	/*
3935	 * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1.
3936	 * For buffered writes, at writepage time, etc.  Convert a
3937	 * discovered unwritten extent to written.
3938	 */
3939	ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
3940	if (ret < 0) {
3941		err = ret;
3942		goto out2;
3943	}
3944	ext4_update_inode_fsync_trans(handle, inode, 1);
3945	/*
3946	 * shouldn't get a 0 return when converting an unwritten extent
3947	 * unless m_len is 0 (bug) or extent has been corrupted
3948	 */
3949	if (unlikely(ret == 0)) {
3950		EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u",
3951				 map->m_len);
3952		err = -EFSCORRUPTED;
3953		goto out2;
3954	}
3955
3956out:
3957	allocated = ret;
3958	map->m_flags |= EXT4_MAP_NEW;
3959map_out:
3960	map->m_flags |= EXT4_MAP_MAPPED;
3961out1:
3962	map->m_pblk = newblock;
3963	if (allocated > map->m_len)
3964		allocated = map->m_len;
3965	map->m_len = allocated;
3966	ext4_ext_show_leaf(inode, path);
3967out2:
3968	return err ? err : allocated;
3969}
3970
3971/*
3972 * get_implied_cluster_alloc - check to see if the requested
3973 * allocation (in the map structure) overlaps with a cluster already
3974 * allocated in an extent.
3975 *	@sb	The filesystem superblock structure
3976 *	@map	The requested lblk->pblk mapping
3977 *	@ex	The extent structure which might contain an implied
3978 *			cluster allocation
3979 *
3980 * This function is called by ext4_ext_map_blocks() after we failed to
3981 * find blocks that were already in the inode's extent tree.  Hence,
3982 * we know that the beginning of the requested region cannot overlap
3983 * the extent from the inode's extent tree.  There are three cases we
3984 * want to catch.  The first is this case:
3985 *
3986 *		 |--- cluster # N--|
3987 *    |--- extent ---|	|---- requested region ---|
3988 *			|==========|
3989 *
3990 * The second case that we need to test for is this one:
3991 *
3992 *   |--------- cluster # N ----------------|
3993 *	   |--- requested region --|   |------- extent ----|
3994 *	   |=======================|
3995 *
3996 * The third case is when the requested region lies between two extents
3997 * within the same cluster:
3998 *          |------------- cluster # N-------------|
3999 * |----- ex -----|                  |---- ex_right ----|
4000 *                  |------ requested region ------|
4001 *                  |================|
4002 *
4003 * In each of the above cases, we need to set map->m_pblk and
4004 * map->m_len so that they correspond to the extent labelled as
4005 * "|====|" from cluster #N, since it is already in use for data in
4006 * cluster EXT4_B2C(sbi, map->m_lblk).	We will then return 1 to
4007 * signal to ext4_ext_map_blocks() that map->m_pblk should be treated
4008 * as a new "allocated" block region.  Otherwise, we will return 0 and
4009 * ext4_ext_map_blocks() will then allocate one or more new clusters
4010 * by calling ext4_mb_new_blocks().
4011 */
4012static int get_implied_cluster_alloc(struct super_block *sb,
4013				     struct ext4_map_blocks *map,
4014				     struct ext4_extent *ex,
4015				     struct ext4_ext_path *path)
4016{
4017	struct ext4_sb_info *sbi = EXT4_SB(sb);
4018	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4019	ext4_lblk_t ex_cluster_start, ex_cluster_end;
4020	ext4_lblk_t rr_cluster_start;
4021	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4022	ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4023	unsigned short ee_len = ext4_ext_get_actual_len(ex);
4024
4025	/* The extent passed in that we are trying to match */
4026	ex_cluster_start = EXT4_B2C(sbi, ee_block);
4027	ex_cluster_end = EXT4_B2C(sbi, ee_block + ee_len - 1);
4028
4029	/* The requested region passed into ext4_map_blocks() */
4030	rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
4031
4032	if ((rr_cluster_start == ex_cluster_end) ||
4033	    (rr_cluster_start == ex_cluster_start)) {
4034		if (rr_cluster_start == ex_cluster_end)
4035			ee_start += ee_len - 1;
4036		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
4037		map->m_len = min(map->m_len,
4038				 (unsigned) sbi->s_cluster_ratio - c_offset);
4039		/*
4040		 * Check for and handle this case:
4041		 *
4042		 *   |--------- cluster # N-------------|
4043		 *		       |------- extent ----|
4044		 *	   |--- requested region ---|
4045		 *	   |===========|
4046		 */
4047
4048		if (map->m_lblk < ee_block)
4049			map->m_len = min(map->m_len, ee_block - map->m_lblk);
4050
4051		/*
4052		 * Check for the case where there is already another allocated
4053		 * block to the right of 'ex' but before the end of the cluster.
4054		 *
4055		 *          |------------- cluster # N-------------|
4056		 * |----- ex -----|                  |---- ex_right ----|
4057		 *                  |------ requested region ------|
4058		 *                  |================|
4059		 */
4060		if (map->m_lblk > ee_block) {
4061			ext4_lblk_t next = ext4_ext_next_allocated_block(path);
4062			map->m_len = min(map->m_len, next - map->m_lblk);
4063		}
4064
4065		trace_ext4_get_implied_cluster_alloc_exit(sb, map, 1);
4066		return 1;
4067	}
4068
4069	trace_ext4_get_implied_cluster_alloc_exit(sb, map, 0);
4070	return 0;
4071}
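
/*
 * Worked example (illustrative numbers, not from the original source):
 * with a bigalloc cluster ratio of 16, suppose an extent maps logical
 * blocks 32..39 to physical blocks 512..519 and the requested region
 * starts at logical block 44.  Block 39 and block 44 both live in logical
 * cluster #2, so the physical cluster starting at block 512 is already in
 * use: the function sets map->m_pblk = 512 + (44 & 15) = 524, clamps
 * map->m_len to at most 16 - 12 = 4 blocks, and returns 1 so that
 * ext4_ext_map_blocks() treats the mapping as already allocated.
 */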
4072
4073
4074/*
4075 * Block allocation/map/preallocation routine for extent-based files
4076 *
4077 *
4078 * Needs to be called with
4079 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system blocks
4080 * (i.e., create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
4081 *
4082 * return > 0, number of blocks already mapped/allocated
4083 *          if create == 0 and these are pre-allocated blocks
4084 *          	buffer head is unmapped
4085 *          otherwise blocks are mapped
4086 *
4087 * return = 0, if a plain lookup failed (blocks have not been allocated)
4088 *          buffer head is unmapped
4089 *
4090 * return < 0, error case.
4091 */
4092int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4093			struct ext4_map_blocks *map, int flags)
4094{
4095	struct ext4_ext_path *path = NULL;
4096	struct ext4_extent newex, *ex, ex2;
4097	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
4098	ext4_fsblk_t newblock = 0, pblk;
4099	int err = 0, depth, ret;
4100	unsigned int allocated = 0, offset = 0;
4101	unsigned int allocated_clusters = 0;
4102	struct ext4_allocation_request ar;
4103	ext4_lblk_t cluster_offset;
4104
4105	ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len);
4106	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4107
4108	/* find extent for this block */
4109	path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4110	if (IS_ERR(path)) {
4111		err = PTR_ERR(path);
4112		path = NULL;
4113		goto out;
4114	}
4115
4116	depth = ext_depth(inode);
4117
4118	/*
4119	 * consistent leaf must not be empty;
4120	 * this situation is possible, though, _during_ tree modification;
4121	 * this is why assert can't be put in ext4_find_extent()
4122	 */
4123	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4124		EXT4_ERROR_INODE(inode, "bad extent address "
4125				 "lblock: %lu, depth: %d pblock %lld",
4126				 (unsigned long) map->m_lblk, depth,
4127				 path[depth].p_block);
4128		err = -EFSCORRUPTED;
4129		goto out;
4130	}
4131
4132	ex = path[depth].p_ext;
4133	if (ex) {
4134		ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
4135		ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
4136		unsigned short ee_len;
4137
4138
4139		/*
4140		 * unwritten extents are treated as holes, except that
4141		 * we split out initialized portions during a write.
4142		 */
4143		ee_len = ext4_ext_get_actual_len(ex);
4144
4145		trace_ext4_ext_show_extent(inode, ee_block, ee_start, ee_len);
4146
4147		/* if found extent covers block, simply return it */
4148		if (in_range(map->m_lblk, ee_block, ee_len)) {
4149			newblock = map->m_lblk - ee_block + ee_start;
4150			/* number of remaining blocks in the extent */
4151			allocated = ee_len - (map->m_lblk - ee_block);
4152			ext_debug(inode, "%u fit into %u:%d -> %llu\n",
4153				  map->m_lblk, ee_block, ee_len, newblock);
4154
4155			/*
4156			 * If the extent is initialized check whether the
4157			 * caller wants to convert it to unwritten.
4158			 */
4159			if ((!ext4_ext_is_unwritten(ex)) &&
4160			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4161				err = convert_initialized_extent(handle,
4162					inode, map, &path, &allocated);
4163				goto out;
4164			} else if (!ext4_ext_is_unwritten(ex)) {
4165				map->m_flags |= EXT4_MAP_MAPPED;
4166				map->m_pblk = newblock;
4167				if (allocated > map->m_len)
4168					allocated = map->m_len;
4169				map->m_len = allocated;
4170				ext4_ext_show_leaf(inode, path);
4171				goto out;
4172			}
4173
4174			ret = ext4_ext_handle_unwritten_extents(
4175				handle, inode, map, &path, flags,
4176				allocated, newblock);
4177			if (ret < 0)
4178				err = ret;
4179			else
4180				allocated = ret;
4181			goto out;
4182		}
4183	}
4184
4185	/*
4186	 * requested block isn't allocated yet;
4187	 * we cannot try to create blocks if the create flag is zero
4188	 */
4189	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4190		ext4_lblk_t hole_start, hole_len;
4191
4192		hole_start = map->m_lblk;
4193		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4194		/*
4195		 * put just found gap into cache to speed up
4196		 * subsequent requests
4197		 */
4198		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4199
4200		/* Update hole_len to reflect hole size after map->m_lblk */
4201		if (hole_start != map->m_lblk)
4202			hole_len -= map->m_lblk - hole_start;
4203		map->m_pblk = 0;
4204		map->m_len = min_t(unsigned int, map->m_len, hole_len);
4205
4206		goto out;
4207	}
4208
4209	/*
4210	 * Okay, we need to do block allocation.
4211	 */
4212	newex.ee_block = cpu_to_le32(map->m_lblk);
4213	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4214
4215	/*
4216	 * If we are doing bigalloc, check to see if the extent returned
4217	 * by ext4_find_extent() implies a cluster we can use.
4218	 */
4219	if (cluster_offset && ex &&
4220	    get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
4221		ar.len = allocated = map->m_len;
4222		newblock = map->m_pblk;
4223		goto got_allocated_blocks;
4224	}
4225
4226	/* find neighbour allocated blocks */
4227	ar.lleft = map->m_lblk;
4228	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
4229	if (err)
4230		goto out;
4231	ar.lright = map->m_lblk;
4232	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
4233	if (err < 0)
4234		goto out;
4235
4236	/* Check if the extent after searching to the right implies a
4237	 * cluster we can use. */
4238	if ((sbi->s_cluster_ratio > 1) && err &&
4239	    get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) {
4240		ar.len = allocated = map->m_len;
4241		newblock = map->m_pblk;
4242		goto got_allocated_blocks;
4243	}
4244
4245	/*
4246	 * See if request is beyond maximum number of blocks we can have in
4247	 * a single extent. For an initialized extent this limit is
4248	 * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
4249	 * EXT_UNWRITTEN_MAX_LEN.
4250	 */
4251	if (map->m_len > EXT_INIT_MAX_LEN &&
4252	    !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4253		map->m_len = EXT_INIT_MAX_LEN;
4254	else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
4255		 (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
4256		map->m_len = EXT_UNWRITTEN_MAX_LEN;
4257
4258	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
4259	newex.ee_len = cpu_to_le16(map->m_len);
4260	err = ext4_ext_check_overlap(sbi, inode, &newex, path);
4261	if (err)
4262		allocated = ext4_ext_get_actual_len(&newex);
4263	else
4264		allocated = map->m_len;
4265
4266	/* allocate new block */
4267	ar.inode = inode;
4268	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
4269	ar.logical = map->m_lblk;
4270	/*
4271	 * We calculate the offset from the beginning of the cluster
4272	 * for the logical block number, since when we allocate a
4273	 * physical cluster, the physical block should start at the
4274	 * same offset from the beginning of the cluster.  This is
4275	 * needed so that future calls to get_implied_cluster_alloc()
4276	 * work correctly.
4277	 */
4278	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
4279	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
4280	ar.goal -= offset;
4281	ar.logical -= offset;
4282	if (S_ISREG(inode->i_mode))
4283		ar.flags = EXT4_MB_HINT_DATA;
4284	else
4285		/* disable in-core preallocation for non-regular files */
4286		ar.flags = 0;
4287	if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4288		ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4289	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4290		ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4291	if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
4292		ar.flags |= EXT4_MB_USE_RESERVED;
4293	newblock = ext4_mb_new_blocks(handle, &ar, &err);
4294	if (!newblock)
4295		goto out;
4296	allocated_clusters = ar.len;
4297	ar.len = EXT4_C2B(sbi, ar.len) - offset;
4298	ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n",
4299		  ar.goal, newblock, ar.len, allocated);
4300	if (ar.len > allocated)
4301		ar.len = allocated;
4302
4303got_allocated_blocks:
4304	/* try to insert new extent into found leaf and return */
4305	pblk = newblock + offset;
4306	ext4_ext_store_pblock(&newex, pblk);
4307	newex.ee_len = cpu_to_le16(ar.len);
4308	/* Mark unwritten */
4309	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
4310		ext4_ext_mark_unwritten(&newex);
4311		map->m_flags |= EXT4_MAP_UNWRITTEN;
4312	}
4313
4314	err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags);
4315	if (err) {
4316		if (allocated_clusters) {
4317			int fb_flags = 0;
4318
4319			/*
4320			 * free data blocks we just allocated.
4321			 * not a good idea to call discard here directly,
4322			 * but otherwise we'd need to call it for every free().
4323			 */
4324			ext4_discard_preallocations(inode, 0);
4325			if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4326				fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
4327			ext4_free_blocks(handle, inode, NULL, newblock,
4328					 EXT4_C2B(sbi, allocated_clusters),
4329					 fb_flags);
4330		}
4331		goto out;
4332	}
4333
4334	/*
4335	 * Reduce the reserved cluster count to reflect successful deferred
4336	 * allocation of delayed allocated clusters or direct allocation of
4337	 * clusters discovered to be delayed allocated.  Once allocated, a
4338	 * cluster is not included in the reserved count.
4339	 */
4340	if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) {
4341		if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
4342			/*
4343			 * When allocating delayed allocated clusters, simply
4344			 * reduce the reserved cluster count and claim quota
4345			 */
4346			ext4_da_update_reserve_space(inode, allocated_clusters,
4347							1);
4348		} else {
4349			ext4_lblk_t lblk, len;
4350			unsigned int n;
4351
4352			/*
4353			 * When allocating non-delayed allocated clusters
4354			 * (from fallocate, filemap, DIO, or clusters
4355			 * allocated when delalloc has been disabled by
4356			 * ext4_nonda_switch), reduce the reserved cluster
4357			 * count by the number of allocated clusters that
4358			 * have previously been delayed allocated.  Quota
4359			 * has been claimed by ext4_mb_new_blocks() above,
4360			 * so release the quota reservations made for any
4361			 * previously delayed allocated clusters.
4362			 */
4363			lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
4364			len = allocated_clusters << sbi->s_cluster_bits;
4365			n = ext4_es_delayed_clu(inode, lblk, len);
4366			if (n > 0)
4367				ext4_da_update_reserve_space(inode, (int) n, 0);
4368		}
4369	}
4370
4371	/*
4372	 * Cache the extent and update transaction to commit on fdatasync only
4373	 * when it is _not_ an unwritten extent.
4374	 */
4375	if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
4376		ext4_update_inode_fsync_trans(handle, inode, 1);
4377	else
4378		ext4_update_inode_fsync_trans(handle, inode, 0);
4379
4380	map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED);
4381	map->m_pblk = pblk;
4382	map->m_len = ar.len;
4383	allocated = map->m_len;
4384	ext4_ext_show_leaf(inode, path);
4385out:
4386	ext4_ext_drop_refs(path);
4387	kfree(path);
4388
4389	trace_ext4_ext_map_blocks_exit(inode, flags, map,
4390				       err ? err : allocated);
4391	return err ? err : allocated;
4392}
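
/*
 * Usage sketch (illustration only, not part of the original source): an
 * in-kernel caller fills a struct ext4_map_blocks and interprets a
 * positive return value as the number of blocks now described by
 * map.m_pblk/map.m_len, roughly:
 *
 *	struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = len };
 *	int ret = ext4_map_blocks(handle, inode, &map,
 *				  EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
 *	if (ret > 0)
 *		... use map.m_pblk .. map.m_pblk + ret - 1 ...
 *
 * ext4_alloc_file_blocks() below is a real caller that follows this
 * pattern, looping until the whole requested range has been mapped.
 */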
4393
4394int ext4_ext_truncate(handle_t *handle, struct inode *inode)
4395{
4396	struct super_block *sb = inode->i_sb;
4397	ext4_lblk_t last_block;
4398	int err = 0;
4399
4400	/*
4401	 * TODO: optimization is possible here.
4402	 * Probably we need not scan at all,
4403	 * because page truncation is enough.
4404	 */
4405
4406	/* we have to know where to truncate from in crash case */
4407	EXT4_I(inode)->i_disksize = inode->i_size;
4408	err = ext4_mark_inode_dirty(handle, inode);
4409	if (err)
4410		return err;
4411
4412	last_block = (inode->i_size + sb->s_blocksize - 1)
4413			>> EXT4_BLOCK_SIZE_BITS(sb);
4414retry:
4415	err = ext4_es_remove_extent(inode, last_block,
4416				    EXT_MAX_BLOCKS - last_block);
4417	if (err == -ENOMEM) {
4418		cond_resched();
4419		congestion_wait(BLK_RW_ASYNC, HZ/50);
4420		goto retry;
4421	}
4422	if (err)
4423		return err;
4424retry_remove_space:
4425	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4426	if (err == -ENOMEM) {
4427		cond_resched();
4428		congestion_wait(BLK_RW_ASYNC, HZ/50);
4429		goto retry_remove_space;
4430	}
4431	return err;
4432}
4433
4434static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4435				  ext4_lblk_t len, loff_t new_size,
4436				  int flags)
4437{
4438	struct inode *inode = file_inode(file);
4439	handle_t *handle;
4440	int ret = 0;
4441	int ret2 = 0, ret3 = 0;
4442	int retries = 0;
4443	int depth = 0;
4444	struct ext4_map_blocks map;
4445	unsigned int credits;
4446	loff_t epos;
4447
4448	BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
4449	map.m_lblk = offset;
4450	map.m_len = len;
4451	/*
4452	 * Don't normalize the request if it can fit in one extent so
4453	 * that it doesn't get unnecessarily split into multiple
4454	 * extents.
4455	 */
4456	if (len <= EXT_UNWRITTEN_MAX_LEN)
4457		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
4458
4459	/*
4460	 * credits to insert 1 extent into extent tree
4461	 */
4462	credits = ext4_chunk_trans_blocks(inode, len);
4463	depth = ext_depth(inode);
4464
4465retry:
4466	while (ret >= 0 && len) {
4467		/*
4468		 * Recalculate credits when extent tree depth changes.
4469		 */
4470		if (depth != ext_depth(inode)) {
4471			credits = ext4_chunk_trans_blocks(inode, len);
4472			depth = ext_depth(inode);
4473		}
4474
4475		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4476					    credits);
4477		if (IS_ERR(handle)) {
4478			ret = PTR_ERR(handle);
4479			break;
4480		}
4481		ret = ext4_map_blocks(handle, inode, &map, flags);
4482		if (ret <= 0) {
4483			ext4_debug("inode #%lu: block %u: len %u: "
4484				   "ext4_ext_map_blocks returned %d",
4485				   inode->i_ino, map.m_lblk,
4486				   map.m_len, ret);
4487			ext4_mark_inode_dirty(handle, inode);
4488			ret2 = ext4_journal_stop(handle);
4489			break;
4490		}
4491		map.m_lblk += ret;
4492		map.m_len = len = len - ret;
4493		epos = (loff_t)map.m_lblk << inode->i_blkbits;
4494		inode->i_ctime = current_time(inode);
4495		if (new_size) {
4496			if (epos > new_size)
4497				epos = new_size;
4498			if (ext4_update_inode_size(inode, epos) & 0x1)
4499				inode->i_mtime = inode->i_ctime;
4500		}
4501		ret2 = ext4_mark_inode_dirty(handle, inode);
4502		ext4_update_inode_fsync_trans(handle, inode, 1);
4503		ret3 = ext4_journal_stop(handle);
4504		ret2 = ret3 ? ret3 : ret2;
4505		if (unlikely(ret2))
4506			break;
4507	}
4508	if (ret == -ENOSPC &&
4509			ext4_should_retry_alloc(inode->i_sb, &retries)) {
4510		ret = 0;
4511		goto retry;
4512	}
4513
4514	return ret > 0 ? ret2 : ret;
4515}
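
/*
 * Illustration (not from the original source): a 1 GiB fallocate request
 * on a 4k-block filesystem covers 262144 blocks.  Since a single unwritten
 * extent can describe at most EXT_UNWRITTEN_MAX_LEN (32767) blocks, the
 * loop above needs several passes, each one starting a journal handle with
 * fresh credits, mapping one chunk via ext4_map_blocks() and advancing
 * map.m_lblk by the number of blocks returned.
 */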
4516
4517static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
4518
4519static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
4520
4521static long ext4_zero_range(struct file *file, loff_t offset,
4522			    loff_t len, int mode)
4523{
4524	struct inode *inode = file_inode(file);
4525	handle_t *handle = NULL;
4526	unsigned int max_blocks;
4527	loff_t new_size = 0;
4528	int ret = 0;
4529	int flags;
4530	int credits;
4531	int partial_begin, partial_end;
4532	loff_t start, end;
4533	ext4_lblk_t lblk;
4534	unsigned int blkbits = inode->i_blkbits;
4535
4536	trace_ext4_zero_range(inode, offset, len, mode);
4537
4538	/* Call ext4_force_commit to flush all data in case of data=journal. */
4539	if (ext4_should_journal_data(inode)) {
4540		ret = ext4_force_commit(inode->i_sb);
4541		if (ret)
4542			return ret;
4543	}
4544
4545	/*
4546	 * Round up offset. This is not fallocate; we need to zero out
4547	 * blocks, so convert the interior block-aligned part of the range to
4548	 * unwritten and possibly manually zero out unaligned parts of the
4549	 * range.
4550	 */
4551	start = round_up(offset, 1 << blkbits);
4552	end = round_down((offset + len), 1 << blkbits);
4553
4554	if (start < offset || end > offset + len)
4555		return -EINVAL;
4556	partial_begin = offset & ((1 << blkbits) - 1);
4557	partial_end = (offset + len) & ((1 << blkbits) - 1);
4558
4559	lblk = start >> blkbits;
4560	max_blocks = (end >> blkbits);
4561	if (max_blocks < lblk)
4562		max_blocks = 0;
4563	else
4564		max_blocks -= lblk;
4565
4566	inode_lock(inode);
4567
4568	/*
4569	 * Indirect files do not support unwritten extents
4570	 */
4571	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4572		ret = -EOPNOTSUPP;
4573		goto out_mutex;
4574	}
4575
4576	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4577	    (offset + len > inode->i_size ||
4578	     offset + len > EXT4_I(inode)->i_disksize)) {
4579		new_size = offset + len;
4580		ret = inode_newsize_ok(inode, new_size);
4581		if (ret)
4582			goto out_mutex;
4583	}
4584
4585	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4586
4587	/* Wait for all existing DIO workers; newcomers will block on i_mutex */
4588	inode_dio_wait(inode);
4589
4590	ret = file_modified(file);
4591	if (ret)
4592		goto out_mutex;
4593
4594	/* Preallocate the range including the unaligned edges */
4595	if (partial_begin || partial_end) {
4596		ret = ext4_alloc_file_blocks(file,
4597				round_down(offset, 1 << blkbits) >> blkbits,
4598				(round_up((offset + len), 1 << blkbits) -
4599				 round_down(offset, 1 << blkbits)) >> blkbits,
4600				new_size, flags);
4601		if (ret)
4602			goto out_mutex;
4603
4604	}
4605
4606	/* Zero range excluding the unaligned edges */
4607	if (max_blocks > 0) {
4608		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4609			  EXT4_EX_NOCACHE);
4610
4611		/*
4612		 * Prevent page faults from reinstantiating pages we have
4613		 * released from page cache.
4614		 */
4615		down_write(&EXT4_I(inode)->i_mmap_sem);
4616
4617		ret = ext4_break_layouts(inode);
4618		if (ret) {
4619			up_write(&EXT4_I(inode)->i_mmap_sem);
4620			goto out_mutex;
4621		}
4622
4623		ret = ext4_update_disksize_before_punch(inode, offset, len);
4624		if (ret) {
4625			up_write(&EXT4_I(inode)->i_mmap_sem);
4626			goto out_mutex;
4627		}
4628		/* Now release the pages and zero block aligned part of pages */
4629		truncate_pagecache_range(inode, start, end - 1);
4630		inode->i_mtime = inode->i_ctime = current_time(inode);
4631
4632		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4633					     flags);
4634		up_write(&EXT4_I(inode)->i_mmap_sem);
4635		if (ret)
4636			goto out_mutex;
4637	}
4638	if (!partial_begin && !partial_end)
4639		goto out_mutex;
4640
4641	/*
4642	 * In the worst case we have to write out two non-adjacent unwritten
4643	 * blocks and update the inode.
4644	 */
4645	credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4646	if (ext4_should_journal_data(inode))
4647		credits += 2;
4648	handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4649	if (IS_ERR(handle)) {
4650		ret = PTR_ERR(handle);
4651		ext4_std_error(inode->i_sb, ret);
4652		goto out_mutex;
4653	}
4654
4655	inode->i_mtime = inode->i_ctime = current_time(inode);
4656	if (new_size)
4657		ext4_update_inode_size(inode, new_size);
4658	ret = ext4_mark_inode_dirty(handle, inode);
4659	if (unlikely(ret))
4660		goto out_handle;
4661	/* Zero out partial block at the edges of the range */
4662	ret = ext4_zero_partial_blocks(handle, inode, offset, len);
4663	if (ret >= 0)
4664		ext4_update_inode_fsync_trans(handle, inode, 1);
4665
4666	if (file->f_flags & O_SYNC)
4667		ext4_handle_sync(handle);
4668
4669out_handle:
4670	ext4_journal_stop(handle);
4671out_mutex:
4672	inode_unlock(inode);
4673	return ret;
4674}
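
/*
 * Worked example for the block arithmetic above (illustrative numbers,
 * not from the original source): with 4k blocks, a zero range request of
 * offset = 1000, len = 20000 gives start = 4096 and end = 20480, so
 * lblk = 1 and max_blocks = 4.  partial_begin = 1000 and partial_end = 520
 * are both non-zero, so blocks 0..5 are first preallocated, blocks 1..4
 * are then converted through the unwritten path, and finally the byte
 * ranges [1000, 4096) and [20480, 21000) are zeroed out via
 * ext4_zero_partial_blocks().
 */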
4675
4676/*
4677 * Preallocate space for a file. This implements ext4's fallocate file
4678 * operation, which gets called from the sys_fallocate system call.
4679 * For block-mapped files, posix_fallocate should fall back to the method
4680 * of writing zeroes to the required new blocks (the same behavior that is
4681 * expected for file systems which do not support the fallocate() system call).
4682 */
4683long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4684{
4685	struct inode *inode = file_inode(file);
4686	loff_t new_size = 0;
4687	unsigned int max_blocks;
4688	int ret = 0;
4689	int flags;
4690	ext4_lblk_t lblk;
4691	unsigned int blkbits = inode->i_blkbits;
4692
4693	/*
4694	 * Encrypted inodes can't handle collapse range or insert
4695	 * range since we would need to re-encrypt blocks with a
4696	 * different IV or XTS tweak (which are based on the logical
4697	 * block number).
4698	 */
4699	if (IS_ENCRYPTED(inode) &&
4700	    (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
4701		return -EOPNOTSUPP;
4702
4703	/* Return error if mode is not supported */
4704	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4705		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
4706		     FALLOC_FL_INSERT_RANGE))
4707		return -EOPNOTSUPP;
4708
4709	inode_lock(inode);
4710	ret = ext4_convert_inline_data(inode);
4711	inode_unlock(inode);
4712	if (ret)
4713		goto exit;
4714
4715	if (mode & FALLOC_FL_PUNCH_HOLE) {
4716		ret = ext4_punch_hole(file, offset, len);
4717		goto exit;
4718	}
4719
4720	if (mode & FALLOC_FL_COLLAPSE_RANGE) {
4721		ret = ext4_collapse_range(file, offset, len);
4722		goto exit;
4723	}
4724
4725	if (mode & FALLOC_FL_INSERT_RANGE) {
4726		ret = ext4_insert_range(file, offset, len);
4727		goto exit;
4728	}
4729
4730	if (mode & FALLOC_FL_ZERO_RANGE) {
4731		ret = ext4_zero_range(file, offset, len, mode);
4732		goto exit;
4733	}
4734	trace_ext4_fallocate_enter(inode, offset, len, mode);
4735	lblk = offset >> blkbits;
4736
4737	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4738	flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4739
4740	inode_lock(inode);
4741
4742	/*
4743	 * We only support preallocation for extent-based files
4744	 */
4745	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
4746		ret = -EOPNOTSUPP;
4747		goto out;
4748	}
4749
4750	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
4751	    (offset + len > inode->i_size ||
4752	     offset + len > EXT4_I(inode)->i_disksize)) {
4753		new_size = offset + len;
4754		ret = inode_newsize_ok(inode, new_size);
4755		if (ret)
4756			goto out;
4757	}
4758
4759	/* Wait for all existing DIO workers; newcomers will block on i_mutex */
4760	inode_dio_wait(inode);
4761
4762	ret = file_modified(file);
4763	if (ret)
4764		goto out;
4765
4766	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
4767	if (ret)
4768		goto out;
4769
4770	if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4771		ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
4772					EXT4_I(inode)->i_sync_tid);
4773	}
4774out:
4775	inode_unlock(inode);
4776	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
4777exit:
4778	return ret;
4779}
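
/*
 * From user space this entry point is reached through fallocate(2).  A
 * minimal, hypothetical caller that preallocates 1 MiB without changing
 * the file size might look like:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <linux/falloc.h>
 *
 *	int fd = open("data.bin", O_RDWR | O_CREAT, 0644);
 *	if (fd >= 0)
 *		fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 *
 * Modes such as FALLOC_FL_PUNCH_HOLE, FALLOC_FL_COLLAPSE_RANGE,
 * FALLOC_FL_ZERO_RANGE and FALLOC_FL_INSERT_RANGE are dispatched to the
 * corresponding helpers (ext4_punch_hole(), ext4_collapse_range(),
 * ext4_zero_range(), ext4_insert_range()).
 */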
4780
4781/*
4782 * This function converts a range of blocks to written extents.
4783 * The caller of this function will pass the start offset and the size.
4784 * All unwritten extents within this range will be converted to
4785 * written extents.
4786 *
4787 * This function is called from the direct IO end_io callback
4788 * function, to convert the fallocated extents after the IO is completed.
4789 * Returns 0 on success.
4790 */
4791int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4792				   loff_t offset, ssize_t len)
4793{
4794	unsigned int max_blocks;
4795	int ret = 0, ret2 = 0, ret3 = 0;
4796	struct ext4_map_blocks map;
4797	unsigned int blkbits = inode->i_blkbits;
4798	unsigned int credits = 0;
4799
4800	map.m_lblk = offset >> blkbits;
4801	max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
4802
4803	if (!handle) {
4804		/*
4805		 * credits to insert 1 extent into extent tree
4806		 */
4807		credits = ext4_chunk_trans_blocks(inode, max_blocks);
4808	}
4809	while (ret >= 0 && ret < max_blocks) {
4810		map.m_lblk += ret;
4811		map.m_len = (max_blocks -= ret);
4812		if (credits) {
4813			handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4814						    credits);
4815			if (IS_ERR(handle)) {
4816				ret = PTR_ERR(handle);
4817				break;
4818			}
4819		}
4820		ret = ext4_map_blocks(handle, inode, &map,
4821				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
4822		if (ret <= 0)
4823			ext4_warning(inode->i_sb,
4824				     "inode #%lu: block %u: len %u: "
4825				     "ext4_ext_map_blocks returned %d",
4826				     inode->i_ino, map.m_lblk,
4827				     map.m_len, ret);
4828		ret2 = ext4_mark_inode_dirty(handle, inode);
4829		if (credits) {
4830			ret3 = ext4_journal_stop(handle);
4831			if (unlikely(ret3))
4832				ret2 = ret3;
4833		}
4834
4835		if (ret <= 0 || ret2)
4836			break;
4837	}
4838	return ret > 0 ? ret2 : ret;
4839}
4840
4841int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end)
4842{
4843	int ret = 0, err = 0;
4844	struct ext4_io_end_vec *io_end_vec;
4845
4846	/*
4847	 * This is somewhat ugly but the idea is clear: when a transaction is
4848	 * reserved, everything goes into it. Otherwise we prefer to start
4849	 * several smaller transactions, converting each extent separately.
4850	 */
4851	if (handle) {
4852		handle = ext4_journal_start_reserved(handle,
4853						     EXT4_HT_EXT_CONVERT);
4854		if (IS_ERR(handle))
4855			return PTR_ERR(handle);
4856	}
4857
4858	list_for_each_entry(io_end_vec, &io_end->list_vec, list) {
4859		ret = ext4_convert_unwritten_extents(handle, io_end->inode,
4860						     io_end_vec->offset,
4861						     io_end_vec->size);
4862		if (ret)
4863			break;
4864	}
4865
4866	if (handle)
4867		err = ext4_journal_stop(handle);
4868
4869	return ret < 0 ? ret : err;
4870}
4871
4872static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap)
4873{
4874	__u64 physical = 0;
4875	__u64 length = 0;
4876	int blockbits = inode->i_sb->s_blocksize_bits;
4877	int error = 0;
4878	u16 iomap_type;
4879
4880	/* in-inode? */
4881	if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
4882		struct ext4_iloc iloc;
4883		int offset;	/* offset of xattr in inode */
4884
4885		error = ext4_get_inode_loc(inode, &iloc);
4886		if (error)
4887			return error;
4888		physical = (__u64)iloc.bh->b_blocknr << blockbits;
4889		offset = EXT4_GOOD_OLD_INODE_SIZE +
4890				EXT4_I(inode)->i_extra_isize;
4891		physical += offset;
4892		length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
4893		brelse(iloc.bh);
4894		iomap_type = IOMAP_INLINE;
4895	} else if (EXT4_I(inode)->i_file_acl) { /* external block */
4896		physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4897		length = inode->i_sb->s_blocksize;
4898		iomap_type = IOMAP_MAPPED;
4899	} else {
4900		/* no in-inode or external block for xattr, so return -ENOENT */
4901		error = -ENOENT;
4902		goto out;
4903	}
4904
4905	iomap->addr = physical;
4906	iomap->offset = 0;
4907	iomap->length = length;
4908	iomap->type = iomap_type;
4909	iomap->flags = 0;
4910out:
4911	return error;
4912}
4913
4914static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset,
4915				  loff_t length, unsigned flags,
4916				  struct iomap *iomap, struct iomap *srcmap)
4917{
4918	int error;
4919
4920	error = ext4_iomap_xattr_fiemap(inode, iomap);
4921	if (error == 0 && (offset >= iomap->length))
4922		error = -ENOENT;
4923	return error;
4924}
4925
4926static const struct iomap_ops ext4_iomap_xattr_ops = {
4927	.iomap_begin		= ext4_iomap_xattr_begin,
4928};
4929
4930static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
4931{
4932	u64 maxbytes;
4933
4934	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4935		maxbytes = inode->i_sb->s_maxbytes;
4936	else
4937		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
4938
4939	if (*len == 0)
4940		return -EINVAL;
4941	if (start > maxbytes)
4942		return -EFBIG;
4943
4944	/*
4945	 * Shrink request scope to what the fs can actually handle.
4946	 */
4947	if (*len > maxbytes || (maxbytes - *len) < start)
4948		*len = maxbytes - start;
4949	return 0;
4950}
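
/*
 * Example of the clamping above (illustrative numbers, not from the
 * original source): for an extent-mapped file on a filesystem where
 * s_maxbytes is 16 TiB, a request with start = 15 TiB and len = 4 TiB
 * exceeds the limit, so len is shrunk to maxbytes - start = 1 TiB, while
 * a request whose start lies beyond 16 TiB fails outright with -EFBIG.
 */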
4951
4952int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4953		u64 start, u64 len)
4954{
4955	int error = 0;
4956
4957	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
4958		error = ext4_ext_precache(inode);
4959		if (error)
4960			return error;
4961		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
4962	}
4963
4964	/*
4965	 * For bitmap files the maximum size limit could be smaller than
4966	 * s_maxbytes, so check len here manually instead of just relying on the
4967	 * generic check.
4968	 */
4969	error = ext4_fiemap_check_ranges(inode, start, &len);
4970	if (error)
4971		return error;
4972
4973	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
4974		fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
4975		return iomap_fiemap(inode, fieinfo, start, len,
4976				    &ext4_iomap_xattr_ops);
4977	}
4978
4979	return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
4980}
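
/*
 * ext4_fiemap() backs the FS_IOC_FIEMAP ioctl.  A hedged user-space
 * sketch (not part of the original source) that asks for the first
 * extent of an already open file descriptor fd:
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *	#include <linux/fiemap.h>
 *
 *	struct {
 *		struct fiemap fm;
 *		struct fiemap_extent fe[1];
 *	} req = { .fm = { .fm_length = ~0ULL, .fm_extent_count = 1 } };
 *
 *	if (ioctl(fd, FS_IOC_FIEMAP, &req.fm) == 0 &&
 *	    req.fm.fm_mapped_extents == 1)
 *		... req.fe[0].fe_logical / fe_physical / fe_length ...
 *
 * Requests with FIEMAP_FLAG_XATTR set are answered through
 * ext4_iomap_xattr_ops above instead of the regular extent walk.
 */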
4981
4982int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
4983		      __u64 start, __u64 len)
4984{
4985	ext4_lblk_t start_blk, len_blks;
4986	__u64 last_blk;
4987	int error = 0;
4988
4989	if (ext4_has_inline_data(inode)) {
4990		int has_inline;
4991
4992		down_read(&EXT4_I(inode)->xattr_sem);
4993		has_inline = ext4_has_inline_data(inode);
4994		up_read(&EXT4_I(inode)->xattr_sem);
4995		if (has_inline)
4996			return 0;
4997	}
4998
4999	if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
5000		error = ext4_ext_precache(inode);
5001		if (error)
5002			return error;
5003		fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
5004	}
5005
5006	error = fiemap_prep(inode, fieinfo, start, &len, 0);
5007	if (error)
5008		return error;
5009
5010	error = ext4_fiemap_check_ranges(inode, start, &len);
5011	if (error)
5012		return error;
5013
5014	start_blk = start >> inode->i_sb->s_blocksize_bits;
5015	last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits;
5016	if (last_blk >= EXT_MAX_BLOCKS)
5017		last_blk = EXT_MAX_BLOCKS-1;
5018	len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
5019
5020	/*
5021	 * Walk the extent tree gathering extent information
5022	 * and pushing extents back to the user.
5023	 */
5024	return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo);
5025}
5026
5027/*
5028 * ext4_ext_shift_path_extents:
5029 * Shift the extents of a path structure lying between path[depth].p_ext
5030 * and EXT_LAST_EXTENT(path[depth].p_hdr) by @shift blocks. @SHIFT tells
5031 * whether it is a right shift or a left shift operation.
5032 */
5033static int
5034ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
5035			    struct inode *inode, handle_t *handle,
5036			    enum SHIFT_DIRECTION SHIFT)
5037{
5038	int depth, err = 0;
5039	struct ext4_extent *ex_start, *ex_last;
5040	bool update = false;
5041	int credits, restart_credits;
5042	depth = path->p_depth;
5043
5044	while (depth >= 0) {
5045		if (depth == path->p_depth) {
5046			ex_start = path[depth].p_ext;
5047			if (!ex_start)
5048				return -EFSCORRUPTED;
5049
5050			ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
5051			/* leaf + sb + inode */
5052			credits = 3;
5053			if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) {
5054				update = true;
5055				/* extent tree + sb + inode */
5056				credits = depth + 2;
5057			}
5058
5059			restart_credits = ext4_writepage_trans_blocks(inode);
5060			err = ext4_datasem_ensure_credits(handle, inode, credits,
5061					restart_credits, 0);
5062			if (err) {
5063				if (err > 0)
5064					err = -EAGAIN;
5065				goto out;
5066			}
5067
5068			err = ext4_ext_get_access(handle, inode, path + depth);
5069			if (err)
5070				goto out;
5071
5072			while (ex_start <= ex_last) {
5073				if (SHIFT == SHIFT_LEFT) {
5074					le32_add_cpu(&ex_start->ee_block,
5075						-shift);
5076					/* Try to merge to the left. */
5077					if ((ex_start >
5078					    EXT_FIRST_EXTENT(path[depth].p_hdr))
5079					    &&
5080					    ext4_ext_try_to_merge_right(inode,
5081					    path, ex_start - 1))
5082						ex_last--;
5083					else
5084						ex_start++;
5085				} else {
5086					le32_add_cpu(&ex_last->ee_block, shift);
5087					ext4_ext_try_to_merge_right(inode, path,
5088						ex_last);
5089					ex_last--;
5090				}
5091			}
5092			err = ext4_ext_dirty(handle, inode, path + depth);
5093			if (err)
5094				goto out;
5095
5096			if (--depth < 0 || !update)
5097				break;
5098		}
5099
5100		/* Update index too */
5101		err = ext4_ext_get_access(handle, inode, path + depth);
5102		if (err)
5103			goto out;
5104
5105		if (SHIFT == SHIFT_LEFT)
5106			le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
5107		else
5108			le32_add_cpu(&path[depth].p_idx->ei_block, shift);
5109		err = ext4_ext_dirty(handle, inode, path + depth);
5110		if (err)
5111			goto out;
5112
5113		/* we are done if current index is not a starting index */
5114		if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
5115			break;
5116
5117		depth--;
5118	}
5119
5120out:
5121	return err;
5122}
5123
5124/*
5125 * ext4_ext_shift_extents:
5126 * All the extents which lie in the range from @start to the last allocated
5127 * block for the @inode are shifted either towards the left or the right
5128 * (depending upon @SHIFT) by @shift blocks.
5129 * On success, 0 is returned, error otherwise.
5130 */
5131static int
5132ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5133		       ext4_lblk_t start, ext4_lblk_t shift,
5134		       enum SHIFT_DIRECTION SHIFT)
5135{
5136	struct ext4_ext_path *path;
5137	int ret = 0, depth;
5138	struct ext4_extent *extent;
5139	ext4_lblk_t stop, *iterator, ex_start, ex_end;
5140	ext4_lblk_t tmp = EXT_MAX_BLOCKS;
5141
5142	/* Let path point to the last extent */
5143	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
5144				EXT4_EX_NOCACHE);
5145	if (IS_ERR(path))
5146		return PTR_ERR(path);
5147
5148	depth = path->p_depth;
5149	extent = path[depth].p_ext;
5150	if (!extent)
5151		goto out;
5152
5153	stop = le32_to_cpu(extent->ee_block);
5154
5155	/*
5156	 * For left shifts, make sure the hole on the left is big enough to
5157	 * accommodate the shift.  For right shifts, make sure the last extent
5158	 * won't be shifted beyond EXT_MAX_BLOCKS.
5159	 */
5160	if (SHIFT == SHIFT_LEFT) {
5161		path = ext4_find_extent(inode, start - 1, &path,
5162					EXT4_EX_NOCACHE);
5163		if (IS_ERR(path))
5164			return PTR_ERR(path);
5165		depth = path->p_depth;
5166		extent =  path[depth].p_ext;
5167		if (extent) {
5168			ex_start = le32_to_cpu(extent->ee_block);
5169			ex_end = le32_to_cpu(extent->ee_block) +
5170				ext4_ext_get_actual_len(extent);
5171		} else {
5172			ex_start = 0;
5173			ex_end = 0;
5174		}
5175
5176		if ((start == ex_start && shift > ex_start) ||
5177		    (shift > start - ex_end)) {
5178			ret = -EINVAL;
5179			goto out;
5180		}
5181	} else {
5182		if (shift > EXT_MAX_BLOCKS -
5183		    (stop + ext4_ext_get_actual_len(extent))) {
5184			ret = -EINVAL;
5185			goto out;
5186		}
5187	}
5188
5189	/*
5190	 * In case of left shift, iterator points to start and it is increased
5191	 * till we reach stop. In case of right shift, iterator points to stop
5192	 * and it is decreased till we reach start.
5193	 */
5194again:
5195	ret = 0;
5196	if (SHIFT == SHIFT_LEFT)
5197		iterator = &start;
5198	else
5199		iterator = &stop;
5200
5201	if (tmp != EXT_MAX_BLOCKS)
5202		*iterator = tmp;
5203
5204	/*
5205	 * It's safe to start updating extents.  Start and stop are unsigned, so
5206	 * in case of right shift if extent with 0 block is reached, iterator
5207	 * becomes NULL to indicate the end of the loop.
5208	 */
5209	while (iterator && start <= stop) {
5210		path = ext4_find_extent(inode, *iterator, &path,
5211					EXT4_EX_NOCACHE);
5212		if (IS_ERR(path))
5213			return PTR_ERR(path);
5214		depth = path->p_depth;
5215		extent = path[depth].p_ext;
5216		if (!extent) {
5217			EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
5218					 (unsigned long) *iterator);
5219			return -EFSCORRUPTED;
5220		}
5221		if (SHIFT == SHIFT_LEFT && *iterator >
5222		    le32_to_cpu(extent->ee_block)) {
5223			/* Hole, move to the next extent */
5224			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5225				path[depth].p_ext++;
5226			} else {
5227				*iterator = ext4_ext_next_allocated_block(path);
5228				continue;
5229			}
5230		}
5231
5232		tmp = *iterator;
5233		if (SHIFT == SHIFT_LEFT) {
5234			extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5235			*iterator = le32_to_cpu(extent->ee_block) +
5236					ext4_ext_get_actual_len(extent);
5237		} else {
5238			extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
5239			if (le32_to_cpu(extent->ee_block) > start)
5240				*iterator = le32_to_cpu(extent->ee_block) - 1;
5241			else if (le32_to_cpu(extent->ee_block) == start)
5242				iterator = NULL;
5243			else {
5244				extent = EXT_LAST_EXTENT(path[depth].p_hdr);
5245				while (le32_to_cpu(extent->ee_block) >= start)
5246					extent--;
5247
5248				if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
5249					break;
5250
5251				extent++;
5252				iterator = NULL;
5253			}
5254			path[depth].p_ext = extent;
5255		}
5256		ret = ext4_ext_shift_path_extents(path, shift, inode,
5257				handle, SHIFT);
5258		/* iterator can be NULL which means we should break */
5259		if (ret == -EAGAIN)
5260			goto again;
5261		if (ret)
5262			break;
5263	}
5264out:
5265	ext4_ext_drop_refs(path);
5266	kfree(path);
5267	return ret;
5268}
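
/*
 * Illustration (not from the original source): collapsing blocks [16, 20)
 * of a file ends up calling this with start = 20, shift = 4 and
 * SHIFT_LEFT, so an extent that used to begin at logical block 20 is
 * rewritten to begin at block 16.  Inserting a 4-block hole at block 16
 * instead uses SHIFT_RIGHT, moving the same extent from block 20 to
 * block 24, while the physical data blocks stay where they are.
 */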
5269
5270/*
5271 * ext4_collapse_range:
5272 * This implements fallocate's collapse range functionality for ext4.
5273 * Returns 0 on success and a non-zero value on error.
5274 */
5275static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
5276{
5277	struct inode *inode = file_inode(file);
5278	struct super_block *sb = inode->i_sb;
5279	ext4_lblk_t punch_start, punch_stop;
5280	handle_t *handle;
5281	unsigned int credits;
5282	loff_t new_size, ioffset;
5283	int ret;
5284
5285	/*
5286	 * We need to test this early because xfstests assumes that a
5287	 * collapse range of (0, 1) will return EOPNOTSUPP if the file
5288	 * system does not support collapse range.
5289	 */
5290	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5291		return -EOPNOTSUPP;
5292
5293	/* Collapse range works only on fs cluster size aligned regions. */
5294	if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5295		return -EINVAL;
5296
5297	trace_ext4_collapse_range(inode, offset, len);
5298
5299	punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5300	punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
5301
5302	/* Call ext4_force_commit to flush all data in case of data=journal. */
5303	if (ext4_should_journal_data(inode)) {
5304		ret = ext4_force_commit(inode->i_sb);
5305		if (ret)
5306			return ret;
5307	}
5308
5309	inode_lock(inode);
5310	/*
5311	 * There is no need for a collapse range to overlap EOF; in that case
5312	 * it is effectively a truncate operation.
5313	 */
5314	if (offset + len >= inode->i_size) {
5315		ret = -EINVAL;
5316		goto out_mutex;
5317	}
5318
5319	/* Currently just for extent based files */
5320	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5321		ret = -EOPNOTSUPP;
5322		goto out_mutex;
5323	}
5324
5325	/* Wait for existing dio to complete */
5326	inode_dio_wait(inode);
5327
5328	ret = file_modified(file);
5329	if (ret)
5330		goto out_mutex;
5331
5332	/*
5333	 * Prevent page faults from reinstantiating pages we have released from
5334	 * page cache.
5335	 */
5336	down_write(&EXT4_I(inode)->i_mmap_sem);
5337
5338	ret = ext4_break_layouts(inode);
5339	if (ret)
5340		goto out_mmap;
5341
5342	/*
5343	 * Need to round down offset to be aligned with page size boundary
5344	 * for page size > block size.
5345	 */
5346	ioffset = round_down(offset, PAGE_SIZE);
5347	/*
5348	 * Write tail of the last page before removed range since it will get
5349	 * removed from the page cache below.
5350	 */
5351	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
5352	if (ret)
5353		goto out_mmap;
5354	/*
5355	 * Write out data that will be shifted to preserve it when discarding
5356	 * page cache below. We are also protected from pages becoming dirty
5357	 * by i_mmap_sem.
5358	 */
5359	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
5360					   LLONG_MAX);
5361	if (ret)
5362		goto out_mmap;
5363	truncate_pagecache(inode, ioffset);
5364
5365	credits = ext4_writepage_trans_blocks(inode);
5366	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5367	if (IS_ERR(handle)) {
5368		ret = PTR_ERR(handle);
5369		goto out_mmap;
5370	}
5371	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
5372
5373	down_write(&EXT4_I(inode)->i_data_sem);
5374	ext4_discard_preallocations(inode, 0);
5375
5376	ret = ext4_es_remove_extent(inode, punch_start,
5377				    EXT_MAX_BLOCKS - punch_start);
5378	if (ret) {
5379		up_write(&EXT4_I(inode)->i_data_sem);
5380		goto out_stop;
5381	}
5382
5383	ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
5384	if (ret) {
5385		up_write(&EXT4_I(inode)->i_data_sem);
5386		goto out_stop;
5387	}
5388	ext4_discard_preallocations(inode, 0);
5389
5390	ret = ext4_ext_shift_extents(inode, handle, punch_stop,
5391				     punch_stop - punch_start, SHIFT_LEFT);
5392	if (ret) {
5393		up_write(&EXT4_I(inode)->i_data_sem);
5394		goto out_stop;
5395	}
5396
5397	new_size = inode->i_size - len;
5398	i_size_write(inode, new_size);
5399	EXT4_I(inode)->i_disksize = new_size;
5400
5401	up_write(&EXT4_I(inode)->i_data_sem);
5402	if (IS_SYNC(inode))
5403		ext4_handle_sync(handle);
5404	inode->i_mtime = inode->i_ctime = current_time(inode);
5405	ret = ext4_mark_inode_dirty(handle, inode);
5406	ext4_update_inode_fsync_trans(handle, inode, 1);
5407
5408out_stop:
5409	ext4_journal_stop(handle);
5410	ext4_fc_stop_ineligible(sb);
5411out_mmap:
5412	up_write(&EXT4_I(inode)->i_mmap_sem);
5413out_mutex:
5414	inode_unlock(inode);
5415	return ret;
5416}
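
/*
 * Hedged usage sketch (not part of the original file): how userspace
 * might exercise the collapse path above via fallocate(2).  Offset and
 * length must be aligned to the filesystem cluster size (the block size
 * when bigalloc is off) and the range must end before EOF, matching the
 * -EINVAL checks in ext4_collapse_range().  Names below are illustrative.
 */
#if 0	/* example only, never compiled with the kernel */
#define _GNU_SOURCE
#include <fcntl.h>	/* fallocate(), FALLOC_FL_COLLAPSE_RANGE (glibc) */

static int collapse_first_block(int fd, off_t blksize)
{
	/* Remove the first 'blksize' bytes; all later data shifts left. */
	return fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, blksize);
}
#endif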
5417
5418/*
5419 * ext4_insert_range:
5420 * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
5421 * The data blocks starting from @offset to the EOF are shifted by @len
5422 * towards right to create a hole in the @inode. Inode size is increased
5423 * by len bytes.
5424 * Returns 0 on success, error otherwise.
5425 */
5426static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
5427{
5428	struct inode *inode = file_inode(file);
5429	struct super_block *sb = inode->i_sb;
5430	handle_t *handle;
5431	struct ext4_ext_path *path;
5432	struct ext4_extent *extent;
5433	ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
5434	unsigned int credits, ee_len;
5435	int ret = 0, depth, split_flag = 0;
5436	loff_t ioffset;
5437
5438	/*
5439	 * We need to test this early because xfstests assumes that an
5440	 * insert range of (0, 1) will return EOPNOTSUPP if the file
5441	 * system does not support insert range.
5442	 */
5443	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
5444		return -EOPNOTSUPP;
5445
5446	/* Insert range works only on fs cluster size aligned regions. */
5447	if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
5448		return -EINVAL;
5449
5450	trace_ext4_insert_range(inode, offset, len);
5451
5452	offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
5453	len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
5454
5455	/* Call ext4_force_commit to flush all data in case of data=journal */
5456	if (ext4_should_journal_data(inode)) {
5457		ret = ext4_force_commit(inode->i_sb);
5458		if (ret)
5459			return ret;
5460	}
5461
5462	inode_lock(inode);
5463	/* Currently just for extent based files */
5464	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5465		ret = -EOPNOTSUPP;
5466		goto out_mutex;
5467	}
5468
5469	/* Check whether the maximum file size would be exceeded */
5470	if (len > inode->i_sb->s_maxbytes - inode->i_size) {
5471		ret = -EFBIG;
5472		goto out_mutex;
5473	}
5474
5475	/* Offset must be less than i_size */
5476	if (offset >= inode->i_size) {
5477		ret = -EINVAL;
5478		goto out_mutex;
5479	}
5480
5481	/* Wait for existing dio to complete */
5482	inode_dio_wait(inode);
5483
5484	ret = file_modified(file);
5485	if (ret)
5486		goto out_mutex;
5487
5488	/*
5489	 * Prevent page faults from reinstantiating pages we have released from
5490	 * page cache.
5491	 */
5492	down_write(&EXT4_I(inode)->i_mmap_sem);
5493
5494	ret = ext4_break_layouts(inode);
5495	if (ret)
5496		goto out_mmap;
5497
5498	/*
5499	 * Need to round down to align start offset to page size boundary
5500	 * for page size > block size.
5501	 */
5502	ioffset = round_down(offset, PAGE_SIZE);
5503	/* Write out all dirty pages */
5504	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
5505			LLONG_MAX);
5506	if (ret)
5507		goto out_mmap;
5508	truncate_pagecache(inode, ioffset);
5509
5510	credits = ext4_writepage_trans_blocks(inode);
5511	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
5512	if (IS_ERR(handle)) {
5513		ret = PTR_ERR(handle);
5514		goto out_mmap;
5515	}
5516	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
5517
5518	/* Expand file to avoid data loss if an error occurs while shifting */
5519	inode->i_size += len;
5520	EXT4_I(inode)->i_disksize += len;
5521	inode->i_mtime = inode->i_ctime = current_time(inode);
5522	ret = ext4_mark_inode_dirty(handle, inode);
5523	if (ret)
5524		goto out_stop;
5525
5526	down_write(&EXT4_I(inode)->i_data_sem);
5527	ext4_discard_preallocations(inode, 0);
5528
5529	path = ext4_find_extent(inode, offset_lblk, NULL, 0);
5530	if (IS_ERR(path)) {
5531		up_write(&EXT4_I(inode)->i_data_sem);
5532		goto out_stop;
5533	}
5534
5535	depth = ext_depth(inode);
5536	extent = path[depth].p_ext;
5537	if (extent) {
5538		ee_start_lblk = le32_to_cpu(extent->ee_block);
5539		ee_len = ext4_ext_get_actual_len(extent);
5540
5541		/*
5542		 * If offset_lblk is not the starting block of the extent, split
5543		 * the extent at @offset_lblk
5544		 */
5545		if ((offset_lblk > ee_start_lblk) &&
5546				(offset_lblk < (ee_start_lblk + ee_len))) {
5547			if (ext4_ext_is_unwritten(extent))
5548				split_flag = EXT4_EXT_MARK_UNWRIT1 |
5549					EXT4_EXT_MARK_UNWRIT2;
5550			ret = ext4_split_extent_at(handle, inode, &path,
5551					offset_lblk, split_flag,
5552					EXT4_EX_NOCACHE |
5553					EXT4_GET_BLOCKS_PRE_IO |
5554					EXT4_GET_BLOCKS_METADATA_NOFAIL);
5555		}
5556
5557		ext4_ext_drop_refs(path);
5558		kfree(path);
5559		if (ret < 0) {
5560			up_write(&EXT4_I(inode)->i_data_sem);
5561			goto out_stop;
5562		}
5563	} else {
5564		ext4_ext_drop_refs(path);
5565		kfree(path);
5566	}
5567
5568	ret = ext4_es_remove_extent(inode, offset_lblk,
5569			EXT_MAX_BLOCKS - offset_lblk);
5570	if (ret) {
5571		up_write(&EXT4_I(inode)->i_data_sem);
5572		goto out_stop;
5573	}
5574
5575	/*
5576	 * If offset_lblk lies in a hole at the start of the file, use
5577	 * ee_start_lblk to shift extents
5578	 */
5579	ret = ext4_ext_shift_extents(inode, handle,
5580		ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
5581		len_lblk, SHIFT_RIGHT);
5582
5583	up_write(&EXT4_I(inode)->i_data_sem);
5584	if (IS_SYNC(inode))
5585		ext4_handle_sync(handle);
5586	if (ret >= 0)
5587		ext4_update_inode_fsync_trans(handle, inode, 1);
5588
5589out_stop:
5590	ext4_journal_stop(handle);
5591	ext4_fc_stop_ineligible(sb);
5592out_mmap:
5593	up_write(&EXT4_I(inode)->i_mmap_sem);
5594out_mutex:
5595	inode_unlock(inode);
5596	return ret;
5597}
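
/*
 * Hedged usage sketch (not part of the original file): inserting a hole
 * with fallocate(2).  Offset and length must be cluster-aligned, offset
 * must be strictly below i_size, and on success i_size grows by 'len',
 * mirroring the checks and the size update in ext4_insert_range() above.
 * Names below are illustrative.
 */
#if 0	/* example only, never compiled with the kernel */
#define _GNU_SOURCE
#include <fcntl.h>	/* fallocate(), FALLOC_FL_INSERT_RANGE (glibc) */

static int insert_hole(int fd, off_t offset, off_t len)
{
	/* Data at and beyond 'offset' shifts right by 'len' bytes. */
	return fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len);
}
#endif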
5598
5599/**
5600 * ext4_swap_extents() - Swap extents between two inodes
5601 * @handle: handle for this transaction
5602 * @inode1:	First inode
5603 * @inode2:	Second inode
5604 * @lblk1:	Start block for first inode
5605 * @lblk2:	Start block for second inode
5606 * @count:	Number of blocks to swap
5607 * @unwritten: Mark second inode's extents as unwritten after swap
5608 * @erp:	Pointer to save error value
5609 *
5610 * This helper routine does exactly what its name promises: swap extents.
5611 * Everything else, such as page-cache locking consistency, bh mapping
5612 * consistency, or copying of the extents' data, must be done by the caller.
5613 * Locking:
5614 * 		i_mutex is held for both inodes
5615 * 		i_data_sem is locked for write for both inodes
5616 * Assumptions:
5617 *		All pages from requested range are locked for both inodes
5618 */
5619int
5620ext4_swap_extents(handle_t *handle, struct inode *inode1,
5621		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5622		  ext4_lblk_t count, int unwritten, int *erp)
5623{
5624	struct ext4_ext_path *path1 = NULL;
5625	struct ext4_ext_path *path2 = NULL;
5626	int replaced_count = 0;
5627
5628	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5629	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5630	BUG_ON(!inode_is_locked(inode1));
5631	BUG_ON(!inode_is_locked(inode2));
5632
5633	*erp = ext4_es_remove_extent(inode1, lblk1, count);
5634	if (unlikely(*erp))
5635		return 0;
5636	*erp = ext4_es_remove_extent(inode2, lblk2, count);
5637	if (unlikely(*erp))
5638		return 0;
5639
5640	while (count) {
5641		struct ext4_extent *ex1, *ex2, tmp_ex;
5642		ext4_lblk_t e1_blk, e2_blk;
5643		int e1_len, e2_len, len;
5644		int split = 0;
5645
5646		path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5647		if (IS_ERR(path1)) {
5648			*erp = PTR_ERR(path1);
5649			path1 = NULL;
5650		finish:
5651			count = 0;
5652			goto repeat;
5653		}
5654		path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5655		if (IS_ERR(path2)) {
5656			*erp = PTR_ERR(path2);
5657			path2 = NULL;
5658			goto finish;
5659		}
5660		ex1 = path1[path1->p_depth].p_ext;
5661		ex2 = path2[path2->p_depth].p_ext;
5662		/* Do we have something to swap ? */
5663		if (unlikely(!ex2 || !ex1))
5664			goto finish;
5665
5666		e1_blk = le32_to_cpu(ex1->ee_block);
5667		e2_blk = le32_to_cpu(ex2->ee_block);
5668		e1_len = ext4_ext_get_actual_len(ex1);
5669		e2_len = ext4_ext_get_actual_len(ex2);
5670
5671		/* Hole handling */
5672		if (!in_range(lblk1, e1_blk, e1_len) ||
5673		    !in_range(lblk2, e2_blk, e2_len)) {
5674			ext4_lblk_t next1, next2;
5675
5676			/* If hole after extent, then go to next extent */
5677			next1 = ext4_ext_next_allocated_block(path1);
5678			next2 = ext4_ext_next_allocated_block(path2);
5679			/* If hole before extent, then shift to that extent */
5680			if (e1_blk > lblk1)
5681				next1 = e1_blk;
5682			if (e2_blk > lblk2)
5683				next2 = e2_blk;
5684			/* Do we have something to swap */
5685			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5686				goto finish;
5687			/* Move to the rightmost boundary */
5688			len = next1 - lblk1;
5689			if (len < next2 - lblk2)
5690				len = next2 - lblk2;
5691			if (len > count)
5692				len = count;
5693			lblk1 += len;
5694			lblk2 += len;
5695			count -= len;
5696			goto repeat;
5697		}
5698
5699		/* Prepare left boundary */
5700		if (e1_blk < lblk1) {
5701			split = 1;
5702			*erp = ext4_force_split_extent_at(handle, inode1,
5703						&path1, lblk1, 0);
5704			if (unlikely(*erp))
5705				goto finish;
5706		}
5707		if (e2_blk < lblk2) {
5708			split = 1;
5709			*erp = ext4_force_split_extent_at(handle, inode2,
5710						&path2,  lblk2, 0);
5711			if (unlikely(*erp))
5712				goto finish;
5713		}
5714		/* ext4_split_extent_at() may result in a leaf extent split;
5715		 * the path must be revalidated. */
5716		if (split)
5717			goto repeat;
5718
5719		/* Prepare right boundary */
5720		len = count;
5721		if (len > e1_blk + e1_len - lblk1)
5722			len = e1_blk + e1_len - lblk1;
5723		if (len > e2_blk + e2_len - lblk2)
5724			len = e2_blk + e2_len - lblk2;
5725
5726		if (len != e1_len) {
5727			split = 1;
5728			*erp = ext4_force_split_extent_at(handle, inode1,
5729						&path1, lblk1 + len, 0);
5730			if (unlikely(*erp))
5731				goto finish;
5732		}
5733		if (len != e2_len) {
5734			split = 1;
5735			*erp = ext4_force_split_extent_at(handle, inode2,
5736						&path2, lblk2 + len, 0);
5737			if (*erp)
5738				goto finish;
5739		}
5740		/* ext4_split_extent_at() may result in a leaf extent split;
5741		 * the path must be revalidated. */
5742		if (split)
5743			goto repeat;
5744
5745		BUG_ON(e2_len != e1_len);
5746		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5747		if (unlikely(*erp))
5748			goto finish;
5749		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5750		if (unlikely(*erp))
5751			goto finish;
5752
5753		/* Both extents are fully inside boundaries. Swap it now */
5754		tmp_ex = *ex1;
5755		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5756		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5757		ex1->ee_len = cpu_to_le16(e2_len);
5758		ex2->ee_len = cpu_to_le16(e1_len);
5759		if (unwritten)
5760			ext4_ext_mark_unwritten(ex2);
5761		if (ext4_ext_is_unwritten(&tmp_ex))
5762			ext4_ext_mark_unwritten(ex1);
5763
5764		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5765		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5766		*erp = ext4_ext_dirty(handle, inode2, path2 +
5767				      path2->p_depth);
5768		if (unlikely(*erp))
5769			goto finish;
5770		*erp = ext4_ext_dirty(handle, inode1, path1 +
5771				      path1->p_depth);
5772		/*
5773		 * Looks scary, eh? The second inode already points to the new
5774		 * blocks and has been successfully dirtied. Luckily, an error can
5775		 * only be caused by a journal failure, so the full transaction will
5776		 * be aborted anyway.
5777		 */
5778		if (unlikely(*erp))
5779			goto finish;
5780		lblk1 += len;
5781		lblk2 += len;
5782		replaced_count += len;
5783		count -= len;
5784
5785	repeat:
5786		ext4_ext_drop_refs(path1);
5787		kfree(path1);
5788		ext4_ext_drop_refs(path2);
5789		kfree(path2);
5790		path1 = path2 = NULL;
5791	}
5792	return replaced_count;
5793}
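
/*
 * Hedged caller-side sketch (not part of the original file):
 * ext4_swap_extents() returns the number of blocks actually swapped and
 * reports failures only through @erp, so both values must be inspected.
 * The locking documented above (i_mutex and i_data_sem held for both
 * inodes) is assumed to be taken already; the helper name is hypothetical.
 */
#if 0	/* example only, never compiled with the kernel */
static int example_swap(handle_t *handle, struct inode *inode1,
			struct inode *inode2, ext4_lblk_t lblk1,
			ext4_lblk_t lblk2, ext4_lblk_t count)
{
	int err = 0;
	int swapped;

	swapped = ext4_swap_extents(handle, inode1, inode2, lblk1, lblk2,
				    count, 1 /* unwritten */, &err);
	if (err)
		return err;	/* note: 'swapped' blocks were still exchanged */
	return swapped;
}
#endif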
5794
5795/*
5796 * ext4_clu_mapped - determine whether any block in a logical cluster has
5797 *                   been mapped to a physical cluster
5798 *
5799 * @inode - file containing the logical cluster
5800 * @lclu - logical cluster of interest
5801 *
5802 * Returns 1 if any block in the logical cluster is mapped, signifying
5803 * that a physical cluster has been allocated for it.  Otherwise,
5804 * returns 0.  Can also return negative error codes.  Derived from
5805 * ext4_ext_map_blocks().
5806 */
5807int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
5808{
5809	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5810	struct ext4_ext_path *path;
5811	int depth, mapped = 0, err = 0;
5812	struct ext4_extent *extent;
5813	ext4_lblk_t first_lblk, first_lclu, last_lclu;
5814
5815	/*
5816	 * if data can be stored inline, the logical cluster isn't
5817	 * mapped - no physical clusters have been allocated, and the
5818	 * file has no extents
5819	 */
5820	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
5821	    ext4_has_inline_data(inode))
5822		return 0;
5823
5824	/* search for the extent closest to the first block in the cluster */
5825	path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
5826	if (IS_ERR(path)) {
5827		err = PTR_ERR(path);
5828		path = NULL;
5829		goto out;
5830	}
5831
5832	depth = ext_depth(inode);
5833
5834	/*
5835	 * A consistent leaf must not be empty.  This situation is possible,
5836	 * though, _during_ tree modification, and it's why an assert can't
5837	 * be put in ext4_find_extent().
5838	 */
5839	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
5840		EXT4_ERROR_INODE(inode,
5841		    "bad extent address - lblock: %lu, depth: %d, pblock: %lld",
5842				 (unsigned long) EXT4_C2B(sbi, lclu),
5843				 depth, path[depth].p_block);
5844		err = -EFSCORRUPTED;
5845		goto out;
5846	}
5847
5848	extent = path[depth].p_ext;
5849
5850	/* can't be mapped if the extent tree is empty */
5851	if (extent == NULL)
5852		goto out;
5853
5854	first_lblk = le32_to_cpu(extent->ee_block);
5855	first_lclu = EXT4_B2C(sbi, first_lblk);
5856
5857	/*
5858	 * Three possible outcomes at this point - found extent spanning
5859	 * the target cluster, to the left of the target cluster, or to the
5860	 * right of the target cluster.  The first two cases are handled here.
5861	 * The last case indicates the target cluster is not mapped.
5862	 */
5863	if (lclu >= first_lclu) {
5864		last_lclu = EXT4_B2C(sbi, first_lblk +
5865				     ext4_ext_get_actual_len(extent) - 1);
5866		if (lclu <= last_lclu) {
5867			mapped = 1;
5868		} else {
5869			first_lblk = ext4_ext_next_allocated_block(path);
5870			first_lclu = EXT4_B2C(sbi, first_lblk);
5871			if (lclu == first_lclu)
5872				mapped = 1;
5873		}
5874	}
5875
5876out:
5877	ext4_ext_drop_refs(path);
5878	kfree(path);
5879
5880	return err ? err : mapped;
5881}
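
/*
 * Hedged usage sketch (not part of the original file): with bigalloc and
 * 16 blocks per cluster, logical block 50 lives in logical cluster 3
 * (blocks 48..63), so ext4_clu_mapped() reports 1 if any extent overlaps
 * that range.  The helper name below is hypothetical.
 */
#if 0	/* example only, never compiled with the kernel */
static int example_cluster_has_backing(struct inode *inode, ext4_lblk_t lblk)
{
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	/* Returns 1 if mapped, 0 if not, or a negative error code. */
	return ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk));
}
#endif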
5882
5883/*
5884 * Updates the physical block address and unwritten status of the extent
5885 * starting at logical block @start and of length @len. If such an extent
5886 * doesn't exist, this function splits the extent tree appropriately to
5887 * create one.  This function is called in the fast commit
5888 * replay path.  Returns 0 on success and error on failure.
5889 */
5890int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
5891			      int len, int unwritten, ext4_fsblk_t pblk)
5892{
5893	struct ext4_ext_path *path = NULL, *ppath;
5894	struct ext4_extent *ex;
5895	int ret;
5896
5897	path = ext4_find_extent(inode, start, NULL, 0);
5898	if (IS_ERR(path))
5899		return PTR_ERR(path);
5900	ex = path[path->p_depth].p_ext;
5901	if (!ex) {
5902		ret = -EFSCORRUPTED;
5903		goto out;
5904	}
5905
5906	if (le32_to_cpu(ex->ee_block) != start ||
5907		ext4_ext_get_actual_len(ex) != len) {
5908		/* We need to split this extent to match our extent first */
5909		ppath = path;
5910		down_write(&EXT4_I(inode)->i_data_sem);
5911		ret = ext4_force_split_extent_at(NULL, inode, &ppath, start, 1);
5912		up_write(&EXT4_I(inode)->i_data_sem);
5913		if (ret)
5914			goto out;
5915		kfree(path);
5916		path = ext4_find_extent(inode, start, NULL, 0);
5917		if (IS_ERR(path))
5918			return -1;
5919		ppath = path;
5920		ex = path[path->p_depth].p_ext;
5921		WARN_ON(le32_to_cpu(ex->ee_block) != start);
5922		if (ext4_ext_get_actual_len(ex) != len) {
5923			down_write(&EXT4_I(inode)->i_data_sem);
5924			ret = ext4_force_split_extent_at(NULL, inode, &ppath,
5925							 start + len, 1);
5926			up_write(&EXT4_I(inode)->i_data_sem);
5927			if (ret)
5928				goto out;
5929			kfree(path);
5930			path = ext4_find_extent(inode, start, NULL, 0);
5931			if (IS_ERR(path))
5932				return -EINVAL;
5933			ex = path[path->p_depth].p_ext;
5934		}
5935	}
5936	if (unwritten)
5937		ext4_ext_mark_unwritten(ex);
5938	else
5939		ext4_ext_mark_initialized(ex);
5940	ext4_ext_store_pblock(ex, pblk);
5941	down_write(&EXT4_I(inode)->i_data_sem);
5942	ret = ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
5943	up_write(&EXT4_I(inode)->i_data_sem);
5944out:
5945	ext4_ext_drop_refs(path);
5946	kfree(path);
5947	ext4_mark_inode_dirty(NULL, inode);
5948	return ret;
5949}
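
/*
 * Hedged worked example (not part of the original file): if the tree
 * holds a single extent covering logical blocks 100..149 and fast-commit
 * replay calls ext4_ext_replay_update_ex() with start == 110 and
 * len == 20, the function first splits at block 110, then at block 130,
 * and finally rewrites the physical block and unwritten bit of the
 * resulting 110..129 extent.
 */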
5950
5951/* Try to shrink the extent tree */
5952void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end)
5953{
5954	struct ext4_ext_path *path = NULL;
5955	struct ext4_extent *ex;
5956	ext4_lblk_t old_cur, cur = 0;
5957
5958	while (cur < end) {
5959		path = ext4_find_extent(inode, cur, NULL, 0);
5960		if (IS_ERR(path))
5961			return;
5962		ex = path[path->p_depth].p_ext;
5963		if (!ex) {
5964			ext4_ext_drop_refs(path);
5965			kfree(path);
5966			ext4_mark_inode_dirty(NULL, inode);
5967			return;
5968		}
5969		old_cur = cur;
5970		cur = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
5971		if (cur <= old_cur)
5972			cur = old_cur + 1;
5973		ext4_ext_try_to_merge(NULL, inode, path, ex);
5974		down_write(&EXT4_I(inode)->i_data_sem);
5975		ext4_ext_dirty(NULL, inode, &path[path->p_depth]);
5976		up_write(&EXT4_I(inode)->i_data_sem);
5977		ext4_mark_inode_dirty(NULL, inode);
5978		ext4_ext_drop_refs(path);
5979		kfree(path);
5980	}
5981}
5982
5983/* Check if *cur is a hole and if it is, skip it */
5984static int skip_hole(struct inode *inode, ext4_lblk_t *cur)
5985{
5986	int ret;
5987	struct ext4_map_blocks map;
5988
5989	map.m_lblk = *cur;
5990	map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur;
5991
5992	ret = ext4_map_blocks(NULL, inode, &map, 0);
5993	if (ret < 0)
5994		return ret;
5995	if (ret != 0)
5996		return 0;
5997	*cur = *cur + map.m_len;
5998	return 0;
5999}
6000
6001/* Count number of blocks used by this inode and update i_blocks */
6002int ext4_ext_replay_set_iblocks(struct inode *inode)
6003{
6004	struct ext4_ext_path *path = NULL, *path2 = NULL;
6005	struct ext4_extent *ex;
6006	ext4_lblk_t cur = 0, end;
6007	int numblks = 0, i, ret = 0;
6008	ext4_fsblk_t cmp1, cmp2;
6009	struct ext4_map_blocks map;
6010
6011	/* Determine the size of the file first */
6012	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
6013					EXT4_EX_NOCACHE);
6014	if (IS_ERR(path))
6015		return PTR_ERR(path);
6016	ex = path[path->p_depth].p_ext;
6017	if (!ex) {
6018		ext4_ext_drop_refs(path);
6019		kfree(path);
6020		goto out;
6021	}
6022	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
6023	ext4_ext_drop_refs(path);
6024	kfree(path);
6025
6026	/* Count the number of data blocks */
6027	cur = 0;
6028	while (cur < end) {
6029		map.m_lblk = cur;
6030		map.m_len = end - cur;
6031		ret = ext4_map_blocks(NULL, inode, &map, 0);
6032		if (ret < 0)
6033			break;
6034		if (ret > 0)
6035			numblks += ret;
6036		cur = cur + map.m_len;
6037	}
6038
6039	/*
6040	 * Count the number of extent tree blocks. We do this by looking up
6041	 * two successive extents and determining the difference between
6042	 * their paths. When the paths differ for two successive extents,
6043	 * we compare the blocks in the path at each level and increment
6044	 * iblocks by the total number of differences found.
6045	 */
6046	cur = 0;
6047	ret = skip_hole(inode, &cur);
6048	if (ret < 0)
6049		goto out;
6050	path = ext4_find_extent(inode, cur, NULL, 0);
6051	if (IS_ERR(path))
6052		goto out;
6053	numblks += path->p_depth;
6054	ext4_ext_drop_refs(path);
6055	kfree(path);
6056	while (cur < end) {
6057		path = ext4_find_extent(inode, cur, NULL, 0);
6058		if (IS_ERR(path))
6059			break;
6060		ex = path[path->p_depth].p_ext;
6061		if (!ex) {
6062			ext4_ext_drop_refs(path);
6063			kfree(path);
6064			return 0;
6065		}
6066		cur = max(cur + 1, le32_to_cpu(ex->ee_block) +
6067					ext4_ext_get_actual_len(ex));
6068		ret = skip_hole(inode, &cur);
6069		if (ret < 0) {
6070			ext4_ext_drop_refs(path);
6071			kfree(path);
6072			break;
6073		}
6074		path2 = ext4_find_extent(inode, cur, NULL, 0);
6075		if (IS_ERR(path2)) {
6076			ext4_ext_drop_refs(path);
6077			kfree(path);
6078			break;
6079		}
6080		ex = path2[path2->p_depth].p_ext;
6081		for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
6082			cmp1 = cmp2 = 0;
6083			if (i <= path->p_depth)
6084				cmp1 = path[i].p_bh ?
6085					path[i].p_bh->b_blocknr : 0;
6086			if (i <= path2->p_depth)
6087				cmp2 = path2[i].p_bh ?
6088					path2[i].p_bh->b_blocknr : 0;
6089			if (cmp1 != cmp2 && cmp2 != 0)
6090				numblks++;
6091		}
6092		ext4_ext_drop_refs(path);
6093		ext4_ext_drop_refs(path2);
6094		kfree(path);
6095		kfree(path2);
6096	}
6097
6098out:
6099	inode->i_blocks = numblks << (inode->i_sb->s_blocksize_bits - 9);
6100	ext4_mark_inode_dirty(NULL, inode);
6101	return 0;
6102}
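
/*
 * Hedged worked example (not part of the original file): i_blocks is kept
 * in 512-byte sectors, so with a 4 KiB block size (s_blocksize_bits == 12)
 * each counted block contributes 1 << (12 - 9) == 8.  An inode with 12
 * counted blocks (data plus extent tree) ends up with i_blocks == 96.
 */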
6103
6104int ext4_ext_clear_bb(struct inode *inode)
6105{
6106	struct ext4_ext_path *path = NULL;
6107	struct ext4_extent *ex;
6108	ext4_lblk_t cur = 0, end;
6109	int j, ret = 0;
6110	struct ext4_map_blocks map;
6111
6112	/* Determine the size of the file first */
6113	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
6114					EXT4_EX_NOCACHE);
6115	if (IS_ERR(path))
6116		return PTR_ERR(path);
6117	ex = path[path->p_depth].p_ext;
6118	if (!ex) {
6119		ext4_ext_drop_refs(path);
6120		kfree(path);
6121		return 0;
6122	}
6123	end = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
6124	ext4_ext_drop_refs(path);
6125	kfree(path);
6126
6127	cur = 0;
6128	while (cur < end) {
6129		map.m_lblk = cur;
6130		map.m_len = end - cur;
6131		ret = ext4_map_blocks(NULL, inode, &map, 0);
6132		if (ret < 0)
6133			break;
6134		if (ret > 0) {
6135			path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
6136			if (!IS_ERR_OR_NULL(path)) {
6137				for (j = 0; j < path->p_depth; j++) {
6138
6139					ext4_mb_mark_bb(inode->i_sb,
6140							path[j].p_block, 1, 0);
6141					ext4_fc_record_regions(inode->i_sb, inode->i_ino,
6142							0, path[j].p_block, 1, 1);
6143				}
6144				ext4_ext_drop_refs(path);
6145				kfree(path);
6146			}
6147			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
6148			ext4_fc_record_regions(inode->i_sb, inode->i_ino,
6149					map.m_lblk, map.m_pblk, map.m_len, 1);
6150		}
6151		cur = cur + map.m_len;
6152	}
6153
6154	return 0;
6155}
6156