// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
 */

#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/iomap.h>
#include <linux/ktime.h>

#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "inode.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "log.h"
#include "super.h"
#include "trans.h"
#include "dir.h"
#include "util.h"
#include "aops.h"
#include "trace_gfs2.h"

/* This doesn't need to be that large as the maximum number of 64-bit
 * pointers in a 4k block is 512, so __u16 is fine for that. It saves
 * stack space to keep it small.
 */
struct metapath {
	struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
	__u16 mp_list[GFS2_MAX_META_HEIGHT];
	int mp_fheight; /* find_metapath height */
	int mp_aheight; /* actual height (lookup height) */
};
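
/*
 * Illustrative sketch (not part of the original file): a rough stack
 * footprint estimate for struct metapath, assuming GFS2_MAX_META_HEIGHT
 * is 10 (as defined in incore.h) and 64-bit pointers:
 *
 *   10 * sizeof(struct buffer_head *)  =  80 bytes
 *   10 * sizeof(__u16)                 =  20 bytes
 *    2 * sizeof(int)                   =   8 bytes
 *                                        ---------
 *                                        108 bytes
 *
 * Using __u16 instead of int for mp_list saves 20 bytes per on-stack
 * instance, which matters because several functions below keep one or
 * two metapaths on the stack at once.
 */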

static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);

/**
 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
 * @ip: the inode
 * @dibh: the dinode buffer
 * @block: the block number that was allocated
 * @page: The (optional) page. This is looked up if @page is NULL
 *
 * Returns: errno
 */

static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
			       u64 block, struct page *page)
{
	struct inode *inode = &ip->i_inode;
	int release = 0;

	if (!page || page->index) {
		page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
		if (!page)
			return -ENOMEM;
		release = 1;
	}

	if (!PageUptodate(page)) {
		void *kaddr = kmap(page);
		u64 dsize = i_size_read(inode);

		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
		kunmap(page);

		SetPageUptodate(page);
	}

	if (gfs2_is_jdata(ip)) {
		struct buffer_head *bh;

		if (!page_has_buffers(page))
			create_empty_buffers(page, BIT(inode->i_blkbits),
					     BIT(BH_Uptodate));

		bh = page_buffers(page);
		if (!buffer_mapped(bh))
			map_bh(bh, inode->i_sb, block);

		set_buffer_uptodate(bh);
		gfs2_trans_add_data(ip->i_gl, bh);
	} else {
		set_page_dirty(page);
		gfs2_ordered_add_inode(ip);
	}

	if (release) {
		unlock_page(page);
		put_page(page);
	}

	return 0;
}

/**
 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 * @ip: The GFS2 inode to unstuff
 * @page: The (optional) page. This is looked up if the @page is NULL
 *
 * This routine unstuffs a dinode and returns it to a "normal" state such
 * that the height can be grown in the traditional way.
 *
 * Returns: errno
 */

int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
{
	struct buffer_head *bh, *dibh;
	struct gfs2_dinode *di;
	u64 block = 0;
	int isdir = gfs2_is_dir(ip);
	int error;

	down_write(&ip->i_rw_mutex);

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	if (i_size_read(&ip->i_inode)) {
		/* Get a free block, fill it with the stuffed data,
		   and write it out to disk */

		unsigned int n = 1;
		error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
		if (error)
			goto out_brelse;
		if (isdir) {
			gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
			error = gfs2_dir_get_new_buffer(ip, block, &bh);
			if (error)
				goto out_brelse;
			gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
					      dibh, sizeof(struct gfs2_dinode));
			brelse(bh);
		} else {
			error = gfs2_unstuffer_page(ip, dibh, block, page);
			if (error)
				goto out_brelse;
		}
	}

	/*  Set up the pointer to the new block  */

	gfs2_trans_add_meta(ip->i_gl, dibh);
	di = (struct gfs2_dinode *)dibh->b_data;
	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));

	if (i_size_read(&ip->i_inode)) {
		*(__be64 *)(di + 1) = cpu_to_be64(block);
		gfs2_add_inode_blocks(&ip->i_inode, 1);
		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
	}

	ip->i_height = 1;
	di->di_height = cpu_to_be16(1);

out_brelse:
	brelse(dibh);
out:
	up_write(&ip->i_rw_mutex);
	return error;
}


/**
 * find_metapath - Find path through the metadata tree
 * @sdp: The superblock
 * @block: The disk block to look up
 * @mp: The metapath to return the result in
 * @height: The pre-calculated height of the metadata tree
 *
 *   This routine returns a struct metapath structure that defines a path
 *   through the metadata of inode "ip" to get to block "block".
 *
 *   Example:
 *   Given:  "ip" is a height 3 file, "offset" is 101342453, and this is a
 *   filesystem with a blocksize of 4096.
 *
 *   find_metapath() would return a struct metapath structure set to:
 *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 *
 *   That means that in order to get to the block containing the byte at
 *   offset 101342453, we would load the indirect block pointed to by pointer
 *   0 in the dinode.  We would then load the indirect block pointed to by
 *   pointer 48 in that indirect block.  We would then load the data block
 *   pointed to by pointer 165 in that indirect block.
 *
 *             ----------------------------------------
 *             | Dinode |                             |
 *             |        |                            4|
 *             |        |0 1 2 3 4 5                 9|
 *             |        |                            6|
 *             ----------------------------------------
 *                       |
 *                       |
 *                       V
 *             ----------------------------------------
 *             | Indirect Block                       |
 *             |                                     5|
 *             |            4 4 4 4 4 5 5            1|
 *             |0           5 6 7 8 9 0 1            2|
 *             ----------------------------------------
 *                                |
 *                                |
 *                                V
 *             ----------------------------------------
 *             | Indirect Block                       |
 *             |                         1 1 1 1 1   5|
 *             |                         6 6 6 6 6   1|
 *             |0                        3 4 5 6 7   2|
 *             ----------------------------------------
 *                                           |
 *                                           |
 *                                           V
 *             ----------------------------------------
 *             | Data block containing offset         |
 *             |            101342453                 |
 *             |                                      |
 *             |                                      |
 *             ----------------------------------------
 *
 */

static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
			  struct metapath *mp, unsigned int height)
{
	unsigned int i;

	mp->mp_fheight = height;
	for (i = height; i--;)
		mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
}
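
/*
 * Illustrative sketch (not part of the original file): the arithmetic
 * behind the doc-comment example above, using its simplified figure of
 * 512 pointers per indirect block:
 *
 *   lblock = 101342453 >> 12 = 24741      (byte offset to logical block)
 *
 * The loop then peels off one base-512 digit per height, lowest first:
 *
 *   i = 2:  24741 % 512 = 165  ->  mp_list[2] = 165,  block = 48
 *   i = 1:     48 % 512 =  48  ->  mp_list[1] =  48,  block = 0
 *   i = 0:      0 % 512 =   0  ->  mp_list[0] =   0
 *
 * In other words, mp_list[] is simply @block written in base sd_inptrs,
 * most significant digit first.
 */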

static inline unsigned int metapath_branch_start(const struct metapath *mp)
{
	if (mp->mp_list[0] == 0)
		return 2;
	return 1;
}
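
/*
 * Illustrative note (not part of the original file, a reading of the code
 * below): when the tree height grows, the old tree top is re-attached as
 * pointer 0 of the new top (see ALLOC_GROW_HEIGHT in gfs2_iomap_alloc).
 * If the target block's own path also runs through pointer 0
 * (mp_list[0] == 0), the new branch can only diverge from the existing
 * metadata at height 2 or below; otherwise it already diverges at
 * height 1.  That is the reason for the 2-vs-1 result here.
 */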

/**
 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 */
static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
{
	struct buffer_head *bh = mp->mp_bh[height];
	if (height == 0)
		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
}

/**
 * metapointer - Return pointer to start of metadata in a buffer
 * @height: The metadata height (0 = dinode)
 * @mp: The metapath
 *
 * Return a pointer to the block number of the next height of the metadata
 * tree given a buffer containing the pointer to the current height of the
 * metadata tree.
 */

static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
{
	__be64 *p = metaptr1(height, mp);
	return p + mp->mp_list[height];
}

static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
{
	const struct buffer_head *bh = mp->mp_bh[height];
	return (const __be64 *)(bh->b_data + bh->b_size);
}

static void clone_metapath(struct metapath *clone, struct metapath *mp)
{
	unsigned int hgt;

	*clone = *mp;
	for (hgt = 0; hgt < mp->mp_aheight; hgt++)
		get_bh(clone->mp_bh[hgt]);
}

static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
{
	const __be64 *t;

	for (t = start; t < end; t++) {
		struct buffer_head *rabh;

		if (!*t)
			continue;

		rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
		if (trylock_buffer(rabh)) {
			if (!buffer_uptodate(rabh)) {
				rabh->b_end_io = end_buffer_read_sync;
				submit_bh(REQ_OP_READ,
					  REQ_RAHEAD | REQ_META | REQ_PRIO,
					  rabh);
				continue;
			}
			unlock_buffer(rabh);
		}
		brelse(rabh);
	}
}

static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
			     unsigned int x, unsigned int h)
{
	for (; x < h; x++) {
		__be64 *ptr = metapointer(x, mp);
		u64 dblock = be64_to_cpu(*ptr);
		int ret;

		if (!dblock)
			break;
		ret = gfs2_meta_indirect_buffer(ip, x + 1, dblock, &mp->mp_bh[x + 1]);
		if (ret)
			return ret;
	}
	mp->mp_aheight = x + 1;
	return 0;
}

/**
 * lookup_metapath - Walk the metadata tree to a specific point
 * @ip: The inode
 * @mp: The metapath
 *
 * Assumes that the inode's buffer has already been looked up and
 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 * by find_metapath().
 *
 * If this function encounters part of the tree which has not been
 * allocated, it returns the current height of the tree at the point
 * at which it found the unallocated block. Blocks which are found are
 * added to the mp->mp_bh[] list.
 *
 * Returns: error
 */

static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
{
	return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
}

/**
 * fillup_metapath - fill up buffers for the metadata path to a specific height
 * @ip: The inode
 * @mp: The metapath
 * @h: The height to which it should be mapped
 *
 * Similar to lookup_metapath, but does lookups for a range of heights
 *
 * Returns: error or the number of buffers filled
 */

static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
{
	unsigned int x = 0;
	int ret;

	if (h) {
		/* find the first buffer we need to look up. */
		for (x = h - 1; x > 0; x--) {
			if (mp->mp_bh[x])
				break;
		}
	}
	ret = __fillup_metapath(ip, mp, x, h);
	if (ret)
		return ret;
	return mp->mp_aheight - x - 1;
}

static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
{
	sector_t factor = 1, block = 0;
	int hgt;

	for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
		if (hgt < mp->mp_aheight)
			block += mp->mp_list[hgt] * factor;
		factor *= sdp->sd_inptrs;
	}
	return block;
}
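
/*
 * Illustrative sketch (not part of the original file): metapath_to_block()
 * is the inverse of find_metapath().  Continuing the example above with
 * 512 pointers per block and mp_list = {0, 48, 165}:
 *
 *   block = 165 * 1  +  48 * 512  +  0 * 512 * 512  =  24741
 *
 * Heights at or above mp_aheight are skipped because the walk code has
 * folded their mp_list entries into its length bookkeeping and zeroed
 * them, so the result is the logical block of the current walk position.
 */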

static void release_metapath(struct metapath *mp)
{
	int i;

	for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
		if (mp->mp_bh[i] == NULL)
			break;
		brelse(mp->mp_bh[i]);
		mp->mp_bh[i] = NULL;
	}
}

/**
 * gfs2_extent_length - Returns length of an extent of blocks
 * @bh: The metadata block
 * @ptr: Current position in @bh
 * @limit: Max extent length to return
 * @eob: Set to 1 if we hit "end of block"
 *
 * Returns: The length of the extent (minimum of one block)
 */

static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
{
	const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
	const __be64 *first = ptr;
	u64 d = be64_to_cpu(*ptr);

	*eob = 0;
	do {
		ptr++;
		if (ptr >= end)
			break;
		d++;
	} while(be64_to_cpu(*ptr) == d);
	if (ptr >= end)
		*eob = 1;
	return ptr - first;
}
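
/*
 * Illustrative sketch (not part of the original file): given a buffer
 * whose pointers decode to the disk blocks
 *
 *   { 100, 101, 102, 200, ... }
 *
 * with @ptr aimed at the first of them, the loop keeps advancing while
 * each pointer is exactly one greater than the last, so it returns 3
 * (the physically contiguous extent 100..102) and leaves *eob == 0.
 * *eob is only set when the scan runs off the end of the buffer, which
 * tells the caller that the extent might continue in the next indirect
 * block.  Note that in this revision the @limit argument is not
 * consulted; callers such as gfs2_iomap_get clamp the result themselves
 * where needed.
 */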

enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };

/*
 * gfs2_metadata_walker - walk an indirect block
 * @mp: Metapath to indirect block
 * @ptrs: Number of pointers to look at
 *
 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 * indirect block to follow.
 */
typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
						   unsigned int ptrs);

/*
 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
 * @mp: Starting point of walk
 * @max_len: Maximum number of blocks to walk
 * @walker: Called during the walk
 *
 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 * past the end of metadata, and a negative error code otherwise.
 */

static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
		u64 max_len, gfs2_metadata_walker walker)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	u64 factor = 1;
	unsigned int hgt;
	int ret;

	/*
	 * The walk starts in the lowest allocated indirect block, which may be
	 * before the position indicated by @mp.  Adjust @max_len accordingly
	 * to avoid a short walk.
	 */
	for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
		max_len += mp->mp_list[hgt] * factor;
		mp->mp_list[hgt] = 0;
		factor *= sdp->sd_inptrs;
	}

	for (;;) {
		u16 start = mp->mp_list[hgt];
		enum walker_status status;
		unsigned int ptrs;
		u64 len;

		/* Walk indirect block. */
		ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
		len = ptrs * factor;
		if (len > max_len)
			ptrs = DIV_ROUND_UP_ULL(max_len, factor);
		status = walker(mp, ptrs);
		switch (status) {
		case WALK_STOP:
			return 1;
		case WALK_FOLLOW:
			BUG_ON(mp->mp_aheight == mp->mp_fheight);
			ptrs = mp->mp_list[hgt] - start;
			len = ptrs * factor;
			break;
		case WALK_CONTINUE:
			break;
		}
		if (len >= max_len)
			break;
		max_len -= len;
		if (status == WALK_FOLLOW)
			goto fill_up_metapath;

lower_metapath:
		/* Decrease height of metapath. */
		brelse(mp->mp_bh[hgt]);
		mp->mp_bh[hgt] = NULL;
		mp->mp_list[hgt] = 0;
		if (!hgt)
			break;
		hgt--;
		factor *= sdp->sd_inptrs;

		/* Advance in metadata tree. */
		(mp->mp_list[hgt])++;
		if (hgt) {
			if (mp->mp_list[hgt] >= sdp->sd_inptrs)
				goto lower_metapath;
		} else {
			if (mp->mp_list[hgt] >= sdp->sd_diptrs)
				break;
		}

fill_up_metapath:
		/* Increase height of metapath. */
		ret = fillup_metapath(ip, mp, ip->i_height - 1);
		if (ret < 0)
			return ret;
		hgt += ret;
		for (; ret; ret--)
			do_div(factor, sdp->sd_inptrs);
		mp->mp_aheight = hgt + 1;
	}
	return 0;
}

static enum walker_status gfs2_hole_walker(struct metapath *mp,
					   unsigned int ptrs)
{
	const __be64 *start, *ptr, *end;
	unsigned int hgt;

	hgt = mp->mp_aheight - 1;
	start = metapointer(hgt, mp);
	end = start + ptrs;

	for (ptr = start; ptr < end; ptr++) {
		if (*ptr) {
			mp->mp_list[hgt] += ptr - start;
			if (mp->mp_aheight == mp->mp_fheight)
				return WALK_STOP;
			return WALK_FOLLOW;
		}
	}
	return WALK_CONTINUE;
}

/**
 * gfs2_hole_size - figure out the size of a hole
 * @inode: The inode
 * @lblock: The logical starting block number
 * @len: How far to look (in blocks)
 * @mp: The metapath at lblock
 * @iomap: The iomap to store the hole size in
 *
 * This function modifies @mp.
 *
 * Returns: errno on error
 */
static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
			  struct metapath *mp, struct iomap *iomap)
{
	struct metapath clone;
	u64 hole_size;
	int ret;

	clone_metapath(&clone, mp);
	ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
	if (ret < 0)
		goto out;

	if (ret == 1)
		hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
	else
		hole_size = len;
	iomap->length = hole_size << inode->i_blkbits;
	ret = 0;

out:
	release_metapath(&clone);
	return ret;
}

static inline __be64 *gfs2_indirect_init(struct metapath *mp,
					 struct gfs2_glock *gl, unsigned int i,
					 unsigned offset, u64 bn)
{
	__be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
		       ((i > 1) ? sizeof(struct gfs2_meta_header) :
				 sizeof(struct gfs2_dinode)));
	BUG_ON(i < 1);
	BUG_ON(mp->mp_bh[i] != NULL);
	mp->mp_bh[i] = gfs2_meta_new(gl, bn);
	gfs2_trans_add_meta(gl, mp->mp_bh[i]);
	gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
	gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
	ptr += offset;
	*ptr = cpu_to_be64(bn);
	return ptr;
}

enum alloc_state {
	ALLOC_DATA = 0,
	ALLOC_GROW_DEPTH = 1,
	ALLOC_GROW_HEIGHT = 2,
	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
};
/**
 * gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap structure
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * This function is called after gfs2_iomap_get, which works out the
 * total number of blocks which we need via gfs2_alloc_size.
 *
 * We then do the actual allocation asking for an extent at a time (if
 * enough contiguous free blocks are available, there will only be one
 * allocation request per call) and use the state machine to initialise
 * the blocks in order.
 *
 * Right now, this function will allocate at most one indirect block
 * worth of data -- with a default block size of 4K, that's slightly
 * less than 2M.  If this limitation is ever removed to allow huge
 * allocations, we would probably still want to limit the iomap size we
 * return to avoid stalling other tasks during huge writes; the next
 * iomap iteration would then find the blocks already allocated.
 *
 * Returns: errno on error
 */

static int gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
			    struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *dibh = mp->mp_bh[0];
	u64 bn;
	unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
	size_t dblks = iomap->length >> inode->i_blkbits;
	const unsigned end_of_metadata = mp->mp_fheight - 1;
	int ret;
	enum alloc_state state;
	__be64 *ptr;
	__be64 zero_bn = 0;

	BUG_ON(mp->mp_aheight < 1);
	BUG_ON(dibh == NULL);
	BUG_ON(dblks < 1);

	gfs2_trans_add_meta(ip->i_gl, dibh);

	down_write(&ip->i_rw_mutex);

	if (mp->mp_fheight == mp->mp_aheight) {
		/* Bottom indirect block exists */
		state = ALLOC_DATA;
	} else {
		/* Need to allocate indirect blocks */
		if (mp->mp_fheight == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = mp->mp_fheight - mp->mp_aheight;
			state = ALLOC_GROW_DEPTH;
		} else {
			/* Building up tree height */
			state = ALLOC_GROW_HEIGHT;
			iblks = mp->mp_fheight - ip->i_height;
			branch_start = metapath_branch_start(mp);
			iblks += (mp->mp_fheight - branch_start);
		}
	}

	/* start of the second part of the function (state machine) */

	blks = dblks + iblks;
	i = mp->mp_aheight;
	do {
		n = blks - alloced;
		ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
		if (ret)
			goto out;
		alloced += n;
		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
			gfs2_trans_remove_revoke(sdp, bn, n);
		switch (state) {
		/* Growing height of tree */
		case ALLOC_GROW_HEIGHT:
			if (i == 1) {
				ptr = (__be64 *)(dibh->b_data +
						 sizeof(struct gfs2_dinode));
				zero_bn = *ptr;
			}
			for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
			     i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
			if (i - 1 == mp->mp_fheight - ip->i_height) {
				i--;
				gfs2_buffer_copy_tail(mp->mp_bh[i],
						sizeof(struct gfs2_meta_header),
						dibh, sizeof(struct gfs2_dinode));
				gfs2_buffer_clear_tail(dibh,
						sizeof(struct gfs2_dinode) +
						sizeof(__be64));
				ptr = (__be64 *)(mp->mp_bh[i]->b_data +
					sizeof(struct gfs2_meta_header));
				*ptr = zero_bn;
				state = ALLOC_GROW_DEPTH;
				for(i = branch_start; i < mp->mp_fheight; i++) {
					if (mp->mp_bh[i] == NULL)
						break;
					brelse(mp->mp_bh[i]);
					mp->mp_bh[i] = NULL;
				}
				i = branch_start;
			}
			if (n == 0)
				break;
			fallthrough;	/* To branching from existing tree */
		case ALLOC_GROW_DEPTH:
			if (i > 1 && i < mp->mp_fheight)
				gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
			for (; i < mp->mp_fheight && n > 0; i++, n--)
				gfs2_indirect_init(mp, ip->i_gl, i,
						   mp->mp_list[i-1], bn++);
			if (i == mp->mp_fheight)
				state = ALLOC_DATA;
			if (n == 0)
				break;
			fallthrough;	/* To tree complete, adding data blocks */
		case ALLOC_DATA:
			BUG_ON(n > dblks);
			BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
			gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
			dblks = n;
			ptr = metapointer(end_of_metadata, mp);
			iomap->addr = bn << inode->i_blkbits;
			iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
			while (n-- > 0)
				*ptr++ = cpu_to_be64(bn++);
			break;
		}
	} while (iomap->addr == IOMAP_NULL_ADDR);

	iomap->type = IOMAP_MAPPED;
	iomap->length = (u64)dblks << inode->i_blkbits;
	ip->i_height = mp->mp_fheight;
	gfs2_add_inode_blocks(&ip->i_inode, alloced);
	gfs2_dinode_out(ip, dibh->b_data);
out:
	up_write(&ip->i_rw_mutex);
	return ret;
}
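
/*
 * Illustrative sketch (not part of the original file): a write into a
 * hole past the current tree on a height-1 file that needs a height-3
 * tree would run the state machine roughly as follows:
 *
 *   ALLOC_GROW_HEIGHT:  allocate the new top-level indirect blocks and
 *                       move the old dinode pointers into the relocated
 *                       tree top, then fall through to
 *   ALLOC_GROW_DEPTH:   allocate the indirect blocks on the branch down
 *                       to fheight - 1, then fall through to
 *   ALLOC_DATA:         point the bottom indirect block at the newly
 *                       allocated data extent and publish it through
 *                       iomap->addr.
 *
 * Each gfs2_alloc_blocks() call may return a shorter extent than
 * requested, in which case the loop repeats in whatever state it left
 * off, until iomap->addr has been set.
 */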

#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE

/**
 * gfs2_alloc_size - Compute the maximum allocation size
 * @inode: The inode
 * @mp: The metapath
 * @size: Requested size in blocks
 *
 * Compute the maximum size of the next allocation at @mp.
 *
 * Returns: size in blocks
 */
static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	const __be64 *first, *ptr, *end;

	/*
	 * For writes to stuffed files, this function is called twice via
	 * gfs2_iomap_get, before and after unstuffing. The size we return the
	 * first time needs to be large enough to get the reservation and
	 * allocation sizes right.  The size we return the second time must
	 * be exact or else gfs2_iomap_alloc won't do the right thing.
	 */

	if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
		unsigned int maxsize = mp->mp_fheight > 1 ?
			sdp->sd_inptrs : sdp->sd_diptrs;
		maxsize -= mp->mp_list[mp->mp_fheight - 1];
		if (size > maxsize)
			size = maxsize;
		return size;
	}

	first = metapointer(ip->i_height - 1, mp);
	end = metaend(ip->i_height - 1, mp);
	if (end - first > size)
		end = first + size;
	for (ptr = first; ptr < end; ptr++) {
		if (*ptr)
			break;
	}
	return ptr - first;
}

/**
 * gfs2_iomap_get - Map blocks from an inode to disk blocks
 * @inode: The inode
 * @pos: Starting position in bytes
 * @length: Length to map, in bytes
 * @flags: iomap flags
 * @iomap: The iomap structure
 * @mp: The metapath
 *
 * Returns: errno
 */
static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
			  unsigned flags, struct iomap *iomap,
			  struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	loff_t size = i_size_read(inode);
	__be64 *ptr;
	sector_t lblock;
	sector_t lblock_stop;
	int ret;
	int eob;
	u64 len;
	struct buffer_head *dibh = NULL, *bh;
	u8 height;

	if (!length)
		return -EINVAL;

	down_read(&ip->i_rw_mutex);

	ret = gfs2_meta_inode_buffer(ip, &dibh);
	if (ret)
		goto unlock;
	mp->mp_bh[0] = dibh;

	if (gfs2_is_stuffed(ip)) {
		if (flags & IOMAP_WRITE) {
			loff_t max_size = gfs2_max_stuffed_size(ip);

			if (pos + length > max_size)
				goto unstuff;
			iomap->length = max_size;
		} else {
			if (pos >= size) {
				if (flags & IOMAP_REPORT) {
					ret = -ENOENT;
					goto unlock;
				} else {
					iomap->offset = pos;
					iomap->length = length;
					goto hole_found;
				}
			}
			iomap->length = size;
		}
		iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
			      sizeof(struct gfs2_dinode);
		iomap->type = IOMAP_INLINE;
		iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
		goto out;
	}

unstuff:
	lblock = pos >> inode->i_blkbits;
	iomap->offset = lblock << inode->i_blkbits;
	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
	len = lblock_stop - lblock + 1;
	iomap->length = len << inode->i_blkbits;

	height = ip->i_height;
	while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
		height++;
	find_metapath(sdp, lblock, mp, height);
	if (height > ip->i_height || gfs2_is_stuffed(ip))
		goto do_alloc;

	ret = lookup_metapath(ip, mp);
	if (ret)
		goto unlock;

	if (mp->mp_aheight != ip->i_height)
		goto do_alloc;

	ptr = metapointer(ip->i_height - 1, mp);
	if (*ptr == 0)
		goto do_alloc;

	bh = mp->mp_bh[ip->i_height - 1];
	len = gfs2_extent_length(bh, ptr, len, &eob);

	iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
	iomap->length = len << inode->i_blkbits;
	iomap->type = IOMAP_MAPPED;
	iomap->flags |= IOMAP_F_MERGED;
	if (eob)
		iomap->flags |= IOMAP_F_GFS2_BOUNDARY;

out:
	iomap->bdev = inode->i_sb->s_bdev;
unlock:
	up_read(&ip->i_rw_mutex);
	return ret;

do_alloc:
	if (flags & IOMAP_REPORT) {
		if (pos >= size)
			ret = -ENOENT;
		else if (height == ip->i_height)
			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
		else
			iomap->length = size - iomap->offset;
	} else if (flags & IOMAP_WRITE) {
		u64 alloc_size;

		if (flags & IOMAP_DIRECT)
			goto out;  /* (see gfs2_file_direct_write) */

		len = gfs2_alloc_size(inode, mp, len);
		alloc_size = len << inode->i_blkbits;
		if (alloc_size < iomap->length)
			iomap->length = alloc_size;
	} else {
		if (pos < size && height == ip->i_height)
			ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
	}
hole_found:
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	goto out;
}

/**
 * gfs2_lblk_to_dblk - convert logical block to disk block
 * @inode: the inode of the file we're mapping
 * @lblock: the block relative to the start of the file
 * @dblock: the returned dblock, if no error
 *
 * This function maps a single block from a file logical block (relative to
 * the start of the file) to a file system absolute block using iomap.
 *
 * Returns: the absolute file system block, or an error
 */
int gfs2_lblk_to_dblk(struct inode *inode, u32 lblock, u64 *dblock)
{
	struct iomap iomap = { };
	struct metapath mp = { .mp_aheight = 1, };
	loff_t pos = (loff_t)lblock << inode->i_blkbits;
	int ret;

	ret = gfs2_iomap_get(inode, pos, i_blocksize(inode), 0, &iomap, &mp);
	release_metapath(&mp);
	if (ret == 0)
		*dblock = iomap.addr >> inode->i_blkbits;

	return ret;
}
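
/*
 * Illustrative sketch (not part of the original file): a hedged example
 * of how a caller might use gfs2_lblk_to_dblk().  The function name and
 * the pr_info reporting are assumptions for illustration only.
 */
static inline void example_print_dblk(struct inode *inode)
{
	u64 dblock = 0;
	int ret = gfs2_lblk_to_dblk(inode, 0, &dblock);

	if (ret)
		pr_info("block 0 not mapped (err %d)\n", ret);
	else
		pr_info("logical block 0 -> disk block %llu\n",
			(unsigned long long)dblock);
}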

static int gfs2_write_lock(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	int error;

	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
	error = gfs2_glock_nq(&ip->i_gh);
	if (error)
		goto out_uninit;
	if (&ip->i_inode == sdp->sd_rindex) {
		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);

		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
					   GL_NOCACHE, &m_ip->i_gh);
		if (error)
			goto out_unlock;
	}
	return 0;

out_unlock:
	gfs2_glock_dq(&ip->i_gh);
out_uninit:
	gfs2_holder_uninit(&ip->i_gh);
	return error;
}

static void gfs2_write_unlock(struct inode *inode)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	if (&ip->i_inode == sdp->sd_rindex) {
		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);

		gfs2_glock_dq_uninit(&m_ip->i_gh);
	}
	gfs2_glock_dq_uninit(&ip->i_gh);
}

static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
				   unsigned len, struct iomap *iomap)
{
	unsigned int blockmask = i_blocksize(inode) - 1;
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int blocks;

	blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
	return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
}

static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
				 unsigned copied, struct page *page,
				 struct iomap *iomap)
{
	struct gfs2_trans *tr = current->journal_info;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	if (page && !gfs2_is_stuffed(ip))
		gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);

	if (tr->tr_num_buf_new)
		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

	gfs2_trans_end(sdp);
}

static const struct iomap_page_ops gfs2_iomap_page_ops = {
	.page_prepare = gfs2_iomap_page_prepare,
	.page_done = gfs2_iomap_page_done,
};

static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
				  loff_t length, unsigned flags,
				  struct iomap *iomap,
				  struct metapath *mp)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	bool unstuff;
	int ret;

	unstuff = gfs2_is_stuffed(ip) &&
		  pos + length > gfs2_max_stuffed_size(ip);

	if (unstuff || iomap->type == IOMAP_HOLE) {
		unsigned int data_blocks, ind_blocks;
		struct gfs2_alloc_parms ap = {};
		unsigned int rblocks;
		struct gfs2_trans *tr;

		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
				       &ind_blocks);
		ap.target = data_blocks + ind_blocks;
		ret = gfs2_quota_lock_check(ip, &ap);
		if (ret)
			return ret;

		ret = gfs2_inplace_reserve(ip, &ap);
		if (ret)
			goto out_qunlock;

		rblocks = RES_DINODE + ind_blocks;
		if (gfs2_is_jdata(ip))
			rblocks += data_blocks;
		if (ind_blocks || data_blocks)
			rblocks += RES_STATFS + RES_QUOTA;
		if (inode == sdp->sd_rindex)
			rblocks += 2 * RES_STATFS;
		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);

		ret = gfs2_trans_begin(sdp, rblocks,
				       iomap->length >> inode->i_blkbits);
		if (ret)
			goto out_trans_fail;

		if (unstuff) {
			ret = gfs2_unstuff_dinode(ip, NULL);
			if (ret)
				goto out_trans_end;
			release_metapath(mp);
			ret = gfs2_iomap_get(inode, iomap->offset,
					     iomap->length, flags, iomap, mp);
			if (ret)
				goto out_trans_end;
		}

		if (iomap->type == IOMAP_HOLE) {
			ret = gfs2_iomap_alloc(inode, iomap, mp);
			if (ret) {
				gfs2_trans_end(sdp);
				gfs2_inplace_release(ip);
				punch_hole(ip, iomap->offset, iomap->length);
				goto out_qunlock;
			}
		}

		tr = current->journal_info;
		if (tr->tr_num_buf_new)
			__mark_inode_dirty(inode, I_DIRTY_DATASYNC);

		gfs2_trans_end(sdp);
	}

	if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
		iomap->page_ops = &gfs2_iomap_page_ops;
	return 0;

out_trans_end:
	gfs2_trans_end(sdp);
out_trans_fail:
	gfs2_inplace_release(ip);
out_qunlock:
	gfs2_quota_unlock(ip);
	return ret;
}

static inline bool gfs2_iomap_need_write_lock(unsigned flags)
{
	return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
}

static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
			    unsigned flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct metapath mp = { .mp_aheight = 1, };
	int ret;

	if (gfs2_is_jdata(ip))
		iomap->flags |= IOMAP_F_BUFFER_HEAD;

	trace_gfs2_iomap_start(ip, pos, length, flags);
	if (gfs2_iomap_need_write_lock(flags)) {
		ret = gfs2_write_lock(inode);
		if (ret)
			goto out;
	}

	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
	if (ret)
		goto out_unlock;

	switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
	case IOMAP_WRITE:
		if (flags & IOMAP_DIRECT) {
			/*
			 * Silently fall back to buffered I/O for stuffed files
			 * or if we've got a hole (see gfs2_file_direct_write).
			 */
			if (iomap->type != IOMAP_MAPPED)
				ret = -ENOTBLK;
			goto out_unlock;
		}
		break;
	case IOMAP_ZERO:
		if (iomap->type == IOMAP_HOLE)
			goto out_unlock;
		break;
	default:
		goto out_unlock;
	}

	ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);

out_unlock:
	if (ret && gfs2_iomap_need_write_lock(flags))
		gfs2_write_unlock(inode);
	release_metapath(&mp);
out:
	trace_gfs2_iomap_end(ip, iomap, ret);
	return ret;
}

static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);

	switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
	case IOMAP_WRITE:
		if (flags & IOMAP_DIRECT)
			return 0;
		break;
	case IOMAP_ZERO:
		if (iomap->type == IOMAP_HOLE)
			return 0;
		break;
	default:
		return 0;
	}

	if (!gfs2_is_stuffed(ip))
		gfs2_ordered_add_inode(ip);

	if (inode == sdp->sd_rindex)
		adjust_fs_space(inode);

	gfs2_inplace_release(ip);

	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
		gfs2_quota_unlock(ip);

	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
		/* Deallocate blocks that were just allocated. */
		loff_t hstart = round_up(pos + written, i_blocksize(inode));
		loff_t hend = iomap->offset + iomap->length;

		if (hstart < hend) {
			truncate_pagecache_range(inode, hstart, hend - 1);
			punch_hole(ip, hstart, hend - hstart);
		}
	}

	if (unlikely(!written))
		goto out_unlock;

	if (iomap->flags & IOMAP_F_SIZE_CHANGED)
		mark_inode_dirty(inode);
	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);

out_unlock:
	if (gfs2_iomap_need_write_lock(flags))
		gfs2_write_unlock(inode);
	return 0;
}

const struct iomap_ops gfs2_iomap_ops = {
	.iomap_begin = gfs2_iomap_begin,
	.iomap_end = gfs2_iomap_end,
};
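
/*
 * Illustrative note (not part of the original file): these ops are what
 * the generic iomap layer calls back into.  A buffered write path in
 * fs/gfs2/file.c, for example, would hand them to the iomap core along
 * the lines of:
 *
 *	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
 *
 * so iomap_begin/iomap_end bracket each mapping the core works on.
 */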

/**
 * gfs2_block_map - Map one or more blocks of an inode to a disk block
 * @inode: The inode
 * @lblock: The logical block number
 * @bh_map: The bh to be mapped
 * @create: True if it's ok to allocate blocks to satisfy the request
 *
 * The size of the requested mapping is defined in bh_map->b_size.
 *
 * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
 * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
 * bh_map->b_size to indicate the size of the mapping when @lblock and
 * successive blocks are mapped, up to the requested size.
 *
 * Sets buffer_boundary() if a read of metadata will be required
 * before the next block can be mapped. Sets buffer_new() if new
 * blocks were allocated.
 *
 * Returns: errno
 */

int gfs2_block_map(struct inode *inode, sector_t lblock,
		   struct buffer_head *bh_map, int create)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	loff_t pos = (loff_t)lblock << inode->i_blkbits;
	loff_t length = bh_map->b_size;
	struct metapath mp = { .mp_aheight = 1, };
	struct iomap iomap = { };
	int flags = create ? IOMAP_WRITE : 0;
	int ret;

	clear_buffer_mapped(bh_map);
	clear_buffer_new(bh_map);
	clear_buffer_boundary(bh_map);
	trace_gfs2_bmap(ip, bh_map, lblock, create, 1);

	ret = gfs2_iomap_get(inode, pos, length, flags, &iomap, &mp);
	if (create && !ret && iomap.type == IOMAP_HOLE)
		ret = gfs2_iomap_alloc(inode, &iomap, &mp);
	release_metapath(&mp);
	if (ret)
		goto out;

	if (iomap.length > bh_map->b_size) {
		iomap.length = bh_map->b_size;
		iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
	}
	if (iomap.addr != IOMAP_NULL_ADDR)
		map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
	bh_map->b_size = iomap.length;
	if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
		set_buffer_boundary(bh_map);
	if (iomap.flags & IOMAP_F_NEW)
		set_buffer_new(bh_map);

out:
	trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
	return ret;
}
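
/*
 * Illustrative sketch (not part of the original file): a hedged example
 * of the calling convention, modelled on gfs2_extent_map() below.  The
 * function name is an assumption for illustration only.  The caller
 * communicates the wanted mapping size in bh.b_size and reads the
 * granted size back out of it.
 */
static inline int example_map_one_block(struct inode *inode, sector_t lblock,
					u64 *dblock)
{
	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
	int ret;

	bh.b_size = i_blocksize(inode);			/* ask for one block */
	ret = gfs2_block_map(inode, lblock, &bh, 0);	/* no allocation */
	if (ret == 0 && buffer_mapped(&bh))
		*dblock = bh.b_blocknr;
	return ret;
}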

/*
 * Deprecated: do not use in new code
 */
int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
{
	struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
	int ret;
	int create = *new;

	BUG_ON(!extlen);
	BUG_ON(!dblock);
	BUG_ON(!new);

	bh.b_size = BIT(inode->i_blkbits + (create ? 0 : 5));
	ret = gfs2_block_map(inode, lblock, &bh, create);
	*extlen = bh.b_size >> inode->i_blkbits;
	*dblock = bh.b_blocknr;
	if (buffer_new(&bh))
		*new = 1;
	else
		*new = 0;
	return ret;
}

/*
 * NOTE: Never call gfs2_block_zero_range with an open transaction because it
 * uses iomap write to perform its actions, which begin their own transactions
 * (iomap_begin, page_prepare, etc.)
 */
static int gfs2_block_zero_range(struct inode *inode, loff_t from,
				 unsigned int length)
{
	BUG_ON(current->journal_info);
	return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
}

#define GFS2_JTRUNC_REVOKES 8192

/**
 * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
 * @inode: The inode being truncated
 * @oldsize: The original (larger) size
 * @newsize: The new smaller size
 *
 * With jdata files, we have to journal a revoke for each block which is
 * truncated. As a result, we need to split this into separate transactions
 * if the number of pages being truncated gets too large.
 */

static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
{
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
	u64 chunk;
	int error;

	while (oldsize != newsize) {
		struct gfs2_trans *tr;
		unsigned int offs;

		chunk = oldsize - newsize;
		if (chunk > max_chunk)
			chunk = max_chunk;

		offs = oldsize & ~PAGE_MASK;
		if (offs && chunk > PAGE_SIZE)
			chunk = offs + ((chunk - offs) & PAGE_MASK);

		truncate_pagecache(inode, oldsize - chunk);
		oldsize -= chunk;

		tr = current->journal_info;
		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
			continue;

		gfs2_trans_end(sdp);
		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
		if (error)
			return error;
	}

	return 0;
}
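
/*
 * Illustrative sketch (not part of the original file): with a 4k block
 * size, the chunking above works out to
 *
 *   max_chunk = GFS2_JTRUNC_REVOKES * 4096 = 8192 * 4096 = 32 MiB
 *
 * per transaction, i.e. a 1 GiB jdata truncate is split into roughly 32
 * transactions of at most 8192 revokes each.  The PAGE_MASK adjustment
 * keeps every chunk after the first page-aligned, so truncate_pagecache()
 * is called on page boundaries.
 */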

static int trunc_start(struct inode *inode, u64 newsize)
{
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	struct buffer_head *dibh = NULL;
	int journaled = gfs2_is_jdata(ip);
	u64 oldsize = inode->i_size;
	int error;

	if (!gfs2_is_stuffed(ip)) {
		unsigned int blocksize = i_blocksize(inode);
		unsigned int offs = newsize & (blocksize - 1);
		if (offs) {
			error = gfs2_block_zero_range(inode, newsize,
						      blocksize - offs);
			if (error)
				return error;
		}
	}
	if (journaled)
		error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
	else
		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
	if (error)
		return error;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (error)
		goto out;

	gfs2_trans_add_meta(ip->i_gl, dibh);

	if (gfs2_is_stuffed(ip))
		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
	else
		ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;

	i_size_write(inode, newsize);
	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
	gfs2_dinode_out(ip, dibh->b_data);

	if (journaled)
		error = gfs2_journaled_truncate(inode, oldsize, newsize);
	else
		truncate_pagecache(inode, newsize);

out:
	brelse(dibh);
	if (current->journal_info)
		gfs2_trans_end(sdp);
	return error;
}

int gfs2_iomap_get_alloc(struct inode *inode, loff_t pos, loff_t length,
			 struct iomap *iomap)
{
	struct metapath mp = { .mp_aheight = 1, };
	int ret;

	ret = gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
	if (!ret && iomap->type == IOMAP_HOLE)
		ret = gfs2_iomap_alloc(inode, iomap, &mp);
	release_metapath(&mp);
	return ret;
}

/**
 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
 * @ip: inode
 * @rd_gh: holder of resource group glock
 * @bh: buffer head to sweep
 * @start: starting point in bh
 * @end: end point in bh
 * @meta: true if bh points to metadata (rather than data)
 * @btotal: place to keep count of total blocks freed
 *
 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
 * free, and free them all. However, we do it one rgrp at a time. If this
 * block has references to multiple rgrps, we break it into individual
 * transactions. This allows other processes to use the rgrps while we're
 * focused on a single one, for better concurrency / performance.
 * At every transaction boundary, we rewrite the inode into the journal.
 * That way the bitmaps are kept consistent with the inode and we can recover
 * if we're interrupted by power-outages.
 *
 * Returns: 0, or return code if an error occurred.
 *          *btotal has the total number of blocks freed
 */
static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
			      struct buffer_head *bh, __be64 *start, __be64 *end,
			      bool meta, u32 *btotal)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct gfs2_rgrpd *rgd;
	struct gfs2_trans *tr;
	__be64 *p;
	int blks_outside_rgrp;
	u64 bn, bstart, isize_blks;
	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
	int ret = 0;
	bool buf_in_tr = false; /* buffer was added to transaction */

more_rgrps:
	rgd = NULL;
	if (gfs2_holder_initialized(rd_gh)) {
		rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
		gfs2_assert_withdraw(sdp,
			     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
	}
	blks_outside_rgrp = 0;
	bstart = 0;
	blen = 0;

	for (p = start; p < end; p++) {
		if (!*p)
			continue;
		bn = be64_to_cpu(*p);

		if (rgd) {
			if (!rgrp_contains_block(rgd, bn)) {
				blks_outside_rgrp++;
				continue;
			}
		} else {
			rgd = gfs2_blk2rgrpd(sdp, bn, true);
			if (unlikely(!rgd)) {
				ret = -EIO;
				goto out;
			}
			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
						 0, rd_gh);
			if (ret)
				goto out;

			/* Must be done with the rgrp glock held: */
			if (gfs2_rs_active(&ip->i_res) &&
			    rgd == ip->i_res.rs_rbm.rgd)
				gfs2_rs_deltree(&ip->i_res);
		}

		/* The size of our transactions will be unknown until we
		   actually process all the metadata blocks that relate to
		   the rgrp. So we estimate. We know it can't be more than
		   the dinode's i_blocks and we don't want to exceed the
		   journal flush threshold, sd_log_thresh2. */
		if (current->journal_info == NULL) {
			unsigned int jblocks_rqsted, revokes;

			jblocks_rqsted = rgd->rd_length + RES_DINODE +
				RES_INDIRECT;
			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
				jblocks_rqsted +=
					atomic_read(&sdp->sd_log_thresh2);
			else
				jblocks_rqsted += isize_blks;
			revokes = jblocks_rqsted;
			if (meta)
				revokes += end - start;
			else if (ip->i_depth)
				revokes += sdp->sd_inptrs;
			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
			if (ret)
				goto out_unlock;
			down_write(&ip->i_rw_mutex);
		}
		/* check if we will exceed the transaction blocks requested */
		tr = current->journal_info;
		if (tr->tr_num_buf_new + RES_STATFS +
		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
			/* We set blks_outside_rgrp to ensure the loop will
			   be repeated for the same rgrp, but with a new
			   transaction. */
			blks_outside_rgrp++;
			/* This next part is tricky. If the buffer was added
			   to the transaction, we've already set some block
			   pointers to 0, so we better follow through and free
			   them, or we will introduce corruption (so break).
			   This may be impossible, or at least rare, but I
			   decided to cover the case regardless.

			   If the buffer was not added to the transaction
			   (this call), doing so would exceed our transaction
			   size, so we need to end the transaction and start a
			   new one (so goto). */

			if (buf_in_tr)
				break;
			goto out_unlock;
		}

		gfs2_trans_add_meta(ip->i_gl, bh);
		buf_in_tr = true;
		*p = 0;
		if (bstart + blen == bn) {
			blen++;
			continue;
		}
		if (bstart) {
			__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
			(*btotal) += blen;
			gfs2_add_inode_blocks(&ip->i_inode, -blen);
		}
		bstart = bn;
		blen = 1;
	}
	if (bstart) {
		__gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
		(*btotal) += blen;
		gfs2_add_inode_blocks(&ip->i_inode, -blen);
	}
out_unlock:
	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
					    outside the rgrp we just processed,
					    do it all over again. */
		if (current->journal_info) {
			struct buffer_head *dibh;

			ret = gfs2_meta_inode_buffer(ip, &dibh);
			if (ret)
				goto out;

			/* Every transaction boundary, we rewrite the dinode
			   to keep its di_blocks current in case of failure. */
			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
				current_time(&ip->i_inode);
			gfs2_trans_add_meta(ip->i_gl, dibh);
			gfs2_dinode_out(ip, dibh->b_data);
			brelse(dibh);
			up_write(&ip->i_rw_mutex);
			gfs2_trans_end(sdp);
			buf_in_tr = false;
		}
		gfs2_glock_dq_uninit(rd_gh);
		cond_resched();
		goto more_rgrps;
	}
out:
	return ret;
}

static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
{
	if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
		return false;
	return true;
}

/**
 * find_nonnull_ptr - find a non-null pointer given a metapath and height
 * @mp: starting metapath
 * @h: desired height to search
 *
 * Assumes the metapath is valid (with buffers) out to height h.
 * Returns: true if a non-null pointer was found in the metapath buffer
 *          false if all remaining pointers are NULL in the buffer
 */
static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
			     unsigned int h,
			     __u16 *end_list, unsigned int end_aligned)
{
	struct buffer_head *bh = mp->mp_bh[h];
	__be64 *first, *ptr, *end;

	first = metaptr1(h, mp);
	ptr = first + mp->mp_list[h];
	end = (__be64 *)(bh->b_data + bh->b_size);
	if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
		bool keep_end = h < end_aligned;
		end = first + end_list[h] + keep_end;
	}

	while (ptr < end) {
		if (*ptr) { /* if we have a non-null pointer */
			mp->mp_list[h] = ptr - first;
			h++;
			if (h < GFS2_MAX_META_HEIGHT)
				mp->mp_list[h] = 0;
			return true;
		}
		ptr++;
	}
	return false;
}
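
/*
 * Illustrative sketch (not part of the original file): if the pointers at
 * height h decode to { 0, 0, 731, ... } and mp_list[h] starts at 0, the
 * scan above stops at index 2, records mp_list[h] = 2, and resets the
 * next height's index to 0 so the caller restarts its descent at the
 * left edge of the newly chosen subtree.
 */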
1691
1692enum dealloc_states {
1693	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
1694	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
1695	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
1696	DEALLOC_DONE = 3,       /* process complete */
1697};
1698
1699static inline void
1700metapointer_range(struct metapath *mp, int height,
1701		  __u16 *start_list, unsigned int start_aligned,
1702		  __u16 *end_list, unsigned int end_aligned,
1703		  __be64 **start, __be64 **end)
1704{
1705	struct buffer_head *bh = mp->mp_bh[height];
1706	__be64 *first;
1707
1708	first = metaptr1(height, mp);
1709	*start = first;
1710	if (mp_eq_to_hgt(mp, start_list, height)) {
1711		bool keep_start = height < start_aligned;
1712		*start = first + start_list[height] + keep_start;
1713	}
1714	*end = (__be64 *)(bh->b_data + bh->b_size);
1715	if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1716		bool keep_end = height < end_aligned;
1717		*end = first + end_list[height] + keep_end;
1718	}
1719}
1720
1721static inline bool walk_done(struct gfs2_sbd *sdp,
1722			     struct metapath *mp, int height,
1723			     __u16 *end_list, unsigned int end_aligned)
1724{
1725	__u16 end;
1726
1727	if (end_list) {
1728		bool keep_end = height < end_aligned;
1729		if (!mp_eq_to_hgt(mp, end_list, height))
1730			return false;
1731		end = end_list[height] + keep_end;
1732	} else
1733		end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1734	return mp->mp_list[height] >= end;
1735}
1736
1737/**
1738 * punch_hole - deallocate blocks in a file
1739 * @ip: inode to truncate
1740 * @offset: the start of the hole
1741 * @length: the size of the hole (or 0 for truncate)
1742 *
1743 * Punch a hole into a file or truncate a file at a given position.  This
1744 * function operates in whole blocks (@offset and @length are rounded
1745 * accordingly); partially filled blocks must be cleared otherwise.
1746 *
1747 * This function works from the bottom up, and from the right to the left. In
1748 * other words, it strips off the highest layer (data) before stripping any of
1749 * the metadata. Doing it this way is best in case the operation is interrupted
1750 * by power failure, etc.  The dinode is rewritten in every transaction to
1751 * guarantee integrity.
1752 */
1753static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1754{
1755	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1756	u64 maxsize = sdp->sd_heightsize[ip->i_height];
1757	struct metapath mp = {};
1758	struct buffer_head *dibh, *bh;
1759	struct gfs2_holder rd_gh;
1760	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1761	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1762	__u16 start_list[GFS2_MAX_META_HEIGHT];
1763	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1764	unsigned int start_aligned, end_aligned;
1765	unsigned int strip_h = ip->i_height - 1;
1766	u32 btotal = 0;
1767	int ret, state;
1768	int mp_h; /* metapath buffers are read in to this height */
1769	u64 prev_bnr = 0;
1770	__be64 *start, *end;
1771
1772	if (offset >= maxsize) {
1773		/*
1774		 * The starting point lies beyond the allocated meta-data;
1775		 * there are no blocks do deallocate.
1776		 */
1777		return 0;
1778	}
1779
1780	/*
1781	 * The start position of the hole is defined by lblock, start_list, and
1782	 * start_aligned.  The end position of the hole is defined by lend,
1783	 * end_list, and end_aligned.
1784	 *
1785	 * start_aligned and end_aligned define down to which height the start
1786	 * and end positions are aligned to the metadata tree (i.e., the
1787	 * position is a multiple of the metadata granularity at the height
1788	 * above).  This determines at which heights additional meta pointers
1789	 * needs to be preserved for the remaining data.
1790	 */
1791
1792	if (length) {
1793		u64 end_offset = offset + length;
1794		u64 lend;
1795
1796		/*
1797		 * Clip the end at the maximum file size for the given height:
1798		 * that's how far the metadata goes; files bigger than that
1799		 * will have additional layers of indirection.
1800		 */
1801		if (end_offset > maxsize)
1802			end_offset = maxsize;
1803		lend = end_offset >> bsize_shift;
1804
1805		if (lblock >= lend)
1806			return 0;
1807
1808		find_metapath(sdp, lend, &mp, ip->i_height);
1809		end_list = __end_list;
1810		memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1811
1812		for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1813			if (end_list[mp_h])
1814				break;
1815		}
1816		end_aligned = mp_h;
1817	}
1818
1819	find_metapath(sdp, lblock, &mp, ip->i_height);
1820	memcpy(start_list, mp.mp_list, sizeof(start_list));
1821
1822	for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1823		if (start_list[mp_h])
1824			break;
1825	}
1826	start_aligned = mp_h;
1827
1828	ret = gfs2_meta_inode_buffer(ip, &dibh);
1829	if (ret)
1830		return ret;
1831
1832	mp.mp_bh[0] = dibh;
1833	ret = lookup_metapath(ip, &mp);
1834	if (ret)
1835		goto out_metapath;
1836
1837	/* issue read-ahead on metadata */
1838	for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1839		metapointer_range(&mp, mp_h, start_list, start_aligned,
1840				  end_list, end_aligned, &start, &end);
1841		gfs2_metapath_ra(ip->i_gl, start, end);
1842	}
1843
1844	if (mp.mp_aheight == ip->i_height)
1845		state = DEALLOC_MP_FULL; /* We have a complete metapath */
1846	else
1847		state = DEALLOC_FILL_MP; /* deal with partial metapath */
1848
1849	ret = gfs2_rindex_update(sdp);
1850	if (ret)
1851		goto out_metapath;
1852
1853	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1854	if (ret)
1855		goto out_metapath;
1856	gfs2_holder_mark_uninitialized(&rd_gh);
1857
1858	mp_h = strip_h;
1859
1860	while (state != DEALLOC_DONE) {
1861		switch (state) {
1862		/* Truncate a full metapath at the given strip height.
1863		 * Note that strip_h == mp_h in order to be in this state. */
1864		case DEALLOC_MP_FULL:
1865			bh = mp.mp_bh[mp_h];
1866			gfs2_assert_withdraw(sdp, bh);
1867			if (gfs2_assert_withdraw(sdp,
1868						 prev_bnr != bh->b_blocknr)) {
1869				fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u, "
1870					 "s_h:%u, mp_h:%u\n",
1871				       (unsigned long long)ip->i_no_addr,
1872				       prev_bnr, ip->i_height, strip_h, mp_h);
1873			}
1874			prev_bnr = bh->b_blocknr;
1875
1876			if (gfs2_metatype_check(sdp, bh,
1877						(mp_h ? GFS2_METATYPE_IN :
1878							GFS2_METATYPE_DI))) {
1879				ret = -EIO;
1880				goto out;
1881			}
1882
1883			/*
1884			 * Below, passing end_aligned as 0 gives us the
1885			 * metapointer range excluding the end point: the end
1886			 * point is the first metapath we must not deallocate!
1887			 */
1888
1889			metapointer_range(&mp, mp_h, start_list, start_aligned,
1890					  end_list, 0 /* end_aligned */,
1891					  &start, &end);
1892			ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1893						 start, end,
1894						 mp_h != ip->i_height - 1,
1895						 &btotal);
1896
1897			/* If we hit an error or have just swept the dinode
1898			   buffer, exit. */
1899			if (ret || !mp_h) {
1900				state = DEALLOC_DONE;
1901				break;
1902			}
1903			state = DEALLOC_MP_LOWER;
1904			break;
1905
1906		/* lower the metapath strip height */
1907		case DEALLOC_MP_LOWER:
1908			/* We're done with the current buffer, so release it,
1909			   unless it's the dinode buffer. Then back up to the
1910			   previous pointer. */
1911			if (mp_h) {
1912				brelse(mp.mp_bh[mp_h]);
1913				mp.mp_bh[mp_h] = NULL;
1914			}
1915			/* If we can't get any lower in height, we've stripped
1916			   off all we can. Next step is to back up and start
1917			   stripping the previous level of metadata. */
1918			if (mp_h == 0) {
1919				strip_h--;
1920				memcpy(mp.mp_list, start_list, sizeof(start_list));
1921				mp_h = strip_h;
1922				state = DEALLOC_FILL_MP;
1923				break;
1924			}
1925			mp.mp_list[mp_h] = 0;
1926			mp_h--; /* search one metadata height down */
1927			mp.mp_list[mp_h]++;
1928			if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1929				break;
1930			/* Here we've found a part of the metapath that is not
1931			 * allocated. We need to search at that height for the
1932			 * next non-null pointer. */
1933			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1934				state = DEALLOC_FILL_MP;
1935				mp_h++;
1936			}
1937			/* No more non-null pointers at this height. Back up
1938			   to the previous height and try again. */
1939			break; /* loop around in the same state */
1940
1941		/* Fill the metapath with buffers to the given height. */
1942		case DEALLOC_FILL_MP:
1943			/* Fill the buffers out to the current height. */
1944			ret = fillup_metapath(ip, &mp, mp_h);
1945			if (ret < 0)
1946				goto out;
1947
1948			/* On the first pass, issue read-ahead on metadata. */
1949			if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1950				unsigned int height = mp.mp_aheight - 1;
1951
1952				/* No read-ahead for data blocks. */
1953				if (mp.mp_aheight - 1 == strip_h)
1954					height--;
1955
1956				for (; height >= mp.mp_aheight - ret; height--) {
1957					metapointer_range(&mp, height,
1958							  start_list, start_aligned,
1959							  end_list, end_aligned,
1960							  &start, &end);
1961					gfs2_metapath_ra(ip->i_gl, start, end);
1962				}
1963			}
1964
1965			/* If buffers found for the entire strip height */
1966			if (mp.mp_aheight - 1 == strip_h) {
1967				state = DEALLOC_MP_FULL;
1968				break;
1969			}
1970			if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1971				mp_h = mp.mp_aheight - 1;
1972
1973			/* If we find a non-null block pointer, crawl a bit
1974			   higher up in the metapath and try again, otherwise
1975			   we need to look lower for a new starting point. */
1976			if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1977				mp_h++;
1978			else
1979				state = DEALLOC_MP_LOWER;
1980			break;
1981		}
1982	}
1983
1984	if (btotal) {
1985		if (current->journal_info == NULL) {
1986			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1987					       RES_QUOTA, 0);
1988			if (ret)
1989				goto out;
1990			down_write(&ip->i_rw_mutex);
1991		}
1992		gfs2_statfs_change(sdp, 0, +btotal, 0);
1993		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1994				  ip->i_inode.i_gid);
1995		ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1996		gfs2_trans_add_meta(ip->i_gl, dibh);
1997		gfs2_dinode_out(ip, dibh->b_data);
1998		up_write(&ip->i_rw_mutex);
1999		gfs2_trans_end(sdp);
2000	}
2001
2002out:
2003	if (gfs2_holder_initialized(&rd_gh))
2004		gfs2_glock_dq_uninit(&rd_gh);
2005	if (current->journal_info) {
2006		up_write(&ip->i_rw_mutex);
2007		gfs2_trans_end(sdp);
2008		cond_resched();
2009	}
2010	gfs2_quota_unhold(ip);
2011out_metapath:
2012	release_metapath(&mp);
2013	return ret;
2014}
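
/*
 * A compressed sketch of the state machine above (hypothetical helper, not
 * built): blocks are stripped one tree level at a time, bottom up, so that
 * no pointer block is freed before everything it points to has been freed.
 */
#if 0
static void punch_hole_order_sketch(int i_height)
{
	int strip_h;

	for (strip_h = i_height - 1; strip_h >= 0; strip_h--) {
		/*
		 * DEALLOC_FILL_MP:  read metadata buffers down to strip_h;
		 * DEALLOC_MP_FULL:  sweep and free the pointers at strip_h
		 *                   between the start and end metapaths;
		 * DEALLOC_MP_LOWER: step right to the next subtree, or drop
		 *                   to the next shallower strip height.
		 */
	}
}
#endif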
2015
2016static int trunc_end(struct gfs2_inode *ip)
2017{
2018	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2019	struct buffer_head *dibh;
2020	int error;
2021
2022	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2023	if (error)
2024		return error;
2025
2026	down_write(&ip->i_rw_mutex);
2027
2028	error = gfs2_meta_inode_buffer(ip, &dibh);
2029	if (error)
2030		goto out;
2031
2032	if (!i_size_read(&ip->i_inode)) {
2033		ip->i_height = 0;
2034		ip->i_goal = ip->i_no_addr;
2035		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
2036		gfs2_ordered_del_inode(ip);
2037	}
2038	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2039	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
2040
2041	gfs2_trans_add_meta(ip->i_gl, dibh);
2042	gfs2_dinode_out(ip, dibh->b_data);
2043	brelse(dibh);
2044
2045out:
2046	up_write(&ip->i_rw_mutex);
2047	gfs2_trans_end(sdp);
2048	return error;
2049}
2050
2051/**
2052 * do_shrink - make a file smaller
2053 * @inode: the inode
2054 * @newsize: the size to make the file
2055 *
2056 * Called with an exclusive lock on @inode. The @newsize must
2057 * be equal to or smaller than the current inode size.
2058 *
2059 * Returns: errno
2060 */
2061
2062static int do_shrink(struct inode *inode, u64 newsize)
2063{
2064	struct gfs2_inode *ip = GFS2_I(inode);
2065	int error;
2066
2067	error = trunc_start(inode, newsize);
2068	if (error < 0)
2069		return error;
2070	if (gfs2_is_stuffed(ip))
2071		return 0;
2072
2073	error = punch_hole(ip, newsize, 0);
2074	if (error == 0)
2075		error = trunc_end(ip);
2076
2077	return error;
2078}
2079
2080void gfs2_trim_blocks(struct inode *inode)
2081{
2082	int ret;
2083
2084	ret = do_shrink(inode, inode->i_size);
2085	WARN_ON(ret != 0);
2086}
2087
2088/**
2089 * do_grow - Touch and update inode size
2090 * @inode: The inode
2091 * @size: The new size
2092 *
2093 * This function updates the timestamps on the inode and
2094 * may also increase the size of the inode. It must not
2095 * be called with a @size smaller than the current
2096 * inode size.
2097 *
2098 * Although it is not strictly required to unstuff files here,
2099 * earlier versions of GFS2 have a bug in the stuffed file reading
2100 * code which will result in a buffer overrun if the size is larger
2101 * than the max stuffed file size. In order to prevent this from
2102 * occurring, such files are unstuffed, but in other cases we can
2103 * just update the inode size directly.
2104 *
2105 * Returns: 0 on success, or -ve on error
2106 */
2107
2108static int do_grow(struct inode *inode, u64 size)
2109{
2110	struct gfs2_inode *ip = GFS2_I(inode);
2111	struct gfs2_sbd *sdp = GFS2_SB(inode);
2112	struct gfs2_alloc_parms ap = { .target = 1, };
2113	struct buffer_head *dibh;
2114	int error;
2115	int unstuff = 0;
2116
2117	if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2118		error = gfs2_quota_lock_check(ip, &ap);
2119		if (error)
2120			return error;
2121
2122		error = gfs2_inplace_reserve(ip, &ap);
2123		if (error)
2124			goto do_grow_qunlock;
2125		unstuff = 1;
2126	}
2127
2128	error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2129				 (unstuff &&
2130				  gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2131				 (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2132				  0 : RES_QUOTA), 0);
2133	if (error)
2134		goto do_grow_release;
2135
2136	if (unstuff) {
2137		error = gfs2_unstuff_dinode(ip, NULL);
2138		if (error)
2139			goto do_end_trans;
2140	}
2141
2142	error = gfs2_meta_inode_buffer(ip, &dibh);
2143	if (error)
2144		goto do_end_trans;
2145
2146	truncate_setsize(inode, size);
2147	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2148	gfs2_trans_add_meta(ip->i_gl, dibh);
2149	gfs2_dinode_out(ip, dibh->b_data);
2150	brelse(dibh);
2151
2152do_end_trans:
2153	gfs2_trans_end(sdp);
2154do_grow_release:
2155	if (unstuff) {
2156		gfs2_inplace_release(ip);
2157do_grow_qunlock:
2158		gfs2_quota_unlock(ip);
2159	}
2160	return error;
2161}
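
/*
 * For example (hypothetical configuration): growing a stuffed jdata inode
 * past gfs2_max_stuffed_size() with quotas enabled makes the transaction
 * above reserve RES_DINODE + RES_STATFS + RES_RG_BIT + RES_JDATA +
 * RES_QUOTA journal blocks; a non-jdata grow that needs no unstuffing
 * drops the RES_JDATA term.
 */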
2162
2163/**
2164 * gfs2_setattr_size - make a file a given size
2165 * @inode: the inode
2166 * @newsize: the size to make the file
2167 *
2168 * The file size can grow, shrink, or stay the same size. This
2169 * is called holding i_rwsem and an exclusive glock on the inode
2170 * in question.
2171 *
2172 * Returns: errno
2173 */
2174
2175int gfs2_setattr_size(struct inode *inode, u64 newsize)
2176{
2177	struct gfs2_inode *ip = GFS2_I(inode);
2178	int ret;
2179
2180	BUG_ON(!S_ISREG(inode->i_mode));
2181
2182	ret = inode_newsize_ok(inode, newsize);
2183	if (ret)
2184		return ret;
2185
2186	inode_dio_wait(inode);
2187
2188	ret = gfs2_qa_get(ip);
2189	if (ret)
2190		goto out;
2191
2192	if (newsize >= inode->i_size) {
2193		ret = do_grow(inode, newsize);
2194		goto out;
2195	}
2196
2197	ret = do_shrink(inode, newsize);
2198out:
2199	gfs2_rs_delete(ip);
2200	gfs2_qa_put(ip);
2201	return ret;
2202}
2203
2204int gfs2_truncatei_resume(struct gfs2_inode *ip)
2205{
2206	int error;
2207	error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2208	if (!error)
2209		error = trunc_end(ip);
2210	return error;
2211}
2212
2213int gfs2_file_dealloc(struct gfs2_inode *ip)
2214{
2215	return punch_hole(ip, 0, 0);
2216}
2217
2218/**
2219 * gfs2_free_journal_extents - Free cached journal bmap info
2220 * @jd: The journal
2221 *
2222 */
2223
2224void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2225{
2226	struct gfs2_journal_extent *jext;
2227
2228	while (!list_empty(&jd->extent_list)) {
2229		jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2230		list_del(&jext->list);
2231		kfree(jext);
2232	}
2233}
2234
2235/**
2236 * gfs2_add_jextent - Add or merge a new extent to extent cache
2237 * @jd: The journal descriptor
2238 * @lblock: The logical block at start of new extent
2239 * @dblock: The physical block at start of new extent
2240 * @blocks: Size of extent in fs blocks
2241 *
2242 * Returns: 0 on success or -ENOMEM
2243 */
2244
2245static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2246{
2247	struct gfs2_journal_extent *jext;
2248
2249	if (!list_empty(&jd->extent_list)) {
2250		jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2251		if ((jext->dblock + jext->blocks) == dblock) {
2252			jext->blocks += blocks;
2253			return 0;
2254		}
2255	}
2256
2257	jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2258	if (jext == NULL)
2259		return -ENOMEM;
2260	jext->dblock = dblock;
2261	jext->lblock = lblock;
2262	jext->blocks = blocks;
2263	list_add_tail(&jext->list, &jd->extent_list);
2264	jd->nr_extents++;
2265	return 0;
2266}
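
/*
 * Illustrative only (hypothetical numbers, not built): physically
 * contiguous mappings coalesce into a single cached extent.
 */
#if 0
static void jextent_merge_demo(struct gfs2_jdesc *jd)
{
	gfs2_add_jextent(jd, 0, 1000, 8);  /* new extent: dblocks 1000..1007 */
	gfs2_add_jextent(jd, 8, 1008, 4);  /* 1000 + 8 == 1008: merged, 12 blocks */
	gfs2_add_jextent(jd, 12, 2000, 4); /* discontiguous: second extent */
}
#endif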
2267
2268/**
2269 * gfs2_map_journal_extents - Cache journal bmap info
2270 * @sdp: The super block
2271 * @jd: The journal to map
2272 *
2273 * Create a reusable "extent" mapping from all logical
2274 * blocks to all physical blocks for the given journal.  This will save
2275 * us time when writing journal blocks.  Most journals will have only one
2276 * extent that maps all their logical blocks.  That's because mkfs.gfs2
2277 * arranges the journal blocks sequentially to maximize performance.
2278 * So a single extent usually maps the entire length of the file.
2279 * However, gfs2_jadd can run while other file activity is going on, so
2280 * those journals may not be sequential.  Less likely is the case where
2281 * the users created their own journals by mounting the metafs and
2282 * laying it out.  But it's still possible.  These journals might have
2283 * several extents.
2284 *
2285 * Returns: 0 on success, or error on failure
2286 */
2287
2288int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2289{
2290	u64 lblock = 0;
2291	u64 lblock_stop;
2292	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2293	struct buffer_head bh;
2294	unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2295	u64 size;
2296	int rc;
2297	ktime_t start, end;
2298
2299	start = ktime_get();
2300	lblock_stop = i_size_read(jd->jd_inode) >> shift;
2301	size = (lblock_stop - lblock) << shift;
2302	jd->nr_extents = 0;
2303	WARN_ON(!list_empty(&jd->extent_list));
2304
2305	do {
2306		bh.b_state = 0;
2307		bh.b_blocknr = 0;
2308		bh.b_size = size;
2309		rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2310		if (rc || !buffer_mapped(&bh))
2311			goto fail;
2312		rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2313		if (rc)
2314			goto fail;
2315		size -= bh.b_size;
2316		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2317	} while (size > 0);
2318
2319	end = ktime_get();
2320	fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2321		jd->nr_extents, ktime_ms_delta(end, start));
2322	return 0;
2323
2324fail:
2325	fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2326		rc, jd->jd_jid,
2327		(unsigned long long)(i_size_read(jd->jd_inode) - size),
2328		jd->nr_extents);
2329	fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2330		rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2331		bh.b_state, (unsigned long long)bh.b_size);
2332	gfs2_free_journal_extents(jd);
2333	return rc;
2334}
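
/*
 * For example (hypothetical numbers): for a perfectly sequential 128MB
 * journal on a 4k-block filesystem, each gfs2_block_map() call above
 * returns one physically contiguous run in bh.b_size, and
 * gfs2_add_jextent() merges adjacent runs, so the loop ends with a single
 * cached extent of 32768 blocks.
 */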
2335
2336/**
2337 * gfs2_write_alloc_required - figure out if a write will require an allocation
2338 * @ip: the file being written to
2339 * @offset: the offset to write to
2340 * @len: the number of bytes being written
2341 *
2342 * Returns: 1 if an alloc is required, 0 otherwise
2343 */
2344
2345int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2346			      unsigned int len)
2347{
2348	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2349	struct buffer_head bh;
2350	unsigned int shift;
2351	u64 lblock, lblock_stop, size;
2352	u64 end_of_file;
2353
2354	if (!len)
2355		return 0;
2356
2357	if (gfs2_is_stuffed(ip)) {
2358		if (offset + len > gfs2_max_stuffed_size(ip))
2359			return 1;
2360		return 0;
2361	}
2362
2363	shift = sdp->sd_sb.sb_bsize_shift;
2364	BUG_ON(gfs2_is_dir(ip));
2365	end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2366	lblock = offset >> shift;
2367	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2368	if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2369		return 1;
2370
2371	size = (lblock_stop - lblock) << shift;
2372	do {
2373		bh.b_state = 0;
2374		bh.b_size = size;
2375		gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2376		if (!buffer_mapped(&bh))
2377			return 1;
2378		size -= bh.b_size;
2379		lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2380	} while (size > 0);
2381
2382	return 0;
2383}
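
/*
 * Worked example (hypothetical numbers): on a 4k-block filesystem, writing
 * 3000 bytes at offset 5000 gives lblock = 5000 >> 12 = 1 and
 * lblock_stop = (8000 + 4095) >> 12 = 2, so only block 1 is checked; if
 * gfs2_block_map() reports it as mapped, no allocation is required.
 */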
2384
2385static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2386{
2387	struct gfs2_inode *ip = GFS2_I(inode);
2388	struct buffer_head *dibh;
2389	int error;
2390
2391	if (offset >= inode->i_size)
2392		return 0;
2393	if (offset + length > inode->i_size)
2394		length = inode->i_size - offset;
2395
2396	error = gfs2_meta_inode_buffer(ip, &dibh);
2397	if (error)
2398		return error;
2399	gfs2_trans_add_meta(ip->i_gl, dibh);
2400	memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2401	       length);
2402	brelse(dibh);
2403	return 0;
2404}
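
/*
 * A stuffed inode keeps its data in the dinode block directly after the
 * on-disk header, so the zeroing above can touch at most
 * block size - sizeof(struct gfs2_dinode) bytes (roughly 3.8k on a
 * 4k-block filesystem).
 */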
2405
2406static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2407					 loff_t length)
2408{
2409	struct gfs2_sbd *sdp = GFS2_SB(inode);
2410	loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2411	int error;
2412
2413	while (length) {
2414		struct gfs2_trans *tr;
2415		loff_t chunk;
2416		unsigned int offs;
2417
2418		chunk = length;
2419		if (chunk > max_chunk)
2420			chunk = max_chunk;
2421
2422		offs = offset & ~PAGE_MASK;
2423		if (offs && chunk > PAGE_SIZE)
2424			chunk = offs + ((chunk - offs) & PAGE_MASK);
2425
2426		truncate_pagecache_range(inode, offset, offset + chunk - 1);
2427		offset += chunk;
2428		length -= chunk;
2429
2430		tr = current->journal_info;
2431		if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2432			continue;
2433
2434		gfs2_trans_end(sdp);
2435		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2436		if (error)
2437			return error;
2438	}
2439	return 0;
2440}
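
/*
 * Worked example (hypothetical numbers, PAGE_SIZE == 4096): punching at
 * offset 6144 for 9000 bytes gives offs = 2048, so chunk is clamped from
 * 9000 to 2048 + (6952 & PAGE_MASK) = 6144.  The first pass then ends on
 * a page boundary (offset 12288) and the next pass starts page-aligned.
 */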
2441
2442int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2443{
2444	struct inode *inode = file_inode(file);
2445	struct gfs2_inode *ip = GFS2_I(inode);
2446	struct gfs2_sbd *sdp = GFS2_SB(inode);
2447	unsigned int blocksize = i_blocksize(inode);
2448	loff_t start, end;
2449	int error;
2450
2451	if (!gfs2_is_stuffed(ip)) {
2452		unsigned int start_off, end_len;
2453
2454		start_off = offset & (blocksize - 1);
2455		end_len = (offset + length) & (blocksize - 1);
2456		if (start_off) {
2457			unsigned int len = length;
2458			if (length > blocksize - start_off)
2459				len = blocksize - start_off;
2460			error = gfs2_block_zero_range(inode, offset, len);
2461			if (error)
2462				goto out;
2463			if (start_off + length < blocksize)
2464				end_len = 0;
2465		}
2466		if (end_len) {
2467			error = gfs2_block_zero_range(inode,
2468				offset + length - end_len, end_len);
2469			if (error)
2470				goto out;
2471		}
2472	}
2473
2474	start = round_down(offset, blocksize);
2475	end = round_up(offset + length, blocksize) - 1;
2476	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2477	if (error)
2478		return error;
2479
2480	if (gfs2_is_jdata(ip))
2481		error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2482					 GFS2_JTRUNC_REVOKES);
2483	else
2484		error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2485	if (error)
2486		return error;
2487
2488	if (gfs2_is_stuffed(ip)) {
2489		error = stuffed_zero_range(inode, offset, length);
2490		if (error)
2491			goto out;
2492	}
2493
2494	if (gfs2_is_jdata(ip)) {
2495		BUG_ON(!current->journal_info);
2496		gfs2_journaled_truncate_range(inode, offset, length);
2497	} else
2498		truncate_pagecache_range(inode, offset, offset + length - 1);
2499
2500	file_update_time(file);
2501	mark_inode_dirty(inode);
2502
2503	if (current->journal_info)
2504		gfs2_trans_end(sdp);
2505
2506	if (!gfs2_is_stuffed(ip))
2507		error = punch_hole(ip, offset, length);
2508
2509out:
2510	if (current->journal_info)
2511		gfs2_trans_end(sdp);
2512	return error;
2513}
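
/*
 * Worked example (hypothetical numbers, 4k blocks): punching 10000 bytes at
 * offset 1000 zeroes the tail of the first block (bytes 1000..4095) and the
 * head of the last block (bytes 8192..10999); punch_hole() then deallocates
 * the one fully covered block in between (bytes 4096..8191).
 */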
2514
2515static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2516		loff_t offset)
2517{
2518	struct metapath mp = { .mp_aheight = 1, };
2519	int ret;
2520
2521	if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2522		return -EIO;
2523
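	/* Reuse the previously cached mapping if offset still falls inside it. */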
2524	if (offset >= wpc->iomap.offset &&
2525	    offset < wpc->iomap.offset + wpc->iomap.length)
2526		return 0;
2527
2528	memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2529	ret = gfs2_iomap_get(inode, offset, INT_MAX, 0, &wpc->iomap, &mp);
2530	release_metapath(&mp);
2531	return ret;
2532}
2533
2534const struct iomap_writeback_ops gfs2_writeback_ops = {
2535	.map_blocks		= gfs2_map_blocks,
2536};
2537