xref: /kernel/linux/linux-5.10/fs/gfs2/rgrp.c (revision 8c2ecf20)
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
4 * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
5 */
6
7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8
9#include <linux/slab.h>
10#include <linux/spinlock.h>
11#include <linux/completion.h>
12#include <linux/buffer_head.h>
13#include <linux/fs.h>
14#include <linux/gfs2_ondisk.h>
15#include <linux/prefetch.h>
16#include <linux/blkdev.h>
17#include <linux/rbtree.h>
18#include <linux/random.h>
19
20#include "gfs2.h"
21#include "incore.h"
22#include "glock.h"
23#include "glops.h"
24#include "lops.h"
25#include "meta_io.h"
26#include "quota.h"
27#include "rgrp.h"
28#include "super.h"
29#include "trans.h"
30#include "util.h"
31#include "log.h"
32#include "inode.h"
33#include "trace_gfs2.h"
34#include "dir.h"
35
36#define BFITNOENT ((u32)~0)
37#define NO_BLOCK ((u64)~0)
38
39/*
40 * These routines are used by the resource group routines (rgrp.c)
41 * to keep track of block allocation.  Each block is represented by two
42 * bits.  So, each byte represents GFS2_NBBY (i.e. 4) blocks.
43 *
44 * 0 = Free
45 * 1 = Used (not metadata)
46 * 2 = Unlinked (still in use) inode
47 * 3 = Used (metadata)
48 */
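
/*
 * Worked example (illustrative): the bitmap byte 0x79 (binary 01111001)
 * encodes, from the lowest bit pair upwards:
 *   bits 1:0 = 01 -> GFS2_BLKST_USED
 *   bits 3:2 = 10 -> GFS2_BLKST_UNLINKED
 *   bits 5:4 = 11 -> GFS2_BLKST_DINODE (used metadata)
 *   bits 7:6 = 01 -> GFS2_BLKST_USED
 */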
49
50struct gfs2_extent {
51	struct gfs2_rbm rbm;
52	u32 len;
53};
54
55static const char valid_change[16] = {
56	        /* current */
57	/* n */ 0, 1, 1, 1,
58	/* e */ 1, 0, 0, 0,
59	/* w */ 0, 0, 0, 1,
60	        1, 0, 0, 0
61};
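
/*
 * The table above is indexed as valid_change[new_state * 4 + cur_state]
 * (see gfs2_setbit() below).  For example, freeing a used block
 * (new_state == GFS2_BLKST_FREE, cur_state == GFS2_BLKST_USED) looks up
 * entry 0 * 4 + 1 == 1 and is allowed, whereas "freeing" an already
 * free block looks up entry 0 == 0 and triggers the consistency error.
 */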
62
63static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
64			 const struct gfs2_inode *ip, bool nowrap);
65
66
67/**
68 * gfs2_setbit - Set a bit in the bitmaps
69 * @rbm: The position of the bit to set
70 * @do_clone: Also set the clone bitmap, if it exists
71 * @new_state: the new state of the block
72 *
73 */
74
75static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
76			       unsigned char new_state)
77{
78	unsigned char *byte1, *byte2, *end, cur_state;
79	struct gfs2_bitmap *bi = rbm_bi(rbm);
80	unsigned int buflen = bi->bi_bytes;
81	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
82
83	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
84	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
85
86	BUG_ON(byte1 >= end);
87
88	cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
89
90	if (unlikely(!valid_change[new_state * 4 + cur_state])) {
91		struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
92
93		fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
94			rbm->offset, cur_state, new_state);
95		fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
96			(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
97			(unsigned long long)bi->bi_bh->b_blocknr);
98		fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
99			bi->bi_offset, bi->bi_bytes,
100			(unsigned long long)gfs2_rbm_to_block(rbm));
101		dump_stack();
102		gfs2_consist_rgrpd(rbm->rgd);
103		return;
104	}
105	*byte1 ^= (cur_state ^ new_state) << bit;
106
107	if (do_clone && bi->bi_clone) {
108		byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
109		cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
110		*byte2 ^= (cur_state ^ new_state) << bit;
111	}
112}
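
/*
 * Offset arithmetic example (illustrative): for rbm->offset == 5,
 * gfs2_setbit() modifies byte 5 / GFS2_NBBY == 1 of the bitmap, at bit
 * position (5 % GFS2_NBBY) * GFS2_BIT_SIZE == 2, i.e. bits 3:2 of that
 * byte hold the state of the sixth block covered by this bitmap.
 */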
113
114/**
115 * gfs2_testbit - test a bit in the bitmaps
116 * @rbm: The bit to test
117 * @use_clone: If true, test the clone bitmap, not the official bitmap.
118 *
119 * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
120 * not the "real" bitmaps, to avoid allocating recently freed blocks.
121 *
122 * Returns: The two bit block state of the requested bit
123 */
124
125static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
126{
127	struct gfs2_bitmap *bi = rbm_bi(rbm);
128	const u8 *buffer;
129	const u8 *byte;
130	unsigned int bit;
131
132	if (use_clone && bi->bi_clone)
133		buffer = bi->bi_clone;
134	else
135		buffer = bi->bi_bh->b_data;
136	buffer += bi->bi_offset;
137	byte = buffer + (rbm->offset / GFS2_NBBY);
138	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
139
140	return (*byte >> bit) & GFS2_BIT_MASK;
141}
142
143/**
144 * gfs2_bit_search
145 * @ptr: Pointer to bitmap data
146 * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
147 * @state: The state we are searching for
148 *
149 * We xor the bitmap data with a pattern which is the bitwise opposite
150 * of what we are looking for; this gives rise to a pattern of ones
151 * wherever there is a match. Since we have two bits per entry, we
152 * take this pattern, shift it down by one place and then AND it with
153 * the original. All the even bit positions (0,2,4, etc) then represent
154 * successful matches, so we mask with 0x55555..... to remove the unwanted
155 * odd bit positions.
156 *
157 * This allows searching of a whole u64 at once (32 blocks) with a
158 * single test (on 64 bit arches).
159 */
160
161static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
162{
163	u64 tmp;
164	static const u64 search[] = {
165		[0] = 0xffffffffffffffffULL,
166		[1] = 0xaaaaaaaaaaaaaaaaULL,
167		[2] = 0x5555555555555555ULL,
168		[3] = 0x0000000000000000ULL,
169	};
170	tmp = le64_to_cpu(*ptr) ^ search[state];
171	tmp &= (tmp >> 1);
172	tmp &= mask;
173	return tmp;
174}
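
/*
 * Worked example (illustrative): searching the single byte 0x19
 * (entries, lowest pair first: 01 used, 10 unlinked, 01 used, 00 free)
 * for state 1 (used):
 *   tmp  = 0x19 ^ 0xaa (search[1])     = 0xb3 (11 wherever a pair matches)
 *   tmp &= tmp >> 1                    = 0x11
 *   tmp &= 0x55 (keep even positions)  = 0x11
 * Bits 0 and 4 are set, i.e. entries 0 and 2 are in the "used" state.
 */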
175
176/**
177 * rs_cmp - multi-block reservation range compare
178 * @blk: absolute file system block number of the new reservation
179 * @len: number of blocks in the new reservation
180 * @rs: existing reservation to compare against
181 *
182 * Returns: 1 if the block range is beyond the reach of the reservation
183 *         -1 if the block range is before the start of the reservation
184 *          0 if the block range overlaps with the reservation
185 */
186static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
187{
188	u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm);
189
190	if (blk >= startblk + rs->rs_free)
191		return 1;
192	if (blk + len - 1 < startblk)
193		return -1;
194	return 0;
195}
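
/*
 * Example (illustrative): for a reservation covering blocks 100..109
 * (startblk == 100, rs_free == 10):
 *   rs_cmp(110, 4, rs) ->  1  (entirely beyond the reservation)
 *   rs_cmp(90, 10, rs) -> -1  (ends at block 99, before it starts)
 *   rs_cmp(95, 10, rs) ->  0  (blocks 95..104 overlap it)
 */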
196
197/**
198 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
199 *       a block in a given allocation state.
200 * @buf: the buffer that holds the bitmaps
201 * @len: the length (in bytes) of the buffer
202 * @goal: start search at this block's bit-pair (within @buffer)
203 * @state: GFS2_BLKST_XXX the state of the block we're looking for.
204 *
205 * Scope of @goal and returned block number is only within this bitmap buffer,
206 * not the entire rgrp or filesystem.  @buf will be offset from the actual
207 * beginning of a bitmap block buffer, skipping any header structures, but
208 * headers are always a multiple of 64 bits long so that the buffer is
209 * always aligned to a 64 bit boundary.
210 *
211 * The size of the buffer is in bytes, but it is assumed that it is
212 * always ok to read a complete multiple of 64 bits at the end
213 * of the block in case the end is not aligned to a natural boundary.
214 *
215 * Returns: the block number (bitmap buffer scope) that was found
216 */
217
218static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
219		       u32 goal, u8 state)
220{
221	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
222	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
223	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
224	u64 tmp;
225	u64 mask = 0x5555555555555555ULL;
226	u32 bit;
227
228	/* Mask off bits we don't care about at the start of the search */
229	mask <<= spoint;
230	tmp = gfs2_bit_search(ptr, mask, state);
231	ptr++;
232	while(tmp == 0 && ptr < end) {
233		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
234		ptr++;
235	}
236	/* Mask off any bits which are more than len bytes from the start */
237	if (ptr == end && (len & (sizeof(u64) - 1)))
238		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
239	/* Didn't find anything, so return */
240	if (tmp == 0)
241		return BFITNOENT;
242	ptr--;
243	bit = __ffs64(tmp);
244	bit /= 2;	/* two bits per entry in the bitmap */
245	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
246}
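
/*
 * Pointer arithmetic example (illustrative): each u64 word of the
 * bitmap covers 32 blocks, so for goal == 70 gfs2_bitfit() starts at
 * word 70 >> 5 == 2 with spoint == (70 << 1) & 63 == 12, masking off
 * the six bit pairs of that word which lie below block 70.
 */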
247
248/**
249 * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
250 * @rbm: The rbm with rgd already set correctly
251 * @block: The block number (filesystem relative)
252 *
253 * This sets the bi and offset members of an rbm based on a
254 * resource group and a filesystem relative block number. The
255 * resource group must be set in the rbm on entry, the bi and
256 * offset members will be set by this function.
257 *
258 * Returns: 0 on success, or an error code
259 */
260
261static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
262{
263	if (!rgrp_contains_block(rbm->rgd, block))
264		return -E2BIG;
265	rbm->bii = 0;
266	rbm->offset = block - rbm->rgd->rd_data0;
267	/* Check if the block is within the first bitmap */
268	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
269		return 0;
270
271	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
272	rbm->offset += (sizeof(struct gfs2_rgrp) -
273			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
274	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
275	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
276	return 0;
277}
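
/*
 * Example (illustrative, assuming 4KiB blocks and the on-disk sizes
 * sizeof(struct gfs2_rgrp) == 128 and sizeof(struct gfs2_meta_header)
 * == 24): bitmap 0 covers (4096 - 128) * GFS2_NBBY == 15872 blocks and
 * each later bitmap covers sd_blocks_per_bitmap == (4096 - 24) * 4 ==
 * 16288 blocks.  Adding (128 - 24) * 4 == 416 to the offset above lets
 * the div/mod treat every bitmap as if it held the full 16288 blocks.
 */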
278
279/**
280 * gfs2_rbm_incr - increment an rbm structure
281 * @rbm: The rbm with rgd already set correctly
282 *
283 * This function takes an existing rbm structure and increments it to the next
284 * viable block offset.
285 *
286 * Returns: If incrementing the offset would cause the rbm to go past the
287 *          end of the rgrp, true is returned, otherwise false.
288 *
289 */
290
291static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
292{
293	if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
294		rbm->offset++;
295		return false;
296	}
297	if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
298		return true;
299
300	rbm->offset = 0;
301	rbm->bii++;
302	return false;
303}
304
305/**
306 * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
307 * @rbm: Position to search (value/result)
308 * @n_unaligned: Number of unaligned blocks to check
309 * @len: Decremented for each block found (terminate on zero)
310 *
311 * Returns: true if a non-free block is encountered
312 */
313
314static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
315{
316	u32 n;
317	u8 res;
318
319	for (n = 0; n < n_unaligned; n++) {
320		res = gfs2_testbit(rbm, true);
321		if (res != GFS2_BLKST_FREE)
322			return true;
323		(*len)--;
324		if (*len == 0)
325			return true;
326		if (gfs2_rbm_incr(rbm))
327			return true;
328	}
329
330	return false;
331}
332
333/**
334 * gfs2_free_extlen - Return extent length of free blocks
335 * @rrbm: Starting position
336 * @len: Max length to check
337 *
338 * Starting at the block specified by the rbm, see how many free blocks
339 * there are, not reading more than len blocks ahead. This can be done
340 * using memchr_inv when the blocks are byte aligned, but has to be done
341 * on a block by block basis in case of unaligned blocks. Also this
342 * on a block-by-block basis in case of unaligned blocks. Also this
343 * a resource group boundary)
344 *
345 * Returns: Number of free blocks in the extent
346 */
347
348static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
349{
350	struct gfs2_rbm rbm = *rrbm;
351	u32 n_unaligned = rbm.offset & 3;
352	u32 size = len;
353	u32 bytes;
354	u32 chunk_size;
355	u8 *ptr, *start, *end;
356	u64 block;
357	struct gfs2_bitmap *bi;
358
359	if (n_unaligned &&
360	    gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
361		goto out;
362
363	n_unaligned = len & 3;
364	/* Start is now byte aligned */
365	while (len > 3) {
366		bi = rbm_bi(&rbm);
367		start = bi->bi_bh->b_data;
368		if (bi->bi_clone)
369			start = bi->bi_clone;
370		start += bi->bi_offset;
371		end = start + bi->bi_bytes;
372		BUG_ON(rbm.offset & 3);
373		start += (rbm.offset / GFS2_NBBY);
374		bytes = min_t(u32, len / GFS2_NBBY, (end - start));
375		ptr = memchr_inv(start, 0, bytes);
376		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
377		chunk_size *= GFS2_NBBY;
378		BUG_ON(len < chunk_size);
379		len -= chunk_size;
380		block = gfs2_rbm_to_block(&rbm);
381		if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
382			n_unaligned = 0;
383			break;
384		}
385		if (ptr) {
386			n_unaligned = 3;
387			break;
388		}
389		n_unaligned = len & 3;
390	}
391
392	/* Deal with any bits left over at the end */
393	if (n_unaligned)
394		gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
395out:
396	return size - len;
397}
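
/*
 * Example (illustrative): starting at offset 6 with len == 20, the two
 * blocks up to the byte boundary (offsets 6 and 7) are tested one by
 * one, the next 16 blocks (4 whole bytes) are scanned with memchr_inv()
 * for a byte containing a non-free block, and the final 2 blocks are
 * again tested individually.
 */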
398
399/**
400 * gfs2_bitcount - count the number of bits in a certain state
401 * @rgd: the resource group descriptor
402 * @buffer: the buffer that holds the bitmaps
403 * @buflen: the length (in bytes) of the buffer
404 * @state: the state of the block we're looking for
405 *
406 * Returns: The number of bits
407 */
408
409static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
410			 unsigned int buflen, u8 state)
411{
412	const u8 *byte = buffer;
413	const u8 *end = buffer + buflen;
414	const u8 state1 = state << 2;
415	const u8 state2 = state << 4;
416	const u8 state3 = state << 6;
417	u32 count = 0;
418
419	for (; byte < end; byte++) {
420		if (((*byte) & 0x03) == state)
421			count++;
422		if (((*byte) & 0x0C) == state1)
423			count++;
424		if (((*byte) & 0x30) == state2)
425			count++;
426		if (((*byte) & 0xC0) == state3)
427			count++;
428	}
429
430	return count;
431}
432
433/**
434 * gfs2_rgrp_verify - Verify that a resource group is consistent
435 * @rgd: the rgrp
436 *
437 */
438
439void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
440{
441	struct gfs2_sbd *sdp = rgd->rd_sbd;
442	struct gfs2_bitmap *bi = NULL;
443	u32 length = rgd->rd_length;
444	u32 count[4], tmp;
445	int buf, x;
446
447	memset(count, 0, 4 * sizeof(u32));
448
449	/* Count # blocks in each of 4 possible allocation states */
450	for (buf = 0; buf < length; buf++) {
451		bi = rgd->rd_bits + buf;
452		for (x = 0; x < 4; x++)
453			count[x] += gfs2_bitcount(rgd,
454						  bi->bi_bh->b_data +
455						  bi->bi_offset,
456						  bi->bi_bytes, x);
457	}
458
459	if (count[0] != rgd->rd_free) {
460		gfs2_lm(sdp, "free data mismatch:  %u != %u\n",
461			count[0], rgd->rd_free);
462		gfs2_consist_rgrpd(rgd);
463		return;
464	}
465
466	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
467	if (count[1] != tmp) {
468		gfs2_lm(sdp, "used data mismatch:  %u != %u\n",
469			count[1], tmp);
470		gfs2_consist_rgrpd(rgd);
471		return;
472	}
473
474	if (count[2] + count[3] != rgd->rd_dinodes) {
475		gfs2_lm(sdp, "used metadata mismatch:  %u != %u\n",
476			count[2] + count[3], rgd->rd_dinodes);
477		gfs2_consist_rgrpd(rgd);
478		return;
479	}
480}
481
482/**
483 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
484 * @sdp: The GFS2 superblock
485 * @blk: The data block number
486 * @exact: True if this needs to be an exact match
487 *
488 * The @exact argument should be set to true by most callers. The exception
489 * is when we need to match blocks which are not represented by the rgrp
490 * bitmap, but which are part of the rgrp (i.e. padding blocks) which are
491 * there for alignment purposes. Another way of looking at it is that @exact
492 * matches only valid data/metadata blocks, but with @exact false, it will
493 * match any block within the extent of the rgrp.
494 *
495 * Returns: The resource group, or NULL if not found
496 */
497
498struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
499{
500	struct rb_node *n, *next;
501	struct gfs2_rgrpd *cur;
502
503	spin_lock(&sdp->sd_rindex_spin);
504	n = sdp->sd_rindex_tree.rb_node;
505	while (n) {
506		cur = rb_entry(n, struct gfs2_rgrpd, rd_node);
507		next = NULL;
508		if (blk < cur->rd_addr)
509			next = n->rb_left;
510		else if (blk >= cur->rd_data0 + cur->rd_data)
511			next = n->rb_right;
512		if (next == NULL) {
513			spin_unlock(&sdp->sd_rindex_spin);
514			if (exact) {
515				if (blk < cur->rd_addr)
516					return NULL;
517				if (blk >= cur->rd_data0 + cur->rd_data)
518					return NULL;
519			}
520			return cur;
521		}
522		n = next;
523	}
524	spin_unlock(&sdp->sd_rindex_spin);
525
526	return NULL;
527}
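
/*
 * Example (illustrative): if one rgrp's data ends at block 6009 and the
 * next rgrp header does not start until block 6016 (alignment padding
 * in between), gfs2_blk2rgrpd(sdp, 6012, true) returns NULL, while with
 * @exact false the walk returns the rgrp the search dead-ends on.
 */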
528
529/**
530 * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem
531 * @sdp: The GFS2 superblock
532 *
533 * Returns: The first rgrp in the filesystem
534 */
535
536struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp)
537{
538	const struct rb_node *n;
539	struct gfs2_rgrpd *rgd;
540
541	spin_lock(&sdp->sd_rindex_spin);
542	n = rb_first(&sdp->sd_rindex_tree);
543	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
544	spin_unlock(&sdp->sd_rindex_spin);
545
546	return rgd;
547}
548
549/**
550 * gfs2_rgrpd_get_next - get the next RG
551 * @rgd: the resource group descriptor
552 *
553 * Returns: The next rgrp
554 */
555
556struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
557{
558	struct gfs2_sbd *sdp = rgd->rd_sbd;
559	const struct rb_node *n;
560
561	spin_lock(&sdp->sd_rindex_spin);
562	n = rb_next(&rgd->rd_node);
563	if (n == NULL)
564		n = rb_first(&sdp->sd_rindex_tree);
565
566	if (unlikely(&rgd->rd_node == n)) {
567		spin_unlock(&sdp->sd_rindex_spin);
568		return NULL;
569	}
570	rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
571	spin_unlock(&sdp->sd_rindex_spin);
572	return rgd;
573}
574
575void check_and_update_goal(struct gfs2_inode *ip)
576{
577	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
578	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
579		ip->i_goal = ip->i_no_addr;
580}
581
582void gfs2_free_clones(struct gfs2_rgrpd *rgd)
583{
584	int x;
585
586	for (x = 0; x < rgd->rd_length; x++) {
587		struct gfs2_bitmap *bi = rgd->rd_bits + x;
588		kfree(bi->bi_clone);
589		bi->bi_clone = NULL;
590	}
591}
592
593static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs,
594		    const char *fs_id_buf)
595{
596	struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res);
597
598	gfs2_print_dbg(seq, "%s  B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf,
599		       (unsigned long long)ip->i_no_addr,
600		       (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm),
601		       rs->rs_rbm.offset, rs->rs_free);
602}
603
604/**
605 * __rs_deltree - remove a multi-block reservation from the rgd tree
606 * @rs: The reservation to remove
607 *
608 */
609static void __rs_deltree(struct gfs2_blkreserv *rs)
610{
611	struct gfs2_rgrpd *rgd;
612
613	if (!gfs2_rs_active(rs))
614		return;
615
616	rgd = rs->rs_rbm.rgd;
617	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
618	rb_erase(&rs->rs_node, &rgd->rd_rstree);
619	RB_CLEAR_NODE(&rs->rs_node);
620
621	if (rs->rs_free) {
622		u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
623				 rs->rs_free - 1;
624		struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
625		struct gfs2_bitmap *start, *last;
626
627		/* return reserved blocks to the rgrp */
628		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
629		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
630		/* The rgrp extent failure point is likely not to increase;
631		   it will only do so if the freed blocks are somehow
632		   contiguous with a span of free blocks that follows. Still,
633		   it will force the number to be recalculated later. */
634		rgd->rd_extfail_pt += rs->rs_free;
635		rs->rs_free = 0;
636		if (gfs2_rbm_from_block(&last_rbm, last_block))
637			return;
638		start = rbm_bi(&rs->rs_rbm);
639		last = rbm_bi(&last_rbm);
640		do
641			clear_bit(GBF_FULL, &start->bi_flags);
642		while (start++ != last);
643	}
644}
645
646/**
647 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
648 * @rs: The reservation to remove
649 *
650 */
651void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
652{
653	struct gfs2_rgrpd *rgd;
654
655	rgd = rs->rs_rbm.rgd;
656	if (rgd) {
657		spin_lock(&rgd->rd_rsspin);
658		__rs_deltree(rs);
659		BUG_ON(rs->rs_free);
660		spin_unlock(&rgd->rd_rsspin);
661	}
662}
663
664/**
665 * gfs2_rs_delete - delete a multi-block reservation
666 * @ip: The inode for this reservation
667 *
668 */
669void gfs2_rs_delete(struct gfs2_inode *ip)
670{
671	struct inode *inode = &ip->i_inode;
672
673	down_write(&ip->i_rw_mutex);
674	if (atomic_read(&inode->i_writecount) <= 1)
675		gfs2_rs_deltree(&ip->i_res);
676	up_write(&ip->i_rw_mutex);
677}
678
679/**
680 * return_all_reservations - return all reserved blocks back to the rgrp.
681 * @rgd: the rgrp that needs its space back
682 *
683 * We previously reserved a bunch of blocks for allocation. Now we need to
684 * give them back. This leaves the reservation structures intact, but removes
685 * all of their corresponding "no-fly zones".
686 */
687static void return_all_reservations(struct gfs2_rgrpd *rgd)
688{
689	struct rb_node *n;
690	struct gfs2_blkreserv *rs;
691
692	spin_lock(&rgd->rd_rsspin);
693	while ((n = rb_first(&rgd->rd_rstree))) {
694		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
695		__rs_deltree(rs);
696	}
697	spin_unlock(&rgd->rd_rsspin);
698}
699
700void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
701{
702	struct rb_node *n;
703	struct gfs2_rgrpd *rgd;
704	struct gfs2_glock *gl;
705
706	while ((n = rb_first(&sdp->sd_rindex_tree))) {
707		rgd = rb_entry(n, struct gfs2_rgrpd, rd_node);
708		gl = rgd->rd_gl;
709
710		rb_erase(n, &sdp->sd_rindex_tree);
711
712		if (gl) {
713			if (gl->gl_state != LM_ST_UNLOCKED) {
714				gfs2_glock_cb(gl, LM_ST_UNLOCKED);
715				flush_delayed_work(&gl->gl_work);
716			}
717			gfs2_rgrp_brelse(rgd);
718			glock_clear_object(gl, rgd);
719			gfs2_glock_put(gl);
720		}
721
722		gfs2_free_clones(rgd);
723		return_all_reservations(rgd);
724		kfree(rgd->rd_bits);
725		rgd->rd_bits = NULL;
726		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
727	}
728}
729
730/**
731 * compute_bitstructs - Compute the bitmap sizes
732 * @rgd: The resource group descriptor
733 *
734 * Calculates bitmap descriptors, one for each block that contains bitmap data
735 *
736 * Returns: errno
737 */
738
739static int compute_bitstructs(struct gfs2_rgrpd *rgd)
740{
741	struct gfs2_sbd *sdp = rgd->rd_sbd;
742	struct gfs2_bitmap *bi;
743	u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */
744	u32 bytes_left, bytes;
745	int x;
746
747	if (!length)
748		return -EINVAL;
749
750	rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS);
751	if (!rgd->rd_bits)
752		return -ENOMEM;
753
754	bytes_left = rgd->rd_bitbytes;
755
756	for (x = 0; x < length; x++) {
757		bi = rgd->rd_bits + x;
758
759		bi->bi_flags = 0;
760		/* small rgrp; bitmap stored completely in header block */
761		if (length == 1) {
762			bytes = bytes_left;
763			bi->bi_offset = sizeof(struct gfs2_rgrp);
764			bi->bi_start = 0;
765			bi->bi_bytes = bytes;
766			bi->bi_blocks = bytes * GFS2_NBBY;
767		/* header block */
768		} else if (x == 0) {
769			bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
770			bi->bi_offset = sizeof(struct gfs2_rgrp);
771			bi->bi_start = 0;
772			bi->bi_bytes = bytes;
773			bi->bi_blocks = bytes * GFS2_NBBY;
774		/* last block */
775		} else if (x + 1 == length) {
776			bytes = bytes_left;
777			bi->bi_offset = sizeof(struct gfs2_meta_header);
778			bi->bi_start = rgd->rd_bitbytes - bytes_left;
779			bi->bi_bytes = bytes;
780			bi->bi_blocks = bytes * GFS2_NBBY;
781		/* other blocks */
782		} else {
783			bytes = sdp->sd_sb.sb_bsize -
784				sizeof(struct gfs2_meta_header);
785			bi->bi_offset = sizeof(struct gfs2_meta_header);
786			bi->bi_start = rgd->rd_bitbytes - bytes_left;
787			bi->bi_bytes = bytes;
788			bi->bi_blocks = bytes * GFS2_NBBY;
789		}
790
791		bytes_left -= bytes;
792	}
793
794	if (bytes_left) {
795		gfs2_consist_rgrpd(rgd);
796		return -EIO;
797	}
798	bi = rgd->rd_bits + (length - 1);
799	if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
800		gfs2_lm(sdp,
801			"ri_addr = %llu\n"
802			"ri_length = %u\n"
803			"ri_data0 = %llu\n"
804			"ri_data = %u\n"
805			"ri_bitbytes = %u\n"
806			"start=%u len=%u offset=%u\n",
807			(unsigned long long)rgd->rd_addr,
808			rgd->rd_length,
809			(unsigned long long)rgd->rd_data0,
810			rgd->rd_data,
811			rgd->rd_bitbytes,
812			bi->bi_start, bi->bi_bytes, bi->bi_offset);
813		gfs2_consist_rgrpd(rgd);
814		return -EIO;
815	}
816
817	return 0;
818}
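
/*
 * Layout example (illustrative, 4KiB blocks, on-disk header sizes of
 * 128 and 24 bytes): with rd_bitbytes == 10000 and rd_length == 3,
 * bitmap 0 gets 4096 - 128 == 3968 bytes, bitmap 1 gets 4096 - 24 ==
 * 4072 bytes, and the last bitmap gets the remaining 1960 bytes, each
 * byte describing GFS2_NBBY == 4 blocks.
 */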
819
820/**
821 * gfs2_ri_total - Total up the file system space, according to the rindex.
822 * @sdp: the filesystem
823 *
824 */
825u64 gfs2_ri_total(struct gfs2_sbd *sdp)
826{
827	u64 total_data = 0;
828	struct inode *inode = sdp->sd_rindex;
829	struct gfs2_inode *ip = GFS2_I(inode);
830	char buf[sizeof(struct gfs2_rindex)];
831	int error, rgrps;
832
833	for (rgrps = 0;; rgrps++) {
834		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
835
836		if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode))
837			break;
838		error = gfs2_internal_read(ip, buf, &pos,
839					   sizeof(struct gfs2_rindex));
840		if (error != sizeof(struct gfs2_rindex))
841			break;
842		total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data);
843	}
844	return total_data;
845}
846
847static int rgd_insert(struct gfs2_rgrpd *rgd)
848{
849	struct gfs2_sbd *sdp = rgd->rd_sbd;
850	struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL;
851
852	/* Figure out where to put new node */
853	while (*newn) {
854		struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd,
855						  rd_node);
856
857		parent = *newn;
858		if (rgd->rd_addr < cur->rd_addr)
859			newn = &((*newn)->rb_left);
860		else if (rgd->rd_addr > cur->rd_addr)
861			newn = &((*newn)->rb_right);
862		else
863			return -EEXIST;
864	}
865
866	rb_link_node(&rgd->rd_node, parent, newn);
867	rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree);
868	sdp->sd_rgrps++;
869	return 0;
870}
871
872/**
873 * read_rindex_entry - Pull in a new resource index entry from the disk
874 * @ip: Pointer to the rindex inode
875 *
876 * Returns: 0 on success, > 0 on EOF, error code otherwise
877 */
878
879static int read_rindex_entry(struct gfs2_inode *ip)
880{
881	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
882	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
883	struct gfs2_rindex buf;
884	int error;
885	struct gfs2_rgrpd *rgd;
886
887	if (pos >= i_size_read(&ip->i_inode))
888		return 1;
889
890	error = gfs2_internal_read(ip, (char *)&buf, &pos,
891				   sizeof(struct gfs2_rindex));
892
893	if (error != sizeof(struct gfs2_rindex))
894		return (error == 0) ? 1 : error;
895
896	rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
897	error = -ENOMEM;
898	if (!rgd)
899		return error;
900
901	rgd->rd_sbd = sdp;
902	rgd->rd_addr = be64_to_cpu(buf.ri_addr);
903	rgd->rd_length = be32_to_cpu(buf.ri_length);
904	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
905	rgd->rd_data = be32_to_cpu(buf.ri_data);
906	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
907	spin_lock_init(&rgd->rd_rsspin);
908
909	error = gfs2_glock_get(sdp, rgd->rd_addr,
910			       &gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
911	if (error)
912		goto fail;
913
914	error = compute_bitstructs(rgd);
915	if (error)
916		goto fail_glock;
917
918	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
919	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
920	if (rgd->rd_data > sdp->sd_max_rg_data)
921		sdp->sd_max_rg_data = rgd->rd_data;
922	spin_lock(&sdp->sd_rindex_spin);
923	error = rgd_insert(rgd);
924	spin_unlock(&sdp->sd_rindex_spin);
925	if (!error) {
926		glock_set_object(rgd->rd_gl, rgd);
927		return 0;
928	}
929
930	error = 0; /* someone else read in the rgrp; free it and ignore it */
931fail_glock:
932	gfs2_glock_put(rgd->rd_gl);
933
934fail:
935	kfree(rgd->rd_bits);
936	rgd->rd_bits = NULL;
937	kmem_cache_free(gfs2_rgrpd_cachep, rgd);
938	return error;
939}
940
941/**
942 * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
943 * @sdp: the GFS2 superblock
944 *
945 * The purpose of this function is to select a subset of the resource groups
946 * and mark them as PREFERRED. We do it in such a way that each node prefers
947 * to use a unique set of rgrps to minimize glock contention.
948 */
949static void set_rgrp_preferences(struct gfs2_sbd *sdp)
950{
951	struct gfs2_rgrpd *rgd, *first;
952	int i;
953
954	/* Skip an initial number of rgrps, based on this node's journal ID.
955	   That should start each node out on its own set. */
956	rgd = gfs2_rgrpd_get_first(sdp);
957	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
958		rgd = gfs2_rgrpd_get_next(rgd);
959	first = rgd;
960
961	do {
962		rgd->rd_flags |= GFS2_RDF_PREFERRED;
963		for (i = 0; i < sdp->sd_journals; i++) {
964			rgd = gfs2_rgrpd_get_next(rgd);
965			if (!rgd || rgd == first)
966				break;
967		}
968	} while (rgd && rgd != first);
969}
970
971/**
972 * gfs2_ri_update - Pull in a new resource index from the disk
973 * @ip: pointer to the rindex inode
974 *
975 * Returns: 0 on successful update, error code otherwise
976 */
977
978static int gfs2_ri_update(struct gfs2_inode *ip)
979{
980	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
981	int error;
982
983	do {
984		error = read_rindex_entry(ip);
985	} while (error == 0);
986
987	if (error < 0)
988		return error;
989
990	if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) {
991		fs_err(sdp, "no resource groups found in the file system.\n");
992		return -ENOENT;
993	}
994	set_rgrp_preferences(sdp);
995
996	sdp->sd_rindex_uptodate = 1;
997	return 0;
998}
999
1000/**
1001 * gfs2_rindex_update - Update the rindex if required
1002 * @sdp: The GFS2 superblock
1003 *
1004 * We grab a lock on the rindex inode to make sure that it doesn't
1005 * change whilst we are performing an operation. We keep this lock
1006 * for quite long periods of time compared to other locks. This
1007 * doesn't matter, since it is shared and it is very, very rarely
1008 * accessed in the exclusive mode (i.e. only when expanding the filesystem).
1009 *
1010 * This makes sure that we're using the latest copy of the resource index
1011 * special file, which might have been updated if someone expanded the
1012 * filesystem (via gfs2_grow utility), which adds new resource groups.
1013 *
1014 * Returns: 0 on success, error code otherwise
1015 */
1016
1017int gfs2_rindex_update(struct gfs2_sbd *sdp)
1018{
1019	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
1020	struct gfs2_glock *gl = ip->i_gl;
1021	struct gfs2_holder ri_gh;
1022	int error = 0;
1023	int unlock_required = 0;
1024
1025	/* Read new copy from disk if we don't have the latest */
1026	if (!sdp->sd_rindex_uptodate) {
1027		if (!gfs2_glock_is_locked_by_me(gl)) {
1028			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
1029			if (error)
1030				return error;
1031			unlock_required = 1;
1032		}
1033		if (!sdp->sd_rindex_uptodate)
1034			error = gfs2_ri_update(ip);
1035		if (unlock_required)
1036			gfs2_glock_dq_uninit(&ri_gh);
1037	}
1038
1039	return error;
1040}
1041
1042static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
1043{
1044	const struct gfs2_rgrp *str = buf;
1045	u32 rg_flags;
1046
1047	rg_flags = be32_to_cpu(str->rg_flags);
1048	rg_flags &= ~GFS2_RDF_MASK;
1049	rgd->rd_flags &= GFS2_RDF_MASK;
1050	rgd->rd_flags |= rg_flags;
1051	rgd->rd_free = be32_to_cpu(str->rg_free);
1052	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
1053	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
1054	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
1055}
1056
1057static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
1058{
1059	const struct gfs2_rgrp *str = buf;
1060
1061	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
1062	rgl->rl_flags = str->rg_flags;
1063	rgl->rl_free = str->rg_free;
1064	rgl->rl_dinodes = str->rg_dinodes;
1065	rgl->rl_igeneration = str->rg_igeneration;
1066	rgl->__pad = 0UL;
1067}
1068
1069static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
1070{
1071	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
1072	struct gfs2_rgrp *str = buf;
1073	u32 crc;
1074
1075	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
1076	str->rg_free = cpu_to_be32(rgd->rd_free);
1077	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
1078	if (next == NULL)
1079		str->rg_skip = 0;
1080	else if (next->rd_addr > rgd->rd_addr)
1081		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
1082	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
1083	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
1084	str->rg_data = cpu_to_be32(rgd->rd_data);
1085	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
1086	str->rg_crc = 0;
1087	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
1088	str->rg_crc = cpu_to_be32(crc);
1089
1090	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
1091	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
1092}
1093
1094static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
1095{
1096	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
1097	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
1098	struct gfs2_sbd *sdp = rgd->rd_sbd;
1099	int valid = 1;
1100
1101	if (rgl->rl_flags != str->rg_flags) {
1102		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
1103			(unsigned long long)rgd->rd_addr,
1104		       be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
1105		valid = 0;
1106	}
1107	if (rgl->rl_free != str->rg_free) {
1108		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
1109			(unsigned long long)rgd->rd_addr,
1110			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
1111		valid = 0;
1112	}
1113	if (rgl->rl_dinodes != str->rg_dinodes) {
1114		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
1115			(unsigned long long)rgd->rd_addr,
1116			be32_to_cpu(rgl->rl_dinodes),
1117			be32_to_cpu(str->rg_dinodes));
1118		valid = 0;
1119	}
1120	if (rgl->rl_igeneration != str->rg_igeneration) {
1121		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
1122			(unsigned long long)rgd->rd_addr,
1123			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
1124			(unsigned long long)be64_to_cpu(str->rg_igeneration));
1125		valid = 0;
1126	}
1127	return valid;
1128}
1129
1130static u32 count_unlinked(struct gfs2_rgrpd *rgd)
1131{
1132	struct gfs2_bitmap *bi;
1133	const u32 length = rgd->rd_length;
1134	const u8 *buffer = NULL;
1135	u32 i, goal, count = 0;
1136
1137	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
1138		goal = 0;
1139		buffer = bi->bi_bh->b_data + bi->bi_offset;
1140		WARN_ON(!buffer_uptodate(bi->bi_bh));
1141		while (goal < bi->bi_blocks) {
1142			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
1143					   GFS2_BLKST_UNLINKED);
1144			if (goal == BFITNOENT)
1145				break;
1146			count++;
1147			goal++;
1148		}
1149	}
1150
1151	return count;
1152}
1153
1154
1155/**
1156 * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
1157 * @rgd: the struct gfs2_rgrpd describing the RG to read in
1158 *
1159 * Read in all of a Resource Group's header and bitmap blocks.
1160 * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
1161 *
1162 * Returns: errno
1163 */
1164
1165static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1166{
1167	struct gfs2_sbd *sdp = rgd->rd_sbd;
1168	struct gfs2_glock *gl = rgd->rd_gl;
1169	unsigned int length = rgd->rd_length;
1170	struct gfs2_bitmap *bi;
1171	unsigned int x, y;
1172	int error;
1173
1174	if (rgd->rd_bits[0].bi_bh != NULL)
1175		return 0;
1176
1177	for (x = 0; x < length; x++) {
1178		bi = rgd->rd_bits + x;
1179		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
1180		if (error)
1181			goto fail;
1182	}
1183
1184	for (y = length; y--;) {
1185		bi = rgd->rd_bits + y;
1186		error = gfs2_meta_wait(sdp, bi->bi_bh);
1187		if (error)
1188			goto fail;
1189		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
1190					      GFS2_METATYPE_RG)) {
1191			error = -EIO;
1192			goto fail;
1193		}
1194	}
1195
1196	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
1197		for (x = 0; x < length; x++)
1198			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
1199		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
1200		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1201		rgd->rd_free_clone = rgd->rd_free;
1202		/* max out the rgrp allocation failure point */
1203		rgd->rd_extfail_pt = rgd->rd_free;
1204	}
1205	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1206		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1207		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1208				     rgd->rd_bits[0].bi_bh->b_data);
1209	}
1210	else if (sdp->sd_args.ar_rgrplvb) {
1211		if (!gfs2_rgrp_lvb_valid(rgd)){
1212			gfs2_consist_rgrpd(rgd);
1213			error = -EIO;
1214			goto fail;
1215		}
1216		if (rgd->rd_rgl->rl_unlinked == 0)
1217			rgd->rd_flags &= ~GFS2_RDF_CHECK;
1218	}
1219	return 0;
1220
1221fail:
1222	while (x--) {
1223		bi = rgd->rd_bits + x;
1224		brelse(bi->bi_bh);
1225		bi->bi_bh = NULL;
1226		gfs2_assert_warn(sdp, !bi->bi_clone);
1227	}
1228
1229	return error;
1230}
1231
1232static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1233{
1234	u32 rl_flags;
1235
1236	if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1237		return 0;
1238
1239	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1240		return gfs2_rgrp_bh_get(rgd);
1241
1242	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
1243	rl_flags &= ~GFS2_RDF_MASK;
1244	rgd->rd_flags &= GFS2_RDF_MASK;
1245	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
1246	if (rgd->rd_rgl->rl_unlinked == 0)
1247		rgd->rd_flags &= ~GFS2_RDF_CHECK;
1248	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
1249	rgd->rd_free_clone = rgd->rd_free;
1250	/* max out the rgrp allocation failure point */
1251	rgd->rd_extfail_pt = rgd->rd_free;
1252	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
1253	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
1254	return 0;
1255}
1256
1257int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
1258{
1259	struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
1260	struct gfs2_sbd *sdp = rgd->rd_sbd;
1261
1262	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
1263		return 0;
1264	return gfs2_rgrp_bh_get(rgd);
1265}
1266
1267/**
1268 * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
1269 * @rgd: The resource group
1270 *
1271 */
1272
1273void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
1274{
1275	int x, length = rgd->rd_length;
1276
1277	for (x = 0; x < length; x++) {
1278		struct gfs2_bitmap *bi = rgd->rd_bits + x;
1279		if (bi->bi_bh) {
1280			brelse(bi->bi_bh);
1281			bi->bi_bh = NULL;
1282		}
1283	}
1284}
1285
1286int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
1287			     struct buffer_head *bh,
1288			     const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
1289{
1290	struct super_block *sb = sdp->sd_vfs;
1291	u64 blk;
1292	sector_t start = 0;
1293	sector_t nr_blks = 0;
1294	int rv;
1295	unsigned int x;
1296	u32 trimmed = 0;
1297	u8 diff;
1298
1299	for (x = 0; x < bi->bi_bytes; x++) {
1300		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
1301		clone += bi->bi_offset;
1302		clone += x;
1303		if (bh) {
1304			const u8 *orig = bh->b_data + bi->bi_offset + x;
1305			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
1306		} else {
1307			diff = ~(*clone | (*clone >> 1));
1308		}
1309		diff &= 0x55;
1310		if (diff == 0)
1311			continue;
1312		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
1313		while(diff) {
1314			if (diff & 1) {
1315				if (nr_blks == 0)
1316					goto start_new_extent;
1317				if ((start + nr_blks) != blk) {
1318					if (nr_blks >= minlen) {
1319						rv = sb_issue_discard(sb,
1320							start, nr_blks,
1321							GFP_NOFS, 0);
1322						if (rv)
1323							goto fail;
1324						trimmed += nr_blks;
1325					}
1326					nr_blks = 0;
1327start_new_extent:
1328					start = blk;
1329				}
1330				nr_blks++;
1331			}
1332			diff >>= 2;
1333			blk++;
1334		}
1335	}
1336	if (nr_blks >= minlen) {
1337		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
1338		if (rv)
1339			goto fail;
1340		trimmed += nr_blks;
1341	}
1342	if (ptrimmed)
1343		*ptrimmed = trimmed;
1344	return 0;
1345
1346fail:
1347	if (sdp->sd_args.ar_discard)
1348		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
1349	sdp->sd_args.ar_discard = 0;
1350	return -EIO;
1351}
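
/*
 * Worked example (illustrative): if the bitmap byte *orig is 0x00 (all
 * four blocks free in the committed state) while the clone byte is 0x05
 * (blocks 0 and 1 still in use there), then
 *   diff = ~(0x00 | 0x00 >> 1) & (0x05 | 0x05 >> 1) & 0x55 == 0x05,
 * i.e. blocks 0 and 1 of this byte have just been freed and become
 * candidates for a discard request.
 */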
1352
1353/**
1354 * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
1355 * @filp: Any file on the filesystem
1356 * @argp: Pointer to the arguments (also used to pass result)
1357 *
1358 * Returns: 0 on success, otherwise error code
1359 */
1360
1361int gfs2_fitrim(struct file *filp, void __user *argp)
1362{
1363	struct inode *inode = file_inode(filp);
1364	struct gfs2_sbd *sdp = GFS2_SB(inode);
1365	struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
1366	struct buffer_head *bh;
1367	struct gfs2_rgrpd *rgd;
1368	struct gfs2_rgrpd *rgd_end;
1369	struct gfs2_holder gh;
1370	struct fstrim_range r;
1371	int ret = 0;
1372	u64 amt;
1373	u64 trimmed = 0;
1374	u64 start, end, minlen;
1375	unsigned int x;
1376	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
1377
1378	if (!capable(CAP_SYS_ADMIN))
1379		return -EPERM;
1380
1381	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
1382		return -EROFS;
1383
1384	if (!blk_queue_discard(q))
1385		return -EOPNOTSUPP;
1386
1387	if (copy_from_user(&r, argp, sizeof(r)))
1388		return -EFAULT;
1389
1390	ret = gfs2_rindex_update(sdp);
1391	if (ret)
1392		return ret;
1393
1394	start = r.start >> bs_shift;
1395	end = start + (r.len >> bs_shift);
1396	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
1397	minlen = max_t(u64, minlen,
1398		       q->limits.discard_granularity) >> bs_shift;
1399
1400	if (end <= start || minlen > sdp->sd_max_rg_data)
1401		return -EINVAL;
1402
1403	rgd = gfs2_blk2rgrpd(sdp, start, 0);
1404	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
1405
1406	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
1407	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
1408		return -EINVAL; /* start is beyond the end of the fs */
1409
1410	while (1) {
1411
1412		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
1413		if (ret)
1414			goto out;
1415
1416		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
1417			/* Trim each bitmap in the rgrp */
1418			for (x = 0; x < rgd->rd_length; x++) {
1419				struct gfs2_bitmap *bi = rgd->rd_bits + x;
1420				ret = gfs2_rgrp_send_discards(sdp,
1421						rgd->rd_data0, NULL, bi, minlen,
1422						&amt);
1423				if (ret) {
1424					gfs2_glock_dq_uninit(&gh);
1425					goto out;
1426				}
1427				trimmed += amt;
1428			}
1429
1430			/* Mark rgrp as having been trimmed */
1431			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
1432			if (ret == 0) {
1433				bh = rgd->rd_bits[0].bi_bh;
1434				rgd->rd_flags |= GFS2_RGF_TRIMMED;
1435				gfs2_trans_add_meta(rgd->rd_gl, bh);
1436				gfs2_rgrp_out(rgd, bh->b_data);
1437				gfs2_trans_end(sdp);
1438			}
1439		}
1440		gfs2_glock_dq_uninit(&gh);
1441
1442		if (rgd == rgd_end)
1443			break;
1444
1445		rgd = gfs2_rgrpd_get_next(rgd);
1446	}
1447
1448out:
1449	r.len = trimmed << bs_shift;
1450	if (copy_to_user(argp, &r, sizeof(r)))
1451		return -EFAULT;
1452
1453	return ret;
1454}
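
/*
 * Example (illustrative, 4KiB blocks, bs_shift == 12): an FITRIM request
 * with r.start == 1 MiB, r.len == 1 GiB and r.minlen == 0 maps to
 * filesystem blocks 256..262399, with minlen raised to at least one
 * block (or to the device's discard granularity, if that is larger).
 */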
1455
1456/**
1457 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1458 * @ip: the inode structure
1459 *
1460 */
1461static void rs_insert(struct gfs2_inode *ip)
1462{
1463	struct rb_node **newn, *parent = NULL;
1464	int rc;
1465	struct gfs2_blkreserv *rs = &ip->i_res;
1466	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
1467	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
1468
1469	BUG_ON(gfs2_rs_active(rs));
1470
1471	spin_lock(&rgd->rd_rsspin);
1472	newn = &rgd->rd_rstree.rb_node;
1473	while (*newn) {
1474		struct gfs2_blkreserv *cur =
1475			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1476
1477		parent = *newn;
1478		rc = rs_cmp(fsblock, rs->rs_free, cur);
1479		if (rc > 0)
1480			newn = &((*newn)->rb_right);
1481		else if (rc < 0)
1482			newn = &((*newn)->rb_left);
1483		else {
1484			spin_unlock(&rgd->rd_rsspin);
1485			WARN_ON(1);
1486			return;
1487		}
1488	}
1489
1490	rb_link_node(&rs->rs_node, parent, newn);
1491	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1492
1493	/* Do our rgrp accounting for the reservation */
1494	rgd->rd_reserved += rs->rs_free; /* blocks reserved */
1495	spin_unlock(&rgd->rd_rsspin);
1496	trace_gfs2_rs(rs, TRACE_RS_INSERT);
1497}
1498
1499/**
1500 * rgd_free - return the number of free blocks we can allocate.
1501 * @rgd: the resource group
 * @rs: our own block reservation, excluded from the reserved count
1502 *
1503 * This function returns the number of free blocks for an rgrp.
1504 * That's the clone-free blocks (blocks that are free, not including those
1505 * still being used for unlinked files that haven't been deleted.)
1506 *
1507 * It also subtracts any blocks reserved by someone else, but does not
1508 * include free blocks that are still part of our current reservation,
1509 * because obviously we can (and will) allocate them.
1510 */
1511static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs)
1512{
1513	u32 tot_reserved, tot_free;
1514
1515	if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free))
1516		return 0;
1517	tot_reserved = rgd->rd_reserved - rs->rs_free;
1518
1519	if (rgd->rd_free_clone < tot_reserved)
1520		tot_reserved = 0;
1521
1522	tot_free = rgd->rd_free_clone - tot_reserved;
1523
1524	return tot_free;
1525}
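
/*
 * Worked example (illustrative): with rd_free_clone == 1000,
 * rd_reserved == 300 and our own rs_free == 50, reservations held by
 * others account for tot_reserved == 250 blocks, so rgd_free() reports
 * 750 blocks available to us.
 */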
1526
1527/**
1528 * rg_mblk_search - find a group of multiple free blocks to form a reservation
1529 * @rgd: the resource group descriptor
1530 * @ip: pointer to the inode for which we're reserving blocks
1531 * @ap: the allocation parameters
1532 *
1533 */
1534
1535static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
1536			   const struct gfs2_alloc_parms *ap)
1537{
1538	struct gfs2_rbm rbm = { .rgd = rgd, };
1539	u64 goal;
1540	struct gfs2_blkreserv *rs = &ip->i_res;
1541	u32 extlen;
1542	u32 free_blocks = rgd_free(rgd, rs);
1543	int ret;
1544	struct inode *inode = &ip->i_inode;
1545
1546	if (S_ISDIR(inode->i_mode))
1547		extlen = 1;
1548	else {
1549		extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
1550		extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
1551	}
1552	if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
1553		return;
1554
1555	/* Find bitmap block that contains bits for goal block */
1556	if (rgrp_contains_block(rgd, ip->i_goal))
1557		goal = ip->i_goal;
1558	else
1559		goal = rgd->rd_last_alloc + rgd->rd_data0;
1560
1561	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
1562		return;
1563
1564	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true);
1565	if (ret == 0) {
1566		rs->rs_rbm = rbm;
1567		rs->rs_free = extlen;
1568		rs_insert(ip);
1569	} else {
1570		if (goal == rgd->rd_last_alloc + rgd->rd_data0)
1571			rgd->rd_last_alloc = 0;
1572	}
1573}
1574
1575/**
1576 * gfs2_next_unreserved_block - Return next block that is not reserved
1577 * @rgd: The resource group
1578 * @block: The starting block
1579 * @length: The required length
1580 * @ip: Ignore any reservations for this inode
1581 *
1582 * If the block does not appear in any reservation, then return the
1583 * block number unchanged. If it does appear in the reservation, then
1584 * keep looking through the tree of reservations in order to find the
1585 * first block number which is not reserved.
1586 */
1587
1588static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
1589				      u32 length,
1590				      const struct gfs2_inode *ip)
1591{
1592	struct gfs2_blkreserv *rs;
1593	struct rb_node *n;
1594	int rc;
1595
1596	spin_lock(&rgd->rd_rsspin);
1597	n = rgd->rd_rstree.rb_node;
1598	while (n) {
1599		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1600		rc = rs_cmp(block, length, rs);
1601		if (rc < 0)
1602			n = n->rb_left;
1603		else if (rc > 0)
1604			n = n->rb_right;
1605		else
1606			break;
1607	}
1608
1609	if (n) {
1610		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
1611			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
1612			n = n->rb_right;
1613			if (n == NULL)
1614				break;
1615			rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1616		}
1617	}
1618
1619	spin_unlock(&rgd->rd_rsspin);
1620	return block;
1621}
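
/*
 * Example (illustrative): with foreign reservations covering blocks
 * 100..109 and 110..119, gfs2_next_unreserved_block(rgd, 105, 4, ip)
 * walks past both and returns block 120, the first block at which a
 * 4-block extent is no longer blocked (reservations belonging to @ip
 * itself are ignored).
 */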
1622
1623/**
1624 * gfs2_reservation_check_and_update - Check for reservations during block alloc
1625 * @rbm: The current position in the resource group
1626 * @ip: The inode for which we are searching for blocks
1627 * @minext: The minimum extent length
1628 * @maxext: A pointer to the maximum extent structure
1629 *
1630 * This checks the current position in the rgrp to see whether there is
1631 * a reservation covering this block. If not then this function is a
1632 * no-op. If there is, then the position is moved to the end of the
1633 * contiguous reservation(s) so that we are pointing at the first
1634 * non-reserved block.
1635 *
1636 * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error
1637 */
1638
1639static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
1640					     const struct gfs2_inode *ip,
1641					     u32 minext,
1642					     struct gfs2_extent *maxext)
1643{
1644	u64 block = gfs2_rbm_to_block(rbm);
1645	u32 extlen = 1;
1646	u64 nblock;
1647	int ret;
1648
1649	/*
1650	 * If we have a minimum extent length, then skip over any extent
1651	 * which is less than the min extent length in size.
1652	 */
1653	if (minext > 1) {
1654		extlen = gfs2_free_extlen(rbm, minext);
1655		if (extlen <= maxext->len)
1656			goto fail;
1657	}
1658
1659	/*
1660	 * Check the extent which has been found against the reservations
1661	 * and skip if parts of it are already reserved
1662	 */
1663	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
1664	if (nblock == block) {
1665		if (!minext || extlen >= minext)
1666			return 0;
1667
1668		if (extlen > maxext->len) {
1669			maxext->len = extlen;
1670			maxext->rbm = *rbm;
1671		}
1672fail:
1673		nblock = block + extlen;
1674	}
1675	ret = gfs2_rbm_from_block(rbm, nblock);
1676	if (ret < 0)
1677		return ret;
1678	return 1;
1679}
1680
1681/**
1682 * gfs2_rbm_find - Look for blocks of a particular state
1683 * @rbm: Value/result starting position and final position
1684 * @state: The state which we want to find
1685 * @minext: Pointer to the requested extent length
1686 *          This is updated to be the actual reservation size.
1687 * @ip: If set, check for reservations
1688 * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
1689 *          around until we've reached the starting point.
1690 *
1691 * Side effects:
1692 * - If looking for free blocks, we set GBF_FULL on each bitmap which
1693 *   has no free blocks in it.
1694 * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
1695 *   has come up short on a free block search.
1696 *
1697 * Returns: 0 on success, -ENOSPC if there is no block of the requested state
1698 */
1699
1700static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
1701			 const struct gfs2_inode *ip, bool nowrap)
1702{
1703	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
1704	struct buffer_head *bh;
1705	int last_bii;
1706	u32 offset;
1707	u8 *buffer;
1708	bool wrapped = false;
1709	int ret;
1710	struct gfs2_bitmap *bi;
1711	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
1712
1713	/*
1714	 * Determine the last bitmap to search.  If we're not starting at the
1715	 * beginning of a bitmap, we need to search that bitmap twice to scan
1716	 * the entire resource group.
1717	 */
1718	last_bii = rbm->bii - (rbm->offset == 0);
1719
1720	while(1) {
1721		bi = rbm_bi(rbm);
1722		if (test_bit(GBF_FULL, &bi->bi_flags) &&
1723		    (state == GFS2_BLKST_FREE))
1724			goto next_bitmap;
1725
1726		bh = bi->bi_bh;
1727		buffer = bh->b_data + bi->bi_offset;
1728		WARN_ON(!buffer_uptodate(bh));
1729		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1730			buffer = bi->bi_clone + bi->bi_offset;
1731		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
1732		if (offset == BFITNOENT) {
1733			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
1734				set_bit(GBF_FULL, &bi->bi_flags);
1735			goto next_bitmap;
1736		}
1737		rbm->offset = offset;
1738		if (ip == NULL)
1739			return 0;
1740
1741		ret = gfs2_reservation_check_and_update(rbm, ip, *minext,
1742							&maxext);
1743		if (ret == 0)
1744			return 0;
1745		if (ret > 0)
1746			goto next_iter;
1747		if (ret == -E2BIG) {
1748			rbm->bii = 0;
1749			rbm->offset = 0;
1750			goto res_covered_end_of_rgrp;
1751		}
1752		return ret;
1753
1754next_bitmap:	/* Find next bitmap in the rgrp */
1755		rbm->offset = 0;
1756		rbm->bii++;
1757		if (rbm->bii == rbm->rgd->rd_length)
1758			rbm->bii = 0;
1759res_covered_end_of_rgrp:
1760		if (rbm->bii == 0) {
1761			if (wrapped)
1762				break;
1763			wrapped = true;
1764			if (nowrap)
1765				break;
1766		}
1767next_iter:
1768		/* Have we scanned the entire resource group? */
1769		if (wrapped && rbm->bii > last_bii)
1770			break;
1771	}
1772
1773	if (state != GFS2_BLKST_FREE)
1774		return -ENOSPC;
1775
1776	/* If the extent was too small, and it's smaller than the smallest
1777	   to have failed before, remember for future reference that it's
1778	   useless to search this rgrp again for this amount or more. */
1779	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
1780	    *minext < rbm->rgd->rd_extfail_pt)
1781		rbm->rgd->rd_extfail_pt = *minext - 1;
1782
1783	/* If the maximum extent we found is big enough to fulfill the
1784	   minimum requirements, use it anyway. */
1785	if (maxext.len) {
1786		*rbm = maxext.rbm;
1787		*minext = maxext.len;
1788		return 0;
1789	}
1790
1791	return -ENOSPC;
1792}
1793
1794/**
1795 * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
1796 * @rgd: The rgrp
1797 * @last_unlinked: block address of the last dinode we unlinked
1798 * @skip: block address we should explicitly not unlink
1799 *
1800 * Unlinked dinodes that are found are handed to the delete workqueue
1801 * for disposal; the function itself returns nothing.
1802 */
1803
1804static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
1805{
1806	u64 block;
1807	struct gfs2_sbd *sdp = rgd->rd_sbd;
1808	struct gfs2_glock *gl;
1809	struct gfs2_inode *ip;
1810	int error;
1811	int found = 0;
1812	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
1813
1814	while (1) {
1815		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
1816				      true);
1817		if (error == -ENOSPC)
1818			break;
1819		if (WARN_ON_ONCE(error))
1820			break;
1821
1822		block = gfs2_rbm_to_block(&rbm);
1823		if (gfs2_rbm_from_block(&rbm, block + 1))
1824			break;
1825		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
1826			continue;
1827		if (block == skip)
1828			continue;
1829		*last_unlinked = block;
1830
1831		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
1832		if (error)
1833			continue;
1834
1835		/* If the inode is already in cache, we can ignore it here
1836		 * because the existing inode disposal code will deal with
1837		 * it when all refs have gone away. Accessing gl_object like
1838		 * this is not safe in general. Here it is ok because we do
1839		 * not dereference the pointer, and we only need an approx
1840		 * answer to whether it is NULL or not.
1841		 */
1842		ip = gl->gl_object;
1843
1844		if (ip || !gfs2_queue_delete_work(gl, 0))
1845			gfs2_glock_put(gl);
1846		else
1847			found++;
1848
1849 		/* Limit reclaim to a sensible number of tasks */
1850		if (found > NR_CPUS)
1851			return;
1852	}
1853
1854	rgd->rd_flags &= ~GFS2_RDF_CHECK;
1855	return;
1856}
1857
1858/**
1859 * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
1860 * @rgd: The rgrp in question
1861 * @loops: An indication of how picky we can be (0=very, 1=less so)
1862 *
1863 * This function uses the recently added glock statistics in order to
1864 * figure out whether a parciular resource group is suffering from
1865 * figure out whether a particular resource group is suffering from
1866 * of timings, since this is the only data we have to work with and
1867 * our aim here is to reject a resource group which is highly contended
1868 * but (very important) not to do this too often in order to ensure that
1869 * we do not land up introducing fragmentation by changing resource
1870 * groups when not actually required.
1871 *
1872 * The calculation is fairly simple: we want to know whether the SRTTB
1873 * (i.e. smoothed round trip time for blocking operations) to acquire
1874 * the lock for this rgrp's glock is significantly greater than the
1875 * time taken for resource groups on average. We introduce a margin in
1876 * the form of the variable @var which is computed as the sum of the two
1877 * respective variances, and multiplied by a factor depending on @loops
1878 * and whether we have a lot of data to base the decision on. This is
1879 * then tested against the square difference of the means in order to
1880 * decide whether the result is statistically significant or not.
1881 *
1882 * Returns: A boolean verdict on the congestion status
1883 */
1884
1885static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
1886{
1887	const struct gfs2_glock *gl = rgd->rd_gl;
1888	const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
1889	struct gfs2_lkstats *st;
1890	u64 r_dcount, l_dcount;
1891	u64 l_srttb, a_srttb = 0;
1892	s64 srttb_diff;
1893	u64 sqr_diff;
1894	u64 var;
1895	int cpu, nonzero = 0;
1896
1897	preempt_disable();
1898	for_each_present_cpu(cpu) {
1899		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
1900		if (st->stats[GFS2_LKS_SRTTB]) {
1901			a_srttb += st->stats[GFS2_LKS_SRTTB];
1902			nonzero++;
1903		}
1904	}
1905	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
1906	if (nonzero)
1907		do_div(a_srttb, nonzero);
1908	r_dcount = st->stats[GFS2_LKS_DCOUNT];
1909	var = st->stats[GFS2_LKS_SRTTVARB] +
1910	      gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
1911	preempt_enable();
1912
1913	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
1914	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
1915
1916	if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
1917		return false;
1918
1919	srttb_diff = a_srttb - l_srttb;
1920	sqr_diff = srttb_diff * srttb_diff;
1921
1922	var *= 2;
1923	if (l_dcount < 8 || r_dcount < 8)
1924		var *= 2;
1925	if (loops == 1)
1926		var *= 2;
1927
1928	return ((srttb_diff < 0) && (sqr_diff > var));
1929}
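
/*
 * Illustrative numbers for the significance test above: if a_srttb = 1000
 * and l_srttb = 2000, then srttb_diff = -1000 and sqr_diff = 1000000.
 * With summed variances of 200000, var becomes 400000 after the
 * unconditional doubling (doubling again if either dcount < 8, or on
 * loops == 1), so sqr_diff > var and the rgrp is reported as congested.
 */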
1930
1931/**
1932 * gfs2_rgrp_used_recently - check when the rgrp glock was last used
1933 * @rs: The block reservation with the rgrp to test
1934 * @msecs: The time limit in milliseconds
1935 *
1936 * Returns: True if more than @msecs milliseconds have elapsed since
 *          the rgrp glock was last used (i.e. false if it was used
 *          recently)
1937 */
1938static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs,
1939				    u64 msecs)
1940{
1941	u64 tdiff;
1942
1943	tdiff = ktime_to_ns(ktime_sub(ktime_get_real(),
1944				      rs->rs_rbm.rgd->rd_gl->gl_dstamp));
1945
1946	return tdiff > (msecs * 1000 * 1000);
1947}
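
/*
 * For example, gfs2_rgrp_used_recently(rs, 1000) is true only once more
 * than one second (1000 * 1000 * 1000 ns) has passed since gl_dstamp was
 * last updated for the rgrp's glock.
 */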
1948
1949static u32 gfs2_orlov_skip(const struct gfs2_inode *ip)
1950{
1951	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1952	u32 skip;
1953
1954	get_random_bytes(&skip, sizeof(skip));
1955	return skip % sdp->sd_rgrps;
1956}
1957
1958static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin)
1959{
1960	struct gfs2_rgrpd *rgd = *pos;
1961	struct gfs2_sbd *sdp = rgd->rd_sbd;
1962
1963	rgd = gfs2_rgrpd_get_next(rgd);
1964	if (rgd == NULL)
1965		rgd = gfs2_rgrpd_get_first(sdp);
1966	*pos = rgd;
1967	if (rgd != begin) /* If we didn't wrap */
1968		return true;
1969	return false;
1970}
1971
1972/**
1973 * fast_to_acquire - determine if a resource group will be fast to acquire
1974 * @rgd: The rgrp in question
 *
1975 * If this is one of our preferred rgrps, it should be quicker to acquire,
1976 * because we tried to set ourselves up as dlm lock master.
1977 */
1978static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
1979{
1980	struct gfs2_glock *gl = rgd->rd_gl;
1981
1982	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
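	/*
	 * A glock that is already held in some state, with no holders
	 * queued and no demote pending, can usually be re-acquired
	 * without another DLM round trip.
	 */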
1983	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
1984	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
1985		return 1;
1986	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
1987		return 1;
1988	return 0;
1989}
1990
1991/**
1992 * gfs2_inplace_reserve - Reserve space in the filesystem
1993 * @ip: the inode to reserve space for
1994 * @ap: the allocation parameters
1995 *
1996 * We try our best to find an rgrp that has at least ap->target blocks
1997 * available. After a couple of passes (loops == 2), the prospects of finding
1998 * such an rgrp diminish. At this stage, we return the first rgrp that has
1999 * at least ap->min_target blocks available. Either way, we set ap->allowed to
2000 * the number of blocks available in the chosen rgrp.
2001 *
2002 * Returns: 0 on success,
2003 *          -ENOSPC if a suitable rgrp can't be found
2004 *          errno otherwise
2005 */
2006
2007int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
2008{
2009	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2010	struct gfs2_rgrpd *begin = NULL;
2011	struct gfs2_blkreserv *rs = &ip->i_res;
2012	int error = 0, rg_locked, flags = 0;
2013	u64 last_unlinked = NO_BLOCK;
2014	int loops = 0;
2015	u32 free_blocks, skip = 0;
2016
2017	if (sdp->sd_args.ar_rgrplvb)
2018		flags |= GL_SKIP;
2019	if (gfs2_assert_warn(sdp, ap->target))
2020		return -EINVAL;
2021	if (gfs2_rs_active(rs)) {
2022		begin = rs->rs_rbm.rgd;
2023	} else if (rs->rs_rbm.rgd &&
2024		   rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) {
2025		begin = rs->rs_rbm.rgd;
2026	} else {
2027		check_and_update_goal(ip);
2028		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
2029	}
2030	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
2031		skip = gfs2_orlov_skip(ip);
2032	if (rs->rs_rbm.rgd == NULL)
2033		return -EBADSLT;
2034
2035	while (loops < 3) {
2036		rg_locked = 1;
2037
2038		if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) {
2039			rg_locked = 0;
2040			if (skip && skip--)
2041				goto next_rgrp;
2042			if (!gfs2_rs_active(rs)) {
2043				if (loops == 0 &&
2044				    !fast_to_acquire(rs->rs_rbm.rgd))
2045					goto next_rgrp;
2046				if ((loops < 2) &&
2047				    gfs2_rgrp_used_recently(rs, 1000) &&
2048				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2049					goto next_rgrp;
2050			}
2051			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
2052						   LM_ST_EXCLUSIVE, flags,
2053						   &ip->i_rgd_gh);
2054			if (unlikely(error))
2055				return error;
2056			if (!gfs2_rs_active(rs) && (loops < 2) &&
2057			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
2058				goto skip_rgrp;
2059			if (sdp->sd_args.ar_rgrplvb) {
2060				error = update_rgrp_lvb(rs->rs_rbm.rgd);
2061				if (unlikely(error)) {
2062					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2063					return error;
2064				}
2065			}
2066		}
2067
2068		/* Skip unusable resource groups */
2069		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
2070						 GFS2_RDF_ERROR)) ||
2071		    (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
2072			goto skip_rgrp;
2073
2074		if (sdp->sd_args.ar_rgrplvb)
2075			gfs2_rgrp_bh_get(rs->rs_rbm.rgd);
2076
2077		/* Get a reservation if we don't already have one */
2078		if (!gfs2_rs_active(rs))
2079			rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
2080
2081		/* Skip rgrps when we can't get a reservation on first pass */
2082		if (!gfs2_rs_active(rs) && (loops < 1))
2083			goto check_rgrp;
2084
2085		/* If rgrp has enough free space, use it */
2086		free_blocks = rgd_free(rs->rs_rbm.rgd, rs);
2087		if (free_blocks >= ap->target ||
2088		    (loops == 2 && ap->min_target &&
2089		     free_blocks >= ap->min_target)) {
2090			ap->allowed = free_blocks;
2091			return 0;
2092		}
2093check_rgrp:
2094		/* Check for unlinked inodes which can be reclaimed */
2095		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
2096			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
2097					ip->i_no_addr);
2098skip_rgrp:
2099		/* Drop reservation, if we couldn't use reserved rgrp */
2100		if (gfs2_rs_active(rs))
2101			gfs2_rs_deltree(rs);
2102
2103		/* Unlock rgrp if required */
2104		if (!rg_locked)
2105			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2106next_rgrp:
2107		/* Find the next rgrp, and continue looking */
2108		if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
2109			continue;
2110		if (skip)
2111			continue;
2112
2113		/* If we've scanned all the rgrps, but found no free blocks
2114		 * then check for some less likely conditions before
2115		 * trying again.
2116		 */
2117		loops++;
2118		/* Check that fs hasn't grown if writing to rindex */
2119		if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
2120			error = gfs2_ri_update(ip);
2121			if (error)
2122				return error;
2123		}
2124		/* Flushing the log may release space */
2125		if (loops == 2)
2126			gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
2127				       GFS2_LFC_INPLACE_RESERVE);
2128	}
2129
2130	return -ENOSPC;
2131}
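
/*
 * A summary of how the passes above escalate: on the first pass
 * (loops == 0) an rgrp is only used if it looks fast to acquire, is not
 * congested, can plausibly satisfy ap->target (rd_extfail_pt), and
 * yields a multi-block reservation.  The second pass drops the
 * fast_to_acquire and reservation requirements.  The third pass,
 * entered after a log flush that may release pinned space, also drops
 * the congestion test and accepts ap->min_target blocks.
 */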
2132
2133/**
2134 * gfs2_inplace_release - release an inplace reservation
2135 * @ip: the inode the reservation was taken out on
2136 *
2137 * Release a reservation made by gfs2_inplace_reserve().
2138 */
2139
2140void gfs2_inplace_release(struct gfs2_inode *ip)
2141{
2142	if (gfs2_holder_initialized(&ip->i_rgd_gh))
2143		gfs2_glock_dq_uninit(&ip->i_rgd_gh);
2144}
2145
2146/**
2147 * gfs2_alloc_extent - allocate an extent from a given bitmap
2148 * @rbm: the resource group information
2149 * @dinode: TRUE if the first block we allocate is for a dinode
2150 * @n: The extent length (value/result)
2151 *
2152 * Add the bitmap buffer to the transaction.
2153 * Set the found bits to @new_state to change block's allocation state.
2154 */
2155static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
2156			     unsigned int *n)
2157{
2158	struct gfs2_rbm pos = { .rgd = rbm->rgd, };
2159	const unsigned int elen = *n;
2160	u64 block;
2161	int ret;
2162
2163	*n = 1;
2164	block = gfs2_rbm_to_block(rbm);
2165	gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
2166	gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2167	block++;
2168	while (*n < elen) {
2169		ret = gfs2_rbm_from_block(&pos, block);
2170		if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
2171			break;
2172		gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
2173		gfs2_setbit(&pos, true, GFS2_BLKST_USED);
2174		(*n)++;
2175		block++;
2176	}
2177}
2178
2179/**
2180 * rgblk_free - Change alloc state of given block(s)
2181 * @sdp: the filesystem
2182 * @rgd: the resource group the blocks are in
2183 * @bstart: the start of a run of blocks to free
2184 * @blen: the length of the block run (all must lie within ONE RG!)
2185 * @new_state: GFS2_BLKST_XXX, the new state of the block run
2186 */
2187
2188static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
2189		       u64 bstart, u32 blen, unsigned char new_state)
2190{
2191	struct gfs2_rbm rbm;
2192	struct gfs2_bitmap *bi, *bi_prev = NULL;
2193
2194	rbm.rgd = rgd;
2195	if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
2196		return;
2197	while (blen--) {
2198		bi = rbm_bi(&rbm);
2199		if (bi != bi_prev) {
2200			if (!bi->bi_clone) {
2201				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2202						      GFP_NOFS | __GFP_NOFAIL);
2203				memcpy(bi->bi_clone + bi->bi_offset,
2204				       bi->bi_bh->b_data + bi->bi_offset,
2205				       bi->bi_bytes);
2206			}
2207			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2208			bi_prev = bi;
2209		}
2210		gfs2_setbit(&rbm, false, new_state);
2211		gfs2_rbm_incr(&rbm);
2212	}
2213}
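
/*
 * Broadly, the bi_clone snapshot taken above preserves the bitmap as it
 * looked before the first free within the current transaction.
 * Allocation paths that test the clone bitmap still see the freed
 * blocks as allocated, so the blocks cannot be reused until the free
 * has been committed to the journal and the clone resynced.
 */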
2214
2215/**
2216 * gfs2_rgrp_dump - print out an rgrp
2217 * @seq: The iterator
2218 * @rgd: The rgrp in question
2219 * @fs_id_buf: pointer to file system id (if requested)
2220 *
2221 */
2222
2223void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
2224		    const char *fs_id_buf)
2225{
2226	struct gfs2_blkreserv *trs;
2227	const struct rb_node *n;
2228
2229	gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
2230		       fs_id_buf,
2231		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
2232		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
2233		       rgd->rd_reserved, rgd->rd_extfail_pt);
2234	if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) {
2235		struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
2236
2237		gfs2_print_dbg(seq, "%s  L: f:%02x b:%u i:%u\n", fs_id_buf,
2238			       be32_to_cpu(rgl->rl_flags),
2239			       be32_to_cpu(rgl->rl_free),
2240			       be32_to_cpu(rgl->rl_dinodes));
2241	}
2242	spin_lock(&rgd->rd_rsspin);
2243	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
2244		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
2245		dump_rs(seq, trs, fs_id_buf);
2246	}
2247	spin_unlock(&rgd->rd_rsspin);
2248}
2249
2250static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
2251{
2252	struct gfs2_sbd *sdp = rgd->rd_sbd;
2253	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
2254
2255	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
2256		(unsigned long long)rgd->rd_addr);
2257	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
2258	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
2259	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
2260	rgd->rd_flags |= GFS2_RDF_ERROR;
2261}
2262
2263/**
2264 * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
2265 * @ip: The inode we have just allocated blocks for
2266 * @rbm: The start of the allocated blocks
2267 * @len: The extent length
2268 *
2269 * Adjusts a reservation after an allocation has taken place. If the
2270 * reservation does not match the allocation, or if it is now empty
2271 * then it is removed.
2272 */
2273
2274static void gfs2_adjust_reservation(struct gfs2_inode *ip,
2275				    const struct gfs2_rbm *rbm, unsigned len)
2276{
2277	struct gfs2_blkreserv *rs = &ip->i_res;
2278	struct gfs2_rgrpd *rgd = rbm->rgd;
2279	unsigned rlen;
2280	u64 block;
2281	int ret;
2282
2283	spin_lock(&rgd->rd_rsspin);
2284	if (gfs2_rs_active(rs)) {
2285		if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) {
2286			block = gfs2_rbm_to_block(rbm);
2287			ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len);
2288			rlen = min(rs->rs_free, len);
2289			rs->rs_free -= rlen;
2290			rgd->rd_reserved -= rlen;
2291			trace_gfs2_rs(rs, TRACE_RS_CLAIM);
2292			if (rs->rs_free && !ret)
2293				goto out;
2294			/* We used up our block reservation, so we should
2295			   reserve more blocks next time. */
2296			atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
2297		}
2298		__rs_deltree(rs);
2299	}
2300out:
2301	spin_unlock(&rgd->rd_rsspin);
2302}
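
/*
 * For example (hypothetical numbers): if a reservation starts at block
 * B with rs_free == 10 and an allocation claims 4 blocks starting at B,
 * the reservation is advanced to B + 4 with rs_free == 6 and
 * rd_reserved reduced by 4.  Had the allocation not started at the
 * reservation, or had rs_free dropped to zero, the reservation would
 * instead have been removed from the tree.
 */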
2303
2304/**
2305 * gfs2_set_alloc_start - Set starting point for block allocation
2306 * @rbm: The rbm which will be set to the required location
2307 * @ip: The gfs2 inode
2308 * @dinode: Flag to say if allocation includes a new inode
2309 *
2310 * This sets the starting point from the reservation if one is active
2311 * otherwise it falls back to guessing a start point based on the
2312 * inode's goal block or the last allocation point in the rgrp.
2313 */
2314
2315static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
2316				 const struct gfs2_inode *ip, bool dinode)
2317{
2318	u64 goal;
2319
2320	if (gfs2_rs_active(&ip->i_res)) {
2321		*rbm = ip->i_res.rs_rbm;
2322		return;
2323	}
2324
2325	if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
2326		goal = ip->i_goal;
2327	else
2328		goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
2329
2330	if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
2331		rbm->bii = 0;
2332		rbm->offset = 0;
2333	}
2334}
2335
2336/**
2337 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
2338 * @ip: the inode to allocate the block for
2339 * @bn: Used to return the starting block number
2340 * @nblocks: requested number of blocks/extent length (value/result)
2341 * @dinode: 1 if we're allocating a dinode block, else 0
2342 * @generation: the generation number of the inode
2343 *
2344 * Returns: 0 or error
2345 */
2346
2347int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
2348		      bool dinode, u64 *generation)
2349{
2350	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2351	struct buffer_head *dibh;
2352	struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, };
2353	unsigned int ndata;
2354	u64 block; /* block, within the file system scope */
2355	u32 minext = 1;
2356	int error;
2357
2358	gfs2_set_alloc_start(&rbm, ip, dinode);
2359	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, ip, false);
2360
2361	if (error == -ENOSPC) {
2362		gfs2_set_alloc_start(&rbm, ip, dinode);
2363		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false);
2364	}
2365
2366	/* Since all blocks are reserved in advance, this shouldn't happen */
2367	if (error) {
2368		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
2369			(unsigned long long)ip->i_no_addr, error, *nblocks,
2370			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
2371			rbm.rgd->rd_extfail_pt);
2372		goto rgrp_error;
2373	}
2374
2375	gfs2_alloc_extent(&rbm, dinode, nblocks);
2376	block = gfs2_rbm_to_block(&rbm);
2377	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
2378	if (gfs2_rs_active(&ip->i_res))
2379		gfs2_adjust_reservation(ip, &rbm, *nblocks);
2380	ndata = *nblocks;
2381	if (dinode)
2382		ndata--;
2383
2384	if (!dinode) {
2385		ip->i_goal = block + ndata - 1;
2386		error = gfs2_meta_inode_buffer(ip, &dibh);
2387		if (error == 0) {
2388			struct gfs2_dinode *di =
2389				(struct gfs2_dinode *)dibh->b_data;
2390			gfs2_trans_add_meta(ip->i_gl, dibh);
2391			di->di_goal_meta = di->di_goal_data =
2392				cpu_to_be64(ip->i_goal);
2393			brelse(dibh);
2394		}
2395	}
2396	if (rbm.rgd->rd_free < *nblocks) {
2397		fs_warn(sdp, "nblocks=%u\n", *nblocks);
2398		goto rgrp_error;
2399	}
2400
2401	rbm.rgd->rd_free -= *nblocks;
2402	if (dinode) {
2403		rbm.rgd->rd_dinodes++;
2404		*generation = rbm.rgd->rd_igeneration++;
2405		if (*generation == 0)
2406			*generation = rbm.rgd->rd_igeneration++;
2407	}
2408
2409	gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
2410	gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
2411
2412	gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
2413	if (dinode)
2414		gfs2_trans_remove_revoke(sdp, block, *nblocks);
2415
2416	gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
2417
2418	rbm.rgd->rd_free_clone -= *nblocks;
2419	trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
2420			       dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
2421	*bn = block;
2422	return 0;
2423
2424rgrp_error:
2425	gfs2_rgrp_error(rbm.rgd);
2426	return -EIO;
2427}
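
/*
 * A simplified sketch of the typical caller sequence (error paths,
 * quota handling and transaction sizing omitted; "n" and "rqd_blocks"
 * are placeholders for the caller's values):
 *
 *	struct gfs2_alloc_parms ap = { .target = n, .aflags = 0 };
 *	u64 bn;
 *	int error;
 *
 *	error = gfs2_inplace_reserve(ip, &ap);
 *	if (!error) {
 *		error = gfs2_trans_begin(sdp, rqd_blocks, 0);
 *		if (!error) {
 *			error = gfs2_alloc_blocks(ip, &bn, &n, false, NULL);
 *			gfs2_trans_end(sdp);
 *		}
 *		gfs2_inplace_release(ip);
 *	}
 */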
2428
2429/**
2430 * __gfs2_free_blocks - free a contiguous run of block(s)
2431 * @ip: the inode these blocks are being freed from
2432 * @rgd: the resource group the blocks are in
2433 * @bstart: first block of a run of contiguous blocks
2434 * @blen: the length of the block run
2435 * @meta: 1 if the blocks represent metadata
2436 *
2437 */
2438
2439void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2440			u64 bstart, u32 blen, int meta)
2441{
2442	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2443
2444	rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
2445	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
2446	rgd->rd_free += blen;
2447	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
2448	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2449	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2450
2451	/* Directories keep their data in the metadata address space */
2452	if (meta || ip->i_depth || gfs2_is_jdata(ip))
2453		gfs2_journal_wipe(ip, bstart, blen);
2454}
2455
2456/**
2457 * gfs2_free_meta - free a contiguous run of metadata block(s)
2458 * @ip: the inode these blocks are being freed from
2459 * @rgd: the resource group the blocks are in
2460 * @bstart: first block of a run of contiguous blocks
2461 * @blen: the length of the block run
2462 *
2463 */
2464
2465void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
2466		    u64 bstart, u32 blen)
2467{
2468	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2469
2470	__gfs2_free_blocks(ip, rgd, bstart, blen, 1);
2471	gfs2_statfs_change(sdp, 0, +blen, 0);
2472	gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
2473}
2474
2475void gfs2_unlink_di(struct inode *inode)
2476{
2477	struct gfs2_inode *ip = GFS2_I(inode);
2478	struct gfs2_sbd *sdp = GFS2_SB(inode);
2479	struct gfs2_rgrpd *rgd;
2480	u64 blkno = ip->i_no_addr;
2481
2482	rgd = gfs2_blk2rgrpd(sdp, blkno, true);
2483	if (!rgd)
2484		return;
2485	rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2486	trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
2487	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2488	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2489	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
2490}
2491
2492void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
2493{
2494	struct gfs2_sbd *sdp = rgd->rd_sbd;
2495
2496	rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2497	if (!rgd->rd_dinodes)
2498		gfs2_consist_rgrpd(rgd);
2499	rgd->rd_dinodes--;
2500	rgd->rd_free++;
2501
2502	gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
2503	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
2504	be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
2505
2506	gfs2_statfs_change(sdp, 0, +1, -1);
2507	trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
2508	gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
2509	gfs2_journal_wipe(ip, ip->i_no_addr, 1);
2510}
2511
2512/**
2513 * gfs2_check_blk_type - Check the type of a block
2514 * @sdp: The superblock
2515 * @no_addr: The block number to check
2516 * @type: The block type we are looking for
2517 *
2518 * Returns: 0 if the block type matches the expected type
2519 *          -ESTALE if it doesn't match
2520 *          or a negative errno if something went wrong while checking
2521 */
2522
2523int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
2524{
2525	struct gfs2_rgrpd *rgd;
2526	struct gfs2_holder rgd_gh;
2527	struct gfs2_rbm rbm;
2528	int error = -EINVAL;
2529
2530	rgd = gfs2_blk2rgrpd(sdp, no_addr, 1);
2531	if (!rgd)
2532		goto fail;
2533
2534	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
2535	if (error)
2536		goto fail;
2537
2538	rbm.rgd = rgd;
2539	error = gfs2_rbm_from_block(&rbm, no_addr);
2540	if (!WARN_ON_ONCE(error)) {
2541		if (gfs2_testbit(&rbm, false) != type)
2542			error = -ESTALE;
2543	}
2544
2545	gfs2_glock_dq_uninit(&rgd_gh);
2546
2547fail:
2548	return error;
2549}
2550
2551/**
2552 * gfs2_rlist_add - add a RG to a list of RGs
2553 * @ip: the inode
2554 * @rlist: the list of resource groups
2555 * @block: the block number to look up
2556 *
2557 * Figure out what RG a block belongs to and add that RG to the list
2558 *
2559 * FIXME: Don't use NOFAIL
2560 *
2561 */
2562
2563void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
2564		    u64 block)
2565{
2566	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2567	struct gfs2_rgrpd *rgd;
2568	struct gfs2_rgrpd **tmp;
2569	unsigned int new_space;
2570	unsigned int x;
2571
2572	if (gfs2_assert_warn(sdp, !rlist->rl_ghs))
2573		return;
2574
2575	/*
2576	 * The resource group last accessed is kept in the last position.
2577	 */
2578
2579	if (rlist->rl_rgrps) {
2580		rgd = rlist->rl_rgd[rlist->rl_rgrps - 1];
2581		if (rgrp_contains_block(rgd, block))
2582			return;
2583		rgd = gfs2_blk2rgrpd(sdp, block, 1);
2584	} else {
2585		rgd = ip->i_res.rs_rbm.rgd;
2586		if (!rgd || !rgrp_contains_block(rgd, block))
2587			rgd = gfs2_blk2rgrpd(sdp, block, 1);
2588	}
2589
2590	if (!rgd) {
2591		fs_err(sdp, "rlist_add: no rgrp for block %llu\n",
2592		       (unsigned long long)block);
2593		return;
2594	}
2595
2596	for (x = 0; x < rlist->rl_rgrps; x++) {
2597		if (rlist->rl_rgd[x] == rgd) {
2598			swap(rlist->rl_rgd[x],
2599			     rlist->rl_rgd[rlist->rl_rgrps - 1]);
2600			return;
2601		}
2602	}
2603
2604	if (rlist->rl_rgrps == rlist->rl_space) {
2605		new_space = rlist->rl_space + 10;
2606
2607		tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *),
2608			      GFP_NOFS | __GFP_NOFAIL);
2609
2610		if (rlist->rl_rgd) {
2611			memcpy(tmp, rlist->rl_rgd,
2612			       rlist->rl_space * sizeof(struct gfs2_rgrpd *));
2613			kfree(rlist->rl_rgd);
2614		}
2615
2616		rlist->rl_space = new_space;
2617		rlist->rl_rgd = tmp;
2618	}
2619
2620	rlist->rl_rgd[rlist->rl_rgrps++] = rgd;
2621}
2622
2623/**
2624 * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
2625 *      and initialize an array of glock holders for them
2626 * @rlist: the list of resource groups
2627 *
2628 * FIXME: Don't use NOFAIL
2629 *
2630 */
2631
2632void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
2633{
2634	unsigned int x;
2635
2636	rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps,
2637				      sizeof(struct gfs2_holder),
2638				      GFP_NOFS | __GFP_NOFAIL);
2639	for (x = 0; x < rlist->rl_rgrps; x++)
2640		gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
2641				LM_ST_EXCLUSIVE, 0,
2642				&rlist->rl_ghs[x]);
2643}
2644
2645/**
2646 * gfs2_rlist_free - free a resource group list
2647 * @rlist: the list of resource groups
2648 *
2649 */
2650
2651void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
2652{
2653	unsigned int x;
2654
2655	kfree(rlist->rl_rgd);
2656
2657	if (rlist->rl_ghs) {
2658		for (x = 0; x < rlist->rl_rgrps; x++)
2659			gfs2_holder_uninit(&rlist->rl_ghs[x]);
2660		kfree(rlist->rl_ghs);
2661		rlist->rl_ghs = NULL;
2662	}
2663}
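
/*
 * A sketch of the rlist lifecycle as used by the deallocation paths
 * (gfs2_glock_nq_m/gfs2_glock_dq_m from glock.h; transaction setup and
 * the actual freeing are elided):
 *
 *	struct gfs2_rgrp_list rlist = { };
 *	int error;
 *
 *	for (each block run being freed)
 *		gfs2_rlist_add(ip, &rlist, block);
 *	gfs2_rlist_alloc(&rlist);
 *	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
 *	if (!error) {
 *		... free the blocks via __gfs2_free_blocks() etc ...
 *		gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
 *	}
 *	gfs2_rlist_free(&rlist);
 */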
2664