xref: /kernel/linux/linux-6.6/fs/xfs/libxfs/xfs_alloc.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_format.h"
9#include "xfs_log_format.h"
10#include "xfs_shared.h"
11#include "xfs_trans_resv.h"
12#include "xfs_bit.h"
13#include "xfs_mount.h"
14#include "xfs_defer.h"
15#include "xfs_btree.h"
16#include "xfs_rmap.h"
17#include "xfs_alloc_btree.h"
18#include "xfs_alloc.h"
19#include "xfs_extent_busy.h"
20#include "xfs_errortag.h"
21#include "xfs_error.h"
22#include "xfs_trace.h"
23#include "xfs_trans.h"
24#include "xfs_buf_item.h"
25#include "xfs_log.h"
26#include "xfs_ag.h"
27#include "xfs_ag_resv.h"
28#include "xfs_bmap.h"
29
30struct kmem_cache	*xfs_extfree_item_cache;
31
32struct workqueue_struct *xfs_alloc_wq;
33
34#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
35
36#define	XFSA_FIXUP_BNO_OK	1
37#define	XFSA_FIXUP_CNT_OK	2
38
39/*
40 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
41 * the beginning of the block for a proper header with the location information
42 * and CRC.
43 */
44unsigned int
45xfs_agfl_size(
46	struct xfs_mount	*mp)
47{
48	unsigned int		size = mp->m_sb.sb_sectsize;
49
50	if (xfs_has_crc(mp))
51		size -= sizeof(struct xfs_agfl);
52
53	return size / sizeof(xfs_agblock_t);
54}
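
/*
 * Illustrative sizing (not taken from this file): on a v5 (CRC-enabled)
 * filesystem the header steals sizeof(struct xfs_agfl) bytes from the sector,
 * so the free list holds (sb_sectsize - sizeof(struct xfs_agfl)) /
 * sizeof(xfs_agblock_t) entries.  Assuming 512-byte sectors and the common
 * 36-byte header, that is (512 - 36) / 4 = 119 slots; without CRCs the whole
 * sector is available, i.e. 512 / 4 = 128 slots.
 */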
55
56unsigned int
57xfs_refc_block(
58	struct xfs_mount	*mp)
59{
60	if (xfs_has_rmapbt(mp))
61		return XFS_RMAP_BLOCK(mp) + 1;
62	if (xfs_has_finobt(mp))
63		return XFS_FIBT_BLOCK(mp) + 1;
64	return XFS_IBT_BLOCK(mp) + 1;
65}
66
67xfs_extlen_t
68xfs_prealloc_blocks(
69	struct xfs_mount	*mp)
70{
71	if (xfs_has_reflink(mp))
72		return xfs_refc_block(mp) + 1;
73	if (xfs_has_rmapbt(mp))
74		return XFS_RMAP_BLOCK(mp) + 1;
75	if (xfs_has_finobt(mp))
76		return XFS_FIBT_BLOCK(mp) + 1;
77	return XFS_IBT_BLOCK(mp) + 1;
78}
79
80/*
81 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
82 * guarantee that we can refill the AGFL prior to allocating space in a nearly
83 * full AG.  Although the space described by the free space btrees, the
84 * blocks used by the freesp btrees themselves, and the blocks owned by the
85 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
86 * free space in the AG drop so low that the free space btrees cannot refill an
87 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
88 * until the fs goes down, we subtract this many AG blocks from the incore
89 * fdblocks to ensure user allocation does not overcommit the space the
90 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
91 * withhold space from xfs_mod_fdblocks, so we do not account for that here.
92 */
93#define XFS_ALLOCBT_AGFL_RESERVE	4
94
95/*
96 * Compute the number of blocks that we set aside to guarantee the ability to
97 * refill the AGFL and handle a full bmap btree split.
98 *
99 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
100 * AGF buffer (PV 947395), we place constraints on the relationship among
101 * actual allocations for data blocks, freelist blocks, and potential file data
102 * bmap btree blocks. However, these restrictions may result in no actual space
103 * being allocated for a delayed extent: for example, a data block in some AG
104 * may be allocated, but no block is available for the extra bmap btree block
105 * needed when the file's bmap btree splits. This can lead to an infinite loop
106 * when the file gets flushed to disk and all delayed extents need to be
107 * actually allocated. To get around this, we explicitly set
108 * aside a few blocks which will not be reserved in delayed allocation.
109 *
110 * For each AG, we need to reserve enough blocks to replenish a totally empty
111 * AGFL and 4 more to handle a potential split of the file's bmap btree.
112 */
113unsigned int
114xfs_alloc_set_aside(
115	struct xfs_mount	*mp)
116{
117	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
118}
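
/*
 * Worked example (illustrative): with XFS_ALLOCBT_AGFL_RESERVE = 4, each AG
 * sets aside 4 + 4 = 8 blocks, so a filesystem with sb_agcount = 16 AGs
 * withholds 16 * 8 = 128 blocks from the incore free block count.
 */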
119
120/*
121 * When deciding how much space to allocate out of an AG, we limit the
122 * maximum allocation size to the size of the AG. However, we cannot use all the
123 * blocks in the AG - some are permanently used by metadata. These
124 * blocks are generally:
125 *	- the AG superblock, AGF, AGI and AGFL
126 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
127 *	  the AGI free inode and rmap btree root blocks.
128 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
129 *	- the rmapbt root block
130 *
131 * The AG headers are sector sized, so the amount of space they take up is
132 * dependent on filesystem geometry. The others are all single blocks.
133 */
134unsigned int
135xfs_alloc_ag_max_usable(
136	struct xfs_mount	*mp)
137{
138	unsigned int		blocks;
139
140	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
141	blocks += XFS_ALLOCBT_AGFL_RESERVE;
142	blocks += 3;			/* AGF, AGI btree root blocks */
143	if (xfs_has_finobt(mp))
144		blocks++;		/* finobt root block */
145	if (xfs_has_rmapbt(mp))
146		blocks++;		/* rmap root block */
147	if (xfs_has_reflink(mp))
148		blocks++;		/* refcount root block */
149
150	return mp->m_sb.sb_agblocks - blocks;
151}
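
/*
 * Worked example (illustrative, assuming 512-byte sectors and 4k blocks):
 * the four sector-sized headers round up to one filesystem block, so with
 * finobt, rmapbt and reflink all enabled the overhead is
 * 1 + XFS_ALLOCBT_AGFL_RESERVE + 3 + 1 + 1 + 1 = 11 blocks, and the maximum
 * usable space is sb_agblocks - 11.
 */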
152
153/*
154 * Lookup the record equal to [bno, len] in the btree given by cur.
155 */
156STATIC int				/* error */
157xfs_alloc_lookup_eq(
158	struct xfs_btree_cur	*cur,	/* btree cursor */
159	xfs_agblock_t		bno,	/* starting block of extent */
160	xfs_extlen_t		len,	/* length of extent */
161	int			*stat)	/* success/failure */
162{
163	int			error;
164
165	cur->bc_rec.a.ar_startblock = bno;
166	cur->bc_rec.a.ar_blockcount = len;
167	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
168	cur->bc_ag.abt.active = (*stat == 1);
169	return error;
170}
171
172/*
173 * Lookup the first record greater than or equal to [bno, len]
174 * in the btree given by cur.
175 */
176int				/* error */
177xfs_alloc_lookup_ge(
178	struct xfs_btree_cur	*cur,	/* btree cursor */
179	xfs_agblock_t		bno,	/* starting block of extent */
180	xfs_extlen_t		len,	/* length of extent */
181	int			*stat)	/* success/failure */
182{
183	int			error;
184
185	cur->bc_rec.a.ar_startblock = bno;
186	cur->bc_rec.a.ar_blockcount = len;
187	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
188	cur->bc_ag.abt.active = (*stat == 1);
189	return error;
190}
191
192/*
193 * Lookup the first record less than or equal to [bno, len]
194 * in the btree given by cur.
195 */
196int					/* error */
197xfs_alloc_lookup_le(
198	struct xfs_btree_cur	*cur,	/* btree cursor */
199	xfs_agblock_t		bno,	/* starting block of extent */
200	xfs_extlen_t		len,	/* length of extent */
201	int			*stat)	/* success/failure */
202{
203	int			error;
204	cur->bc_rec.a.ar_startblock = bno;
205	cur->bc_rec.a.ar_blockcount = len;
206	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
207	cur->bc_ag.abt.active = (*stat == 1);
208	return error;
209}
210
211static inline bool
212xfs_alloc_cur_active(
213	struct xfs_btree_cur	*cur)
214{
215	return cur && cur->bc_ag.abt.active;
216}
217
218/*
219 * Update the record referred to by cur to the value given
220 * by [bno, len].
221 * This either works (return 0) or gets an EFSCORRUPTED error.
222 */
223STATIC int				/* error */
224xfs_alloc_update(
225	struct xfs_btree_cur	*cur,	/* btree cursor */
226	xfs_agblock_t		bno,	/* starting block of extent */
227	xfs_extlen_t		len)	/* length of extent */
228{
229	union xfs_btree_rec	rec;
230
231	rec.alloc.ar_startblock = cpu_to_be32(bno);
232	rec.alloc.ar_blockcount = cpu_to_be32(len);
233	return xfs_btree_update(cur, &rec);
234}
235
236/* Convert the ondisk btree record to its incore representation. */
237void
238xfs_alloc_btrec_to_irec(
239	const union xfs_btree_rec	*rec,
240	struct xfs_alloc_rec_incore	*irec)
241{
242	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
243	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
244}
245
246/* Simple checks for free space records. */
247xfs_failaddr_t
248xfs_alloc_check_irec(
249	struct xfs_btree_cur		*cur,
250	const struct xfs_alloc_rec_incore *irec)
251{
252	struct xfs_perag		*pag = cur->bc_ag.pag;
253
254	if (irec->ar_blockcount == 0)
255		return __this_address;
256
257	/* check for valid extent range, including overflow */
258	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
259		return __this_address;
260
261	return NULL;
262}
263
264static inline int
265xfs_alloc_complain_bad_rec(
266	struct xfs_btree_cur		*cur,
267	xfs_failaddr_t			fa,
268	const struct xfs_alloc_rec_incore *irec)
269{
270	struct xfs_mount		*mp = cur->bc_mp;
271
272	xfs_warn(mp,
273		"%s Freespace BTree record corruption in AG %d detected at %pS!",
274		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
275		cur->bc_ag.pag->pag_agno, fa);
276	xfs_warn(mp,
277		"start block 0x%x block count 0x%x", irec->ar_startblock,
278		irec->ar_blockcount);
279	return -EFSCORRUPTED;
280}
281
282/*
283 * Get the data from the pointed-to record.
284 */
285int					/* error */
286xfs_alloc_get_rec(
287	struct xfs_btree_cur	*cur,	/* btree cursor */
288	xfs_agblock_t		*bno,	/* output: starting block of extent */
289	xfs_extlen_t		*len,	/* output: length of extent */
290	int			*stat)	/* output: success/failure */
291{
292	struct xfs_alloc_rec_incore irec;
293	union xfs_btree_rec	*rec;
294	xfs_failaddr_t		fa;
295	int			error;
296
297	error = xfs_btree_get_rec(cur, &rec, stat);
298	if (error || !(*stat))
299		return error;
300
301	xfs_alloc_btrec_to_irec(rec, &irec);
302	fa = xfs_alloc_check_irec(cur, &irec);
303	if (fa)
304		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
305
306	*bno = irec.ar_startblock;
307	*len = irec.ar_blockcount;
308	return 0;
309}
310
311/*
312 * Compute aligned version of the found extent.
313 * Takes alignment and min length into account.
314 */
315STATIC bool
316xfs_alloc_compute_aligned(
317	xfs_alloc_arg_t	*args,		/* allocation argument structure */
318	xfs_agblock_t	foundbno,	/* starting block in found extent */
319	xfs_extlen_t	foundlen,	/* length in found extent */
320	xfs_agblock_t	*resbno,	/* result block number */
321	xfs_extlen_t	*reslen,	/* result length */
322	unsigned	*busy_gen)
323{
324	xfs_agblock_t	bno = foundbno;
325	xfs_extlen_t	len = foundlen;
326	xfs_extlen_t	diff;
327	bool		busy;
328
329	/* Trim busy sections out of found extent */
330	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
331
332	/*
333	 * If we have a largish extent that happens to start before min_agbno,
334	 * see if we can shift it into range...
335	 */
336	if (bno < args->min_agbno && bno + len > args->min_agbno) {
337		diff = args->min_agbno - bno;
338		if (len > diff) {
339			bno += diff;
340			len -= diff;
341		}
342	}
343
344	if (args->alignment > 1 && len >= args->minlen) {
345		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
346
347		diff = aligned_bno - bno;
348
349		*resbno = aligned_bno;
350		*reslen = diff >= len ? 0 : len - diff;
351	} else {
352		*resbno = bno;
353		*reslen = len;
354	}
355
356	return busy;
357}
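
/*
 * Worked example (illustrative): with args->alignment = 8 and a found extent
 * [bno = 100, len = 50], the start rounds up to 104, so the aligned result is
 * [104, 46].  If the alignment overhead consumed the whole extent
 * (diff >= len), the result length would be 0 and the caller would skip it.
 */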
358
359/*
360 * Compute best start block and diff for "near" allocations.
361 * freelen >= wantlen already checked by caller.
362 */
363STATIC xfs_extlen_t			/* difference value (absolute) */
364xfs_alloc_compute_diff(
365	xfs_agblock_t	wantbno,	/* target starting block */
366	xfs_extlen_t	wantlen,	/* target length */
367	xfs_extlen_t	alignment,	/* target alignment */
368	int		datatype,	/* are we allocating data? */
369	xfs_agblock_t	freebno,	/* freespace's starting block */
370	xfs_extlen_t	freelen,	/* freespace's length */
371	xfs_agblock_t	*newbnop)	/* result: best start block from free */
372{
373	xfs_agblock_t	freeend;	/* end of freespace extent */
374	xfs_agblock_t	newbno1;	/* return block number */
375	xfs_agblock_t	newbno2;	/* other new block number */
376	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
377	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
378	xfs_agblock_t	wantend;	/* end of target extent */
379	bool		userdata = datatype & XFS_ALLOC_USERDATA;
380
381	ASSERT(freelen >= wantlen);
382	freeend = freebno + freelen;
383	wantend = wantbno + wantlen;
384	/*
385	 * We want to allocate from the start of a free extent if it is past
386	 * the desired block or if we are allocating user data and the free
387	 * extent is before desired block. The second case is there to allow
388	 * for contiguous allocation from the remaining free space if the file
389	 * grows in the short term.
390	 */
391	if (freebno >= wantbno || (userdata && freeend < wantend)) {
392		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
393			newbno1 = NULLAGBLOCK;
394	} else if (freeend >= wantend && alignment > 1) {
395		newbno1 = roundup(wantbno, alignment);
396		newbno2 = newbno1 - alignment;
397		if (newbno1 >= freeend)
398			newbno1 = NULLAGBLOCK;
399		else
400			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
401		if (newbno2 < freebno)
402			newbno2 = NULLAGBLOCK;
403		else
404			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
405		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
406			if (newlen1 < newlen2 ||
407			    (newlen1 == newlen2 &&
408			     XFS_ABSDIFF(newbno1, wantbno) >
409			     XFS_ABSDIFF(newbno2, wantbno)))
410				newbno1 = newbno2;
411		} else if (newbno2 != NULLAGBLOCK)
412			newbno1 = newbno2;
413	} else if (freeend >= wantend) {
414		newbno1 = wantbno;
415	} else if (alignment > 1) {
416		newbno1 = roundup(freeend - wantlen, alignment);
417		if (newbno1 > freeend - wantlen &&
418		    newbno1 - alignment >= freebno)
419			newbno1 -= alignment;
420		else if (newbno1 >= freeend)
421			newbno1 = NULLAGBLOCK;
422	} else
423		newbno1 = freeend - wantlen;
424	*newbnop = newbno1;
425	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
426}
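
/*
 * Worked example (illustrative, alignment = 1): with a target of
 * [wantbno = 1000, wantlen = 10] and free space [freebno = 980, freelen = 40],
 * freeend = 1020 covers wantend = 1010, so the allocation is placed at
 * wantbno itself and the returned diff is 0.  If instead freebno = 1020, the
 * free extent starts past the target, so we allocate from freebno and the
 * diff is 1020 - 1000 = 20.
 */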
427
428/*
429 * Fix up the length, based on mod and prod.
430 * len should be k * prod + mod for some k.
431 * If len is too small it is returned unchanged.
432 * If len hits maxlen it is left alone.
433 */
434STATIC void
435xfs_alloc_fix_len(
436	xfs_alloc_arg_t	*args)		/* allocation argument structure */
437{
438	xfs_extlen_t	k;
439	xfs_extlen_t	rlen;
440
441	ASSERT(args->mod < args->prod);
442	rlen = args->len;
443	ASSERT(rlen >= args->minlen);
444	ASSERT(rlen <= args->maxlen);
445	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
446	    (args->mod == 0 && rlen < args->prod))
447		return;
448	k = rlen % args->prod;
449	if (k == args->mod)
450		return;
451	if (k > args->mod)
452		rlen = rlen - (k - args->mod);
453	else
454		rlen = rlen - args->prod + (args->mod - k);
455	/* casts to (int) catch length underflows */
456	if ((int)rlen < (int)args->minlen)
457		return;
458	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
459	ASSERT(rlen % args->prod == args->mod);
460	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
461		rlen + args->minleft);
462	args->len = rlen;
463}
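
/*
 * Worked example (illustrative): with prod = 4, mod = 1 and a candidate
 * length of 14, k = 14 % 4 = 2 > mod, so the length is trimmed to
 * 14 - (2 - 1) = 13, and 13 % 4 == 1 as required.  With a candidate length
 * of 9, k = 1 == mod and the length is left alone.
 */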
464
465/*
466 * Update the two btrees, logically removing from freespace the extent
467 * starting at rbno, rlen blocks.  The extent is contained within the
468 * actual (current) free extent fbno for flen blocks.
469 * Flags are passed in indicating whether the cursors are set to the
470 * relevant records.
471 */
472STATIC int				/* error code */
473xfs_alloc_fixup_trees(
474	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
475	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
476	xfs_agblock_t	fbno,		/* starting block of free extent */
477	xfs_extlen_t	flen,		/* length of free extent */
478	xfs_agblock_t	rbno,		/* starting block of returned extent */
479	xfs_extlen_t	rlen,		/* length of returned extent */
480	int		flags)		/* flags, XFSA_FIXUP_... */
481{
482	int		error;		/* error code */
483	int		i;		/* operation results */
484	xfs_agblock_t	nfbno1;		/* first new free startblock */
485	xfs_agblock_t	nfbno2;		/* second new free startblock */
486	xfs_extlen_t	nflen1=0;	/* first new free length */
487	xfs_extlen_t	nflen2=0;	/* second new free length */
488	struct xfs_mount *mp;
489
490	mp = cnt_cur->bc_mp;
491
492	/*
493	 * Look up the record in the by-size tree if necessary.
494	 */
495	if (flags & XFSA_FIXUP_CNT_OK) {
496#ifdef DEBUG
497		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
498			return error;
499		if (XFS_IS_CORRUPT(mp,
500				   i != 1 ||
501				   nfbno1 != fbno ||
502				   nflen1 != flen))
503			return -EFSCORRUPTED;
504#endif
505	} else {
506		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
507			return error;
508		if (XFS_IS_CORRUPT(mp, i != 1))
509			return -EFSCORRUPTED;
510	}
511	/*
512	 * Look up the record in the by-block tree if necessary.
513	 */
514	if (flags & XFSA_FIXUP_BNO_OK) {
515#ifdef DEBUG
516		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
517			return error;
518		if (XFS_IS_CORRUPT(mp,
519				   i != 1 ||
520				   nfbno1 != fbno ||
521				   nflen1 != flen))
522			return -EFSCORRUPTED;
523#endif
524	} else {
525		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
526			return error;
527		if (XFS_IS_CORRUPT(mp, i != 1))
528			return -EFSCORRUPTED;
529	}
530
531#ifdef DEBUG
532	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
533		struct xfs_btree_block	*bnoblock;
534		struct xfs_btree_block	*cntblock;
535
536		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
537		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
538
539		if (XFS_IS_CORRUPT(mp,
540				   bnoblock->bb_numrecs !=
541				   cntblock->bb_numrecs))
542			return -EFSCORRUPTED;
543	}
544#endif
545
546	/*
547	 * Deal with all four cases: the allocated record is contained
548	 * within the freespace record, so we can have new freespace
549	 * at either (or both) end, or no freespace remaining.
550	 */
551	if (rbno == fbno && rlen == flen)
552		nfbno1 = nfbno2 = NULLAGBLOCK;
553	else if (rbno == fbno) {
554		nfbno1 = rbno + rlen;
555		nflen1 = flen - rlen;
556		nfbno2 = NULLAGBLOCK;
557	} else if (rbno + rlen == fbno + flen) {
558		nfbno1 = fbno;
559		nflen1 = flen - rlen;
560		nfbno2 = NULLAGBLOCK;
561	} else {
562		nfbno1 = fbno;
563		nflen1 = rbno - fbno;
564		nfbno2 = rbno + rlen;
565		nflen2 = (fbno + flen) - nfbno2;
566	}
567	/*
568	 * Delete the entry from the by-size btree.
569	 */
570	if ((error = xfs_btree_delete(cnt_cur, &i)))
571		return error;
572	if (XFS_IS_CORRUPT(mp, i != 1))
573		return -EFSCORRUPTED;
574	/*
575	 * Add new by-size btree entry(s).
576	 */
577	if (nfbno1 != NULLAGBLOCK) {
578		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
579			return error;
580		if (XFS_IS_CORRUPT(mp, i != 0))
581			return -EFSCORRUPTED;
582		if ((error = xfs_btree_insert(cnt_cur, &i)))
583			return error;
584		if (XFS_IS_CORRUPT(mp, i != 1))
585			return -EFSCORRUPTED;
586	}
587	if (nfbno2 != NULLAGBLOCK) {
588		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
589			return error;
590		if (XFS_IS_CORRUPT(mp, i != 0))
591			return -EFSCORRUPTED;
592		if ((error = xfs_btree_insert(cnt_cur, &i)))
593			return error;
594		if (XFS_IS_CORRUPT(mp, i != 1))
595			return -EFSCORRUPTED;
596	}
597	/*
598	 * Fix up the by-block btree entry(s).
599	 */
600	if (nfbno1 == NULLAGBLOCK) {
601		/*
602		 * No remaining freespace, just delete the by-block tree entry.
603		 */
604		if ((error = xfs_btree_delete(bno_cur, &i)))
605			return error;
606		if (XFS_IS_CORRUPT(mp, i != 1))
607			return -EFSCORRUPTED;
608	} else {
609		/*
610		 * Update the by-block entry to start later|be shorter.
611		 */
612		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
613			return error;
614	}
615	if (nfbno2 != NULLAGBLOCK) {
616		/*
617		 * 2 resulting free entries, need to add one.
618		 */
619		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
620			return error;
621		if (XFS_IS_CORRUPT(mp, i != 0))
622			return -EFSCORRUPTED;
623		if ((error = xfs_btree_insert(bno_cur, &i)))
624			return error;
625		if (XFS_IS_CORRUPT(mp, i != 1))
626			return -EFSCORRUPTED;
627	}
628	return 0;
629}
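
/*
 * Worked example (illustrative) of the four cases above: carving
 * [rbno = 110, rlen = 20] out of free extent [fbno = 100, flen = 50] leaves
 * two remainders, [100, 10] before and [130, 20] after.  An allocation at the
 * start (rbno = 100, rlen = 20) leaves only [120, 30]; one at the end
 * (rbno = 130, rlen = 20) leaves only [100, 30]; and rbno == fbno with
 * rlen == flen consumes the record entirely.
 */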
630
631/*
632 * We do not verify the AGFL contents against AGF-based index counters here,
633 * even though we may have access to the perag that contains shadow copies. We
634 * don't know if the AGF-based counters have been checked, and if they have,
635 * they may still be inconsistent because they haven't yet been reset on the
636 * first allocation after the AGF has been read in.
637 *
638 * This means we can only check that all agfl entries contain valid or null
639 * values because we can't reliably determine the active range to exclude
640 * NULLAGBNO as a valid value.
641 *
642 * However, we can't even do that for v4 format filesystems because there are
643 * old versions of mkfs out there that do not initialise the AGFL to known,
644 * verifiable values. Hence we can't tell the difference between an AGFL block
645 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
646 *
647 * As a result, we can only fully validate AGFL block numbers when we pull them
648 * from the freelist in xfs_alloc_get_freelist().
649 */
650static xfs_failaddr_t
651xfs_agfl_verify(
652	struct xfs_buf	*bp)
653{
654	struct xfs_mount *mp = bp->b_mount;
655	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
656	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
657	int		i;
658
659	if (!xfs_has_crc(mp))
660		return NULL;
661
662	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
663		return __this_address;
664	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
665		return __this_address;
666	/*
667	 * during growfs operations, the perag is not fully initialised,
668	 * so we can't use it for any useful checking. growfs ensures we can't
669	 * use it by using uncached buffers that don't have the perag attached
670	 * so we can detect and avoid this problem.
671	 */
672	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
673		return __this_address;
674
675	for (i = 0; i < xfs_agfl_size(mp); i++) {
676		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
677		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
678			return __this_address;
679	}
680
681	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
682		return __this_address;
683	return NULL;
684}
685
686static void
687xfs_agfl_read_verify(
688	struct xfs_buf	*bp)
689{
690	struct xfs_mount *mp = bp->b_mount;
691	xfs_failaddr_t	fa;
692
693	/*
694	 * There is no verification of non-crc AGFLs because mkfs does not
695	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
696	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
697	 * can't verify just those entries are valid.
698	 */
699	if (!xfs_has_crc(mp))
700		return;
701
702	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
703		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
704	else {
705		fa = xfs_agfl_verify(bp);
706		if (fa)
707			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
708	}
709}
710
711static void
712xfs_agfl_write_verify(
713	struct xfs_buf	*bp)
714{
715	struct xfs_mount	*mp = bp->b_mount;
716	struct xfs_buf_log_item	*bip = bp->b_log_item;
717	xfs_failaddr_t		fa;
718
719	/* no verification of non-crc AGFLs */
720	if (!xfs_has_crc(mp))
721		return;
722
723	fa = xfs_agfl_verify(bp);
724	if (fa) {
725		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
726		return;
727	}
728
729	if (bip)
730		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
731
732	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
733}
734
735const struct xfs_buf_ops xfs_agfl_buf_ops = {
736	.name = "xfs_agfl",
737	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
738	.verify_read = xfs_agfl_read_verify,
739	.verify_write = xfs_agfl_write_verify,
740	.verify_struct = xfs_agfl_verify,
741};
742
743/*
744 * Read in the allocation group free block array.
745 */
746int
747xfs_alloc_read_agfl(
748	struct xfs_perag	*pag,
749	struct xfs_trans	*tp,
750	struct xfs_buf		**bpp)
751{
752	struct xfs_mount	*mp = pag->pag_mount;
753	struct xfs_buf		*bp;
754	int			error;
755
756	error = xfs_trans_read_buf(
757			mp, tp, mp->m_ddev_targp,
758			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
759			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
760	if (error)
761		return error;
762	xfs_buf_set_ref(bp, XFS_AGFL_REF);
763	*bpp = bp;
764	return 0;
765}
766
767STATIC int
768xfs_alloc_update_counters(
769	struct xfs_trans	*tp,
770	struct xfs_buf		*agbp,
771	long			len)
772{
773	struct xfs_agf		*agf = agbp->b_addr;
774
775	agbp->b_pag->pagf_freeblks += len;
776	be32_add_cpu(&agf->agf_freeblks, len);
777
778	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
779		     be32_to_cpu(agf->agf_length))) {
780		xfs_buf_mark_corrupt(agbp);
781		return -EFSCORRUPTED;
782	}
783
784	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
785	return 0;
786}
787
788/*
789 * Block allocation algorithm and data structures.
790 */
791struct xfs_alloc_cur {
792	struct xfs_btree_cur		*cnt;	/* btree cursors */
793	struct xfs_btree_cur		*bnolt;
794	struct xfs_btree_cur		*bnogt;
795	xfs_extlen_t			cur_len;/* current search length */
796	xfs_agblock_t			rec_bno;/* extent startblock */
797	xfs_extlen_t			rec_len;/* extent length */
798	xfs_agblock_t			bno;	/* alloc bno */
799	xfs_extlen_t			len;	/* alloc len */
800	xfs_extlen_t			diff;	/* diff from search bno */
801	unsigned int			busy_gen;/* busy state */
802	bool				busy;
803};
804
805/*
806 * Set up cursors, etc. in the extent allocation cursor. This function can be
807 * called multiple times to reset an initialized structure without having to
808 * reallocate cursors.
809 */
810static int
811xfs_alloc_cur_setup(
812	struct xfs_alloc_arg	*args,
813	struct xfs_alloc_cur	*acur)
814{
815	int			error;
816	int			i;
817
818	acur->cur_len = args->maxlen;
819	acur->rec_bno = 0;
820	acur->rec_len = 0;
821	acur->bno = 0;
822	acur->len = 0;
823	acur->diff = -1;
824	acur->busy = false;
825	acur->busy_gen = 0;
826
827	/*
828	 * Perform an initial cntbt lookup to check for availability of maxlen
829	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
830	 * attempt a small allocation.
831	 */
832	if (!acur->cnt)
833		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
834					args->agbp, args->pag, XFS_BTNUM_CNT);
835	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
836	if (error)
837		return error;
838
839	/*
840	 * Allocate the bnobt left and right search cursors.
841	 */
842	if (!acur->bnolt)
843		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
844					args->agbp, args->pag, XFS_BTNUM_BNO);
845	if (!acur->bnogt)
846		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
847					args->agbp, args->pag, XFS_BTNUM_BNO);
848	return i == 1 ? 0 : -ENOSPC;
849}
850
851static void
852xfs_alloc_cur_close(
853	struct xfs_alloc_cur	*acur,
854	bool			error)
855{
856	int			cur_error = XFS_BTREE_NOERROR;
857
858	if (error)
859		cur_error = XFS_BTREE_ERROR;
860
861	if (acur->cnt)
862		xfs_btree_del_cursor(acur->cnt, cur_error);
863	if (acur->bnolt)
864		xfs_btree_del_cursor(acur->bnolt, cur_error);
865	if (acur->bnogt)
866		xfs_btree_del_cursor(acur->bnogt, cur_error);
867	acur->cnt = acur->bnolt = acur->bnogt = NULL;
868}
869
870/*
871 * Check an extent for allocation and track the best available candidate in the
872 * allocation structure. The cursor is deactivated if it has entered an out of
873 * range state based on allocation arguments. Optionally return the extent
874 * geometry and allocation status if requested by the caller.
875 */
876static int
877xfs_alloc_cur_check(
878	struct xfs_alloc_arg	*args,
879	struct xfs_alloc_cur	*acur,
880	struct xfs_btree_cur	*cur,
881	int			*new)
882{
883	int			error, i;
884	xfs_agblock_t		bno, bnoa, bnew;
885	xfs_extlen_t		len, lena, diff = -1;
886	bool			busy;
887	unsigned		busy_gen = 0;
888	bool			deactivate = false;
889	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
890
891	*new = 0;
892
893	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
894	if (error)
895		return error;
896	if (XFS_IS_CORRUPT(args->mp, i != 1))
897		return -EFSCORRUPTED;
898
899	/*
900	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
901	 * range (i.e., walking backwards looking for a minlen extent).
902	 */
903	if (len < args->minlen) {
904		deactivate = !isbnobt;
905		goto out;
906	}
907
908	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
909					 &busy_gen);
910	acur->busy |= busy;
911	if (busy)
912		acur->busy_gen = busy_gen;
913	/* deactivate a bnobt cursor outside of locality range */
914	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
915		deactivate = isbnobt;
916		goto out;
917	}
918	if (lena < args->minlen)
919		goto out;
920
921	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
922	xfs_alloc_fix_len(args);
923	ASSERT(args->len >= args->minlen);
924	if (args->len < acur->len)
925		goto out;
926
927	/*
928	 * We have an aligned record that satisfies minlen and beats or matches
929	 * the candidate extent size. Compare locality for near allocation mode.
930	 */
931	diff = xfs_alloc_compute_diff(args->agbno, args->len,
932				      args->alignment, args->datatype,
933				      bnoa, lena, &bnew);
934	if (bnew == NULLAGBLOCK)
935		goto out;
936
937	/*
938	 * Deactivate a bnobt cursor with worse locality than the current best.
939	 */
940	if (diff > acur->diff) {
941		deactivate = isbnobt;
942		goto out;
943	}
944
945	ASSERT(args->len > acur->len ||
946	       (args->len == acur->len && diff <= acur->diff));
947	acur->rec_bno = bno;
948	acur->rec_len = len;
949	acur->bno = bnew;
950	acur->len = args->len;
951	acur->diff = diff;
952	*new = 1;
953
954	/*
955	 * We're done if we found a perfect allocation. This only deactivates
956	 * the current cursor, but this is just an optimization to terminate a
957	 * cntbt search that otherwise runs to the edge of the tree.
958	 */
959	if (acur->diff == 0 && acur->len == args->maxlen)
960		deactivate = true;
961out:
962	if (deactivate)
963		cur->bc_ag.abt.active = false;
964	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
965				  *new);
966	return 0;
967}
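
/*
 * Illustrative summary of the ranking above: a record only becomes the new
 * best candidate if it is at least as long as the current best and its
 * locality diff is no larger.  E.g. with a current best of len = 8, diff = 2,
 * a 16-block extent at diff = 2 wins, but a 16-block extent at diff = 40 is
 * skipped (and, when found via a bnobt cursor, ends that direction of the
 * search).
 */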
968
969/*
970 * Complete an allocation of a candidate extent. Remove the extent from both
971 * trees and update the args structure.
972 */
973STATIC int
974xfs_alloc_cur_finish(
975	struct xfs_alloc_arg	*args,
976	struct xfs_alloc_cur	*acur)
977{
978	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
979	int			error;
980
981	ASSERT(acur->cnt && acur->bnolt);
982	ASSERT(acur->bno >= acur->rec_bno);
983	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
984	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
985
986	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
987				      acur->rec_len, acur->bno, acur->len, 0);
988	if (error)
989		return error;
990
991	args->agbno = acur->bno;
992	args->len = acur->len;
993	args->wasfromfl = 0;
994
995	trace_xfs_alloc_cur(args);
996	return 0;
997}
998
999/*
1000 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
1001 * bno optimized lookup to search for extents with ideal size and locality.
1002 */
1003STATIC int
1004xfs_alloc_cntbt_iter(
1005	struct xfs_alloc_arg		*args,
1006	struct xfs_alloc_cur		*acur)
1007{
1008	struct xfs_btree_cur	*cur = acur->cnt;
1009	xfs_agblock_t		bno;
1010	xfs_extlen_t		len, cur_len;
1011	int			error;
1012	int			i;
1013
1014	if (!xfs_alloc_cur_active(cur))
1015		return 0;
1016
1017	/* locality optimized lookup */
1018	cur_len = acur->cur_len;
1019	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1020	if (error)
1021		return error;
1022	if (i == 0)
1023		return 0;
1024	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1025	if (error)
1026		return error;
1027
1028	/* check the current record and update search length from it */
1029	error = xfs_alloc_cur_check(args, acur, cur, &i);
1030	if (error)
1031		return error;
1032	ASSERT(len >= acur->cur_len);
1033	acur->cur_len = len;
1034
1035	/*
1036	 * We looked up the first record >= [agbno, len] above. The agbno is a
1037	 * secondary key and so the current record may lie just before or after
1038	 * agbno. If it is past agbno, check the previous record too so long as
1039	 * the length matches as it may be closer. Don't check a smaller record
1040	 * because that could deactivate our cursor.
1041	 */
1042	if (bno > args->agbno) {
1043		error = xfs_btree_decrement(cur, 0, &i);
1044		if (!error && i) {
1045			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1046			if (!error && i && len == acur->cur_len)
1047				error = xfs_alloc_cur_check(args, acur, cur,
1048							    &i);
1049		}
1050		if (error)
1051			return error;
1052	}
1053
1054	/*
1055	 * Increment the search key if we haven't yet found an allocation
1056	 * candidate or if the extent we just found was already larger than the
1057	 * doubled key. Otherwise, double the search key to optimize the search.
1058	 * Efficiency is more important here than absolute best locality.
1059	 */
1060	cur_len <<= 1;
1061	if (!acur->len || acur->cur_len >= cur_len)
1062		acur->cur_len++;
1063	else
1064		acur->cur_len = cur_len;
1065
1066	return error;
1067}
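
/*
 * Rough sketch of the key progression (not derived from the code comments):
 * suppose the search key starts at 8 blocks.  Each pass bumps the key to at
 * least the length of the record it just examined; while no allocation
 * candidate is held the key then only advances by one, but once a candidate
 * exists it roughly doubles (8, 16, 32, ...), so the cntbt walk terminates
 * quickly rather than visiting every size class.
 */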
1068
1069/*
1070 * Deal with the case where only small freespaces remain. Either return the
1071 * contents of the last freespace record, or allocate space from the freelist if
1072 * there is nothing in the tree.
1073 */
1074STATIC int			/* error */
1075xfs_alloc_ag_vextent_small(
1076	struct xfs_alloc_arg	*args,	/* allocation argument structure */
1077	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
1078	xfs_agblock_t		*fbnop,	/* result block number */
1079	xfs_extlen_t		*flenp,	/* result length */
1080	int			*stat)	/* status: 0-freelist, 1-normal/none */
1081{
1082	struct xfs_agf		*agf = args->agbp->b_addr;
1083	int			error = 0;
1084	xfs_agblock_t		fbno = NULLAGBLOCK;
1085	xfs_extlen_t		flen = 0;
1086	int			i = 0;
1087
1088	/*
1089	 * If a cntbt cursor is provided, try to allocate the largest record in
1090	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1091	 * allocation. Make sure to respect minleft even when pulling from the
1092	 * freelist.
1093	 */
1094	if (ccur)
1095		error = xfs_btree_decrement(ccur, 0, &i);
1096	if (error)
1097		goto error;
1098	if (i) {
1099		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1100		if (error)
1101			goto error;
1102		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1103			error = -EFSCORRUPTED;
1104			goto error;
1105		}
1106		goto out;
1107	}
1108
1109	if (args->minlen != 1 || args->alignment != 1 ||
1110	    args->resv == XFS_AG_RESV_AGFL ||
1111	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
1112		goto out;
1113
1114	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1115			&fbno, 0);
1116	if (error)
1117		goto error;
1118	if (fbno == NULLAGBLOCK)
1119		goto out;
1120
1121	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1122			      (args->datatype & XFS_ALLOC_NOBUSY));
1123
1124	if (args->datatype & XFS_ALLOC_USERDATA) {
1125		struct xfs_buf	*bp;
1126
1127		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1128				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1129				args->mp->m_bsize, 0, &bp);
1130		if (error)
1131			goto error;
1132		xfs_trans_binval(args->tp, bp);
1133	}
1134	*fbnop = args->agbno = fbno;
1135	*flenp = args->len = 1;
1136	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1137		error = -EFSCORRUPTED;
1138		goto error;
1139	}
1140	args->wasfromfl = 1;
1141	trace_xfs_alloc_small_freelist(args);
1142
1143	/*
1144	 * If we're feeding an AGFL block to something that doesn't live in the
1145	 * free space, we need to clear out the OWN_AG rmap.
1146	 */
1147	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1148			      &XFS_RMAP_OINFO_AG);
1149	if (error)
1150		goto error;
1151
1152	*stat = 0;
1153	return 0;
1154
1155out:
1156	/*
1157	 * Can't do the allocation, give up.
1158	 */
1159	if (flen < args->minlen) {
1160		args->agbno = NULLAGBLOCK;
1161		trace_xfs_alloc_small_notenough(args);
1162		flen = 0;
1163	}
1164	*fbnop = fbno;
1165	*flenp = flen;
1166	*stat = 1;
1167	trace_xfs_alloc_small_done(args);
1168	return 0;
1169
1170error:
1171	trace_xfs_alloc_small_error(args);
1172	return error;
1173}
1174
1175/*
1176 * Allocate a variable extent at exactly agno/bno.
1177 * Extent's length (returned in *len) will be between minlen and maxlen,
1178 * and of the form k * prod + mod unless there's nothing that large.
1179 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1180 */
1181STATIC int			/* error */
1182xfs_alloc_ag_vextent_exact(
1183	xfs_alloc_arg_t	*args)	/* allocation argument structure */
1184{
1185	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1186	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1187	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1188	int		error;
1189	xfs_agblock_t	fbno;	/* start block of found extent */
1190	xfs_extlen_t	flen;	/* length of found extent */
1191	xfs_agblock_t	tbno;	/* start block of busy extent */
1192	xfs_extlen_t	tlen;	/* length of busy extent */
1193	xfs_agblock_t	tend;	/* end block of busy extent */
1194	int		i;	/* success/failure of operation */
1195	unsigned	busy_gen;
1196
1197	ASSERT(args->alignment == 1);
1198
1199	/*
1200	 * Allocate/initialize a cursor for the by-number freespace btree.
1201	 */
1202	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1203					  args->pag, XFS_BTNUM_BNO);
1204
1205	/*
1206	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1207	 * Look for the closest free block <= bno, it must contain bno
1208	 * if any free block does.
1209	 */
1210	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1211	if (error)
1212		goto error0;
1213	if (!i)
1214		goto not_found;
1215
1216	/*
1217	 * Grab the freespace record.
1218	 */
1219	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1220	if (error)
1221		goto error0;
1222	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1223		error = -EFSCORRUPTED;
1224		goto error0;
1225	}
1226	ASSERT(fbno <= args->agbno);
1227
1228	/*
1229	 * Check for overlapping busy extents.
1230	 */
1231	tbno = fbno;
1232	tlen = flen;
1233	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1234
1235	/*
1236	 * Give up if the start of the extent is busy, or the freespace isn't
1237	 * long enough for the minimum request.
1238	 */
1239	if (tbno > args->agbno)
1240		goto not_found;
1241	if (tlen < args->minlen)
1242		goto not_found;
1243	tend = tbno + tlen;
1244	if (tend < args->agbno + args->minlen)
1245		goto not_found;
1246
1247	/*
1248	 * End of extent will be smaller of the freespace end and the
1249	 * maximal requested end.
1250	 *
1251	 * Fix the length according to mod and prod if given.
1252	 */
1253	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1254						- args->agbno;
1255	xfs_alloc_fix_len(args);
1256	ASSERT(args->agbno + args->len <= tend);
1257
1258	/*
1259	 * We are allocating agbno for args->len
1260	 * Allocate/initialize a cursor for the by-size btree.
1261	 */
1262	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1263					args->pag, XFS_BTNUM_CNT);
1264	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1265	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1266				      args->len, XFSA_FIXUP_BNO_OK);
1267	if (error) {
1268		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1269		goto error0;
1270	}
1271
1272	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1273	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1274
1275	args->wasfromfl = 0;
1276	trace_xfs_alloc_exact_done(args);
1277	return 0;
1278
1279not_found:
1280	/* Didn't find it, return null. */
1281	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1282	args->agbno = NULLAGBLOCK;
1283	trace_xfs_alloc_exact_notfound(args);
1284	return 0;
1285
1286error0:
1287	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1288	trace_xfs_alloc_exact_error(args);
1289	return error;
1290}
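
/*
 * Worked example (illustrative): a request for agbno = 200, minlen = 4,
 * maxlen = 16 that finds the free extent [190, 40] (ending at 230) is
 * trimmed to min(230, 200 + 16) - 200 = 16 blocks at agbno 200.  If a busy
 * range covered blocks around 195-205, the trimmed start would land past
 * agbno and the allocation would return NULLAGBLOCK instead.
 */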
1291
1292/*
1293 * Search a given number of btree records in a given direction. Check each
1294 * record against the good extent we've already found.
1295 */
1296STATIC int
1297xfs_alloc_walk_iter(
1298	struct xfs_alloc_arg	*args,
1299	struct xfs_alloc_cur	*acur,
1300	struct xfs_btree_cur	*cur,
1301	bool			increment,
1302	bool			find_one, /* quit on first candidate */
1303	int			count,    /* rec count (-1 for infinite) */
1304	int			*stat)
1305{
1306	int			error;
1307	int			i;
1308
1309	*stat = 0;
1310
1311	/*
1312	 * Search so long as the cursor is active or we find a better extent.
1313	 * The cursor is deactivated if it extends beyond the range of the
1314	 * current allocation candidate.
1315	 */
1316	while (xfs_alloc_cur_active(cur) && count) {
1317		error = xfs_alloc_cur_check(args, acur, cur, &i);
1318		if (error)
1319			return error;
1320		if (i == 1) {
1321			*stat = 1;
1322			if (find_one)
1323				break;
1324		}
1325		if (!xfs_alloc_cur_active(cur))
1326			break;
1327
1328		if (increment)
1329			error = xfs_btree_increment(cur, 0, &i);
1330		else
1331			error = xfs_btree_decrement(cur, 0, &i);
1332		if (error)
1333			return error;
1334		if (i == 0)
1335			cur->bc_ag.abt.active = false;
1336
1337		if (count > 0)
1338			count--;
1339	}
1340
1341	return 0;
1342}
1343
1344/*
1345 * Search the by-bno and by-size btrees in parallel in search of an extent with
1346 * ideal locality based on the NEAR mode ->agbno locality hint.
1347 */
1348STATIC int
1349xfs_alloc_ag_vextent_locality(
1350	struct xfs_alloc_arg	*args,
1351	struct xfs_alloc_cur	*acur,
1352	int			*stat)
1353{
1354	struct xfs_btree_cur	*fbcur = NULL;
1355	int			error;
1356	int			i;
1357	bool			fbinc;
1358
1359	ASSERT(acur->len == 0);
1360
1361	*stat = 0;
1362
1363	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1364	if (error)
1365		return error;
1366	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1367	if (error)
1368		return error;
1369	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1370	if (error)
1371		return error;
1372
1373	/*
1374	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1375	 * right and lookup the closest extent to the locality hint for each
1376	 * extent size key in the cntbt. The entire search terminates
1377	 * immediately on a bnobt hit because that means we've found best case
1378	 * locality. Otherwise the search continues until the cntbt cursor runs
1379	 * off the end of the tree. If no allocation candidate is found at this
1380	 * point, give up on locality, walk backwards from the end of the cntbt
1381	 * and take the first available extent.
1382	 *
1383	 * The parallel tree searches balance each other out to provide fairly
1384	 * consistent performance for various situations. The bnobt search can
1385	 * have pathological behavior in the worst case scenario of larger
1386	 * allocation requests and fragmented free space. On the other hand, the
1387	 * bnobt is able to satisfy most smaller allocation requests much more
1388	 * quickly than the cntbt. The cntbt search can sift through fragmented
1389	 * free space and sets of free extents for larger allocation requests
1390	 * more quickly than the bnobt. Since the locality hint is just a hint
1391	 * and we don't want to scan the entire bnobt for perfect locality, the
1392	 * cntbt search essentially bounds the bnobt search such that we can
1393	 * find good enough locality at reasonable performance in most cases.
1394	 */
1395	while (xfs_alloc_cur_active(acur->bnolt) ||
1396	       xfs_alloc_cur_active(acur->bnogt) ||
1397	       xfs_alloc_cur_active(acur->cnt)) {
1398
1399		trace_xfs_alloc_cur_lookup(args);
1400
1401		/*
1402		 * Search the bnobt left and right. In the case of a hit, finish
1403		 * the search in the opposite direction and we're done.
1404		 */
1405		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1406					    true, 1, &i);
1407		if (error)
1408			return error;
1409		if (i == 1) {
1410			trace_xfs_alloc_cur_left(args);
1411			fbcur = acur->bnogt;
1412			fbinc = true;
1413			break;
1414		}
1415		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1416					    1, &i);
1417		if (error)
1418			return error;
1419		if (i == 1) {
1420			trace_xfs_alloc_cur_right(args);
1421			fbcur = acur->bnolt;
1422			fbinc = false;
1423			break;
1424		}
1425
1426		/*
1427		 * Check the extent with best locality based on the current
1428		 * extent size search key and keep track of the best candidate.
1429		 */
1430		error = xfs_alloc_cntbt_iter(args, acur);
1431		if (error)
1432			return error;
1433		if (!xfs_alloc_cur_active(acur->cnt)) {
1434			trace_xfs_alloc_cur_lookup_done(args);
1435			break;
1436		}
1437	}
1438
1439	/*
1440	 * If we failed to find anything due to busy extents, return empty
1441	 * handed so the caller can flush and retry. If no busy extents were
1442	 * found, walk backwards from the end of the cntbt as a last resort.
1443	 */
1444	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1445		error = xfs_btree_decrement(acur->cnt, 0, &i);
1446		if (error)
1447			return error;
1448		if (i) {
1449			acur->cnt->bc_ag.abt.active = true;
1450			fbcur = acur->cnt;
1451			fbinc = false;
1452		}
1453	}
1454
1455	/*
1456	 * Search in the opposite direction for a better entry in the case of
1457	 * a bnobt hit or walk backwards from the end of the cntbt.
1458	 */
1459	if (fbcur) {
1460		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1461					    &i);
1462		if (error)
1463			return error;
1464	}
1465
1466	if (acur->len)
1467		*stat = 1;
1468
1469	return 0;
1470}
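
/*
 * Illustrative walk (not from the source): with a locality hint of agbno
 * 5000, the bnolt cursor steps left from the record just below 5000, the
 * bnogt cursor steps right from the record just above it, and the cntbt
 * cursor jumps between size classes near the hint.  A hit on either bnobt
 * side breaks out of the combined loop, after which the opposite side is
 * walked once more for a closer match; otherwise the best cntbt candidate
 * (or, failing that, a backwards walk from the end of the cntbt) is used.
 */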
1471
1472/* Check the last block of the cnt btree for allocations. */
1473static int
1474xfs_alloc_ag_vextent_lastblock(
1475	struct xfs_alloc_arg	*args,
1476	struct xfs_alloc_cur	*acur,
1477	xfs_agblock_t		*bno,
1478	xfs_extlen_t		*len,
1479	bool			*allocated)
1480{
1481	int			error;
1482	int			i;
1483
1484#ifdef DEBUG
1485	/* Randomly don't execute the first algorithm. */
1486	if (get_random_u32_below(2))
1487		return 0;
1488#endif
1489
1490	/*
1491	 * Start from the entry that lookup found, sequence through all larger
1492	 * free blocks.  If we're actually pointing at a record smaller than
1493	 * maxlen, go to the start of this block, and skip all those smaller
1494	 * than minlen.
1495	 */
1496	if (*len || args->alignment > 1) {
1497		acur->cnt->bc_levels[0].ptr = 1;
1498		do {
1499			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1500			if (error)
1501				return error;
1502			if (XFS_IS_CORRUPT(args->mp, i != 1))
1503				return -EFSCORRUPTED;
1504			if (*len >= args->minlen)
1505				break;
1506			error = xfs_btree_increment(acur->cnt, 0, &i);
1507			if (error)
1508				return error;
1509		} while (i);
1510		ASSERT(*len >= args->minlen);
1511		if (!i)
1512			return 0;
1513	}
1514
1515	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1516	if (error)
1517		return error;
1518
1519	/*
1520	 * It didn't work.  We COULD be in a case where there's a good record
1521	 * somewhere, so try again.
1522	 */
1523	if (acur->len == 0)
1524		return 0;
1525
1526	trace_xfs_alloc_near_first(args);
1527	*allocated = true;
1528	return 0;
1529}
1530
1531/*
1532 * Allocate a variable extent near bno in the allocation group agno.
1533 * Extent's length (returned in len) will be between minlen and maxlen,
1534 * and of the form k * prod + mod unless there's nothing that large.
1535 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1536 */
1537STATIC int
1538xfs_alloc_ag_vextent_near(
1539	struct xfs_alloc_arg	*args,
1540	uint32_t		alloc_flags)
1541{
1542	struct xfs_alloc_cur	acur = {};
1543	int			error;		/* error code */
1544	int			i;		/* result code, temporary */
1545	xfs_agblock_t		bno;
1546	xfs_extlen_t		len;
1547
1548	/* handle uninitialized agbno range so caller doesn't have to */
1549	if (!args->min_agbno && !args->max_agbno)
1550		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1551	ASSERT(args->min_agbno <= args->max_agbno);
1552
1553	/* clamp agbno to the range if it's outside */
1554	if (args->agbno < args->min_agbno)
1555		args->agbno = args->min_agbno;
1556	if (args->agbno > args->max_agbno)
1557		args->agbno = args->max_agbno;
1558
1559	/* Retry once quickly if we find busy extents before blocking. */
1560	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1561restart:
1562	len = 0;
1563
1564	/*
1565	 * Set up cursors and see if there are any free extents as big as
1566	 * maxlen. If not, pick the last entry in the tree unless the tree is
1567	 * empty.
1568	 */
1569	error = xfs_alloc_cur_setup(args, &acur);
1570	if (error == -ENOSPC) {
1571		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1572				&len, &i);
1573		if (error)
1574			goto out;
1575		if (i == 0 || len == 0) {
1576			trace_xfs_alloc_near_noentry(args);
1577			goto out;
1578		}
1579		ASSERT(i == 1);
1580	} else if (error) {
1581		goto out;
1582	}
1583
1584	/*
1585	 * First algorithm.
1586	 * If the requested extent is large wrt the freespaces available
1587	 * in this a.g., then the cursor will be pointing to a btree entry
1588	 * near the right edge of the tree.  If it's in the last btree leaf
1589	 * block, then we just examine all the entries in that block
1590	 * that are big enough, and pick the best one.
1591	 */
1592	if (xfs_btree_islastblock(acur.cnt, 0)) {
1593		bool		allocated = false;
1594
1595		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1596				&allocated);
1597		if (error)
1598			goto out;
1599		if (allocated)
1600			goto alloc_finish;
1601	}
1602
1603	/*
1604	 * Second algorithm. Combined cntbt and bnobt search to find ideal
1605	 * locality.
1606	 */
1607	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1608	if (error)
1609		goto out;
1610
1611	/*
1612	 * If we couldn't get anything, give up.
1613	 */
1614	if (!acur.len) {
1615		if (acur.busy) {
1616			/*
1617			 * Our only valid extents must have been busy. Flush and
1618			 * retry the allocation again. If we get an -EAGAIN
1619			 * error, we're being told that a deadlock was avoided
1620			 * and the current transaction needs committing before
1621			 * the allocation can be retried.
1622			 */
1623			trace_xfs_alloc_near_busy(args);
1624			error = xfs_extent_busy_flush(args->tp, args->pag,
1625					acur.busy_gen, alloc_flags);
1626			if (error)
1627				goto out;
1628
1629			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1630			goto restart;
1631		}
1632		trace_xfs_alloc_size_neither(args);
1633		args->agbno = NULLAGBLOCK;
1634		goto out;
1635	}
1636
1637alloc_finish:
1638	/* fix up btrees on a successful allocation */
1639	error = xfs_alloc_cur_finish(args, &acur);
1640
1641out:
1642	xfs_alloc_cur_close(&acur, error);
1643	return error;
1644}
1645
1646/*
1647 * Allocate a variable extent anywhere in the allocation group agno.
1648 * Extent's length (returned in len) will be between minlen and maxlen,
1649 * and of the form k * prod + mod unless there's nothing that large.
1650 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1651 */
1652static int
1653xfs_alloc_ag_vextent_size(
1654	struct xfs_alloc_arg	*args,
1655	uint32_t		alloc_flags)
1656{
1657	struct xfs_agf		*agf = args->agbp->b_addr;
1658	struct xfs_btree_cur	*bno_cur;
1659	struct xfs_btree_cur	*cnt_cur;
1660	xfs_agblock_t		fbno;		/* start of found freespace */
1661	xfs_extlen_t		flen;		/* length of found freespace */
1662	xfs_agblock_t		rbno;		/* returned block number */
1663	xfs_extlen_t		rlen;		/* length of returned extent */
1664	bool			busy;
1665	unsigned		busy_gen;
1666	int			error;
1667	int			i;
1668
1669	/* Retry once quickly if we find busy extents before blocking. */
1670	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
1671restart:
1672	/*
1673	 * Allocate and initialize a cursor for the by-size btree.
1674	 */
1675	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1676					args->pag, XFS_BTNUM_CNT);
1677	bno_cur = NULL;
1678
1679	/*
1680	 * Look for an entry >= maxlen+alignment-1 blocks.
1681	 */
1682	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1683			args->maxlen + args->alignment - 1, &i)))
1684		goto error0;
1685
1686	/*
1687	 * If none then we have to settle for a smaller extent. In the case that
1688	 * there are no large extents, this will return the last entry in the
1689	 * tree unless the tree is empty. In the case that there are only busy
1690	 * large extents, this will return the largest small extent unless there
1691	 * are no smaller extents available.
1692	 */
1693	if (!i) {
1694		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1695						   &fbno, &flen, &i);
1696		if (error)
1697			goto error0;
1698		if (i == 0 || flen == 0) {
1699			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1700			trace_xfs_alloc_size_noentry(args);
1701			return 0;
1702		}
1703		ASSERT(i == 1);
1704		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1705				&rlen, &busy_gen);
1706	} else {
1707		/*
1708		 * Search for a non-busy extent that is large enough.
1709		 */
1710		for (;;) {
1711			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1712			if (error)
1713				goto error0;
1714			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1715				error = -EFSCORRUPTED;
1716				goto error0;
1717			}
1718
1719			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1720					&rbno, &rlen, &busy_gen);
1721
1722			if (rlen >= args->maxlen)
1723				break;
1724
1725			error = xfs_btree_increment(cnt_cur, 0, &i);
1726			if (error)
1727				goto error0;
1728			if (i)
1729				continue;
1730
1731			/*
1732			 * Our only valid extents must have been busy. Flush and
1733			 * retry the allocation again. If we get an -EAGAIN
1734			 * error, we're being told that a deadlock was avoided
1735			 * and the current transaction needs committing before
1736			 * the allocation can be retried.
1737			 */
1738			trace_xfs_alloc_size_busy(args);
1739			error = xfs_extent_busy_flush(args->tp, args->pag,
1740					busy_gen, alloc_flags);
1741			if (error)
1742				goto error0;
1743
1744			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1745			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1746			goto restart;
1747		}
1748	}
1749
1750	/*
1751	 * In the first case above, we got the last entry in the
1752	 * by-size btree.  Now we check to see if the space hits maxlen
1753	 * once aligned; if not, we search left for something better.
1754	 * This can't happen in the second case above.
1755	 */
1756	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1757	if (XFS_IS_CORRUPT(args->mp,
1758			   rlen != 0 &&
1759			   (rlen > flen ||
1760			    rbno + rlen > fbno + flen))) {
1761		error = -EFSCORRUPTED;
1762		goto error0;
1763	}
1764	if (rlen < args->maxlen) {
1765		xfs_agblock_t	bestfbno;
1766		xfs_extlen_t	bestflen;
1767		xfs_agblock_t	bestrbno;
1768		xfs_extlen_t	bestrlen;
1769
1770		bestrlen = rlen;
1771		bestrbno = rbno;
1772		bestflen = flen;
1773		bestfbno = fbno;
1774		for (;;) {
1775			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1776				goto error0;
1777			if (i == 0)
1778				break;
1779			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1780					&i)))
1781				goto error0;
1782			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1783				error = -EFSCORRUPTED;
1784				goto error0;
1785			}
1786			if (flen < bestrlen)
1787				break;
1788			busy = xfs_alloc_compute_aligned(args, fbno, flen,
1789					&rbno, &rlen, &busy_gen);
1790			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1791			if (XFS_IS_CORRUPT(args->mp,
1792					   rlen != 0 &&
1793					   (rlen > flen ||
1794					    rbno + rlen > fbno + flen))) {
1795				error = -EFSCORRUPTED;
1796				goto error0;
1797			}
1798			if (rlen > bestrlen) {
1799				bestrlen = rlen;
1800				bestrbno = rbno;
1801				bestflen = flen;
1802				bestfbno = fbno;
1803				if (rlen == args->maxlen)
1804					break;
1805			}
1806		}
1807		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1808				&i)))
1809			goto error0;
1810		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1811			error = -EFSCORRUPTED;
1812			goto error0;
1813		}
1814		rlen = bestrlen;
1815		rbno = bestrbno;
1816		flen = bestflen;
1817		fbno = bestfbno;
1818	}
1819	args->wasfromfl = 0;
1820	/*
1821	 * Fix up the length.
1822	 */
1823	args->len = rlen;
1824	if (rlen < args->minlen) {
1825		if (busy) {
1826			/*
1827			 * Our only valid extents must have been busy. Flush and
1828			 * retry the allocation again. If we get an -EAGAIN
1829			 * error, we're being told that a deadlock was avoided
1830			 * and the current transaction needs committing before
1831			 * the allocation can be retried.
1832			 */
1833			trace_xfs_alloc_size_busy(args);
1834			error = xfs_extent_busy_flush(args->tp, args->pag,
1835					busy_gen, alloc_flags);
1836			if (error)
1837				goto error0;
1838
1839			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1840			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1841			goto restart;
1842		}
1843		goto out_nominleft;
1844	}
1845	xfs_alloc_fix_len(args);
1846
1847	rlen = args->len;
1848	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1849		error = -EFSCORRUPTED;
1850		goto error0;
1851	}
1852	/*
1853	 * Allocate and initialize a cursor for the by-block tree.
1854	 */
1855	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1856					args->pag, XFS_BTNUM_BNO);
1857	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1858			rbno, rlen, XFSA_FIXUP_CNT_OK)))
1859		goto error0;
1860	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1861	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1862	cnt_cur = bno_cur = NULL;
1863	args->len = rlen;
1864	args->agbno = rbno;
1865	if (XFS_IS_CORRUPT(args->mp,
1866			   args->agbno + args->len >
1867			   be32_to_cpu(agf->agf_length))) {
1868		error = -EFSCORRUPTED;
1869		goto error0;
1870	}
1871	trace_xfs_alloc_size_done(args);
1872	return 0;
1873
1874error0:
1875	trace_xfs_alloc_size_error(args);
1876	if (cnt_cur)
1877		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1878	if (bno_cur)
1879		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1880	return error;
1881
1882out_nominleft:
1883	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1884	trace_xfs_alloc_size_nominleft(args);
1885	args->agbno = NULLAGBLOCK;
1886	return 0;
1887}
1888
1889/*
1890 * Free the extent starting at agno/bno for length.
1891 */
1892STATIC int
1893xfs_free_ag_extent(
1894	struct xfs_trans		*tp,
1895	struct xfs_buf			*agbp,
1896	xfs_agnumber_t			agno,
1897	xfs_agblock_t			bno,
1898	xfs_extlen_t			len,
1899	const struct xfs_owner_info	*oinfo,
1900	enum xfs_ag_resv_type		type)
1901{
1902	struct xfs_mount		*mp;
1903	struct xfs_btree_cur		*bno_cur;
1904	struct xfs_btree_cur		*cnt_cur;
1905	xfs_agblock_t			gtbno; /* start of right neighbor */
1906	xfs_extlen_t			gtlen; /* length of right neighbor */
1907	xfs_agblock_t			ltbno; /* start of left neighbor */
1908	xfs_extlen_t			ltlen; /* length of left neighbor */
1909	xfs_agblock_t			nbno; /* new starting block of freesp */
1910	xfs_extlen_t			nlen; /* new length of freespace */
1911	int				haveleft; /* have a left neighbor */
1912	int				haveright; /* have a right neighbor */
1913	int				i;
1914	int				error;
1915	struct xfs_perag		*pag = agbp->b_pag;
1916
1917	bno_cur = cnt_cur = NULL;
1918	mp = tp->t_mountp;
1919
1920	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1921		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1922		if (error)
1923			goto error0;
1924	}
1925
1926	/*
1927	 * Allocate and initialize a cursor for the by-block btree.
1928	 */
1929	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1930	/*
1931	 * Look for a neighboring block on the left (lower block numbers)
1932	 * that is contiguous with this space.
1933	 */
1934	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1935		goto error0;
1936	if (haveleft) {
1937		/*
1938		 * There is a block to our left.
1939		 */
1940		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1941			goto error0;
1942		if (XFS_IS_CORRUPT(mp, i != 1)) {
1943			error = -EFSCORRUPTED;
1944			goto error0;
1945		}
1946		/*
1947		 * It's not contiguous, though.
1948		 */
1949		if (ltbno + ltlen < bno)
1950			haveleft = 0;
1951		else {
1952			/*
1953			 * If this failure happens the request to free this
1954			 * space was invalid, it's (partly) already free.
1955			 * Very bad.
1956			 */
1957			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1958				error = -EFSCORRUPTED;
1959				goto error0;
1960			}
1961		}
1962	}
1963	/*
1964	 * Look for a neighboring block on the right (higher block numbers)
1965	 * that is contiguous with this space.
1966	 */
1967	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1968		goto error0;
1969	if (haveright) {
1970		/*
1971		 * There is a block to our right.
1972		 */
1973		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1974			goto error0;
1975		if (XFS_IS_CORRUPT(mp, i != 1)) {
1976			error = -EFSCORRUPTED;
1977			goto error0;
1978		}
1979		/*
1980		 * It's not contiguous, though.
1981		 */
1982		if (bno + len < gtbno)
1983			haveright = 0;
1984		else {
1985			/*
1986			 * If this failure happens the request to free this
1987			 * space was invalid, it's (partly) already free.
1988			 * Very bad.
1989			 */
1990			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1991				error = -EFSCORRUPTED;
1992				goto error0;
1993			}
1994		}
1995	}
1996	/*
1997	 * Now allocate and initialize a cursor for the by-size tree.
1998	 */
1999	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2000	/*
2001	 * Have both left and right contiguous neighbors.
2002	 * Merge all three into a single free block.
2003	 */
2004	if (haveleft && haveright) {
2005		/*
2006		 * Delete the old by-size entry on the left.
2007		 */
2008		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2009			goto error0;
2010		if (XFS_IS_CORRUPT(mp, i != 1)) {
2011			error = -EFSCORRUPTED;
2012			goto error0;
2013		}
2014		if ((error = xfs_btree_delete(cnt_cur, &i)))
2015			goto error0;
2016		if (XFS_IS_CORRUPT(mp, i != 1)) {
2017			error = -EFSCORRUPTED;
2018			goto error0;
2019		}
2020		/*
2021		 * Delete the old by-size entry on the right.
2022		 */
2023		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2024			goto error0;
2025		if (XFS_IS_CORRUPT(mp, i != 1)) {
2026			error = -EFSCORRUPTED;
2027			goto error0;
2028		}
2029		if ((error = xfs_btree_delete(cnt_cur, &i)))
2030			goto error0;
2031		if (XFS_IS_CORRUPT(mp, i != 1)) {
2032			error = -EFSCORRUPTED;
2033			goto error0;
2034		}
2035		/*
2036		 * Delete the old by-block entry for the right block.
2037		 */
2038		if ((error = xfs_btree_delete(bno_cur, &i)))
2039			goto error0;
2040		if (XFS_IS_CORRUPT(mp, i != 1)) {
2041			error = -EFSCORRUPTED;
2042			goto error0;
2043		}
2044		/*
2045		 * Move the by-block cursor back to the left neighbor.
2046		 */
2047		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2048			goto error0;
2049		if (XFS_IS_CORRUPT(mp, i != 1)) {
2050			error = -EFSCORRUPTED;
2051			goto error0;
2052		}
2053#ifdef DEBUG
2054		/*
2055		 * Check that this is the right record: delete didn't
2056		 * mangle the cursor.
2057		 */
2058		{
2059			xfs_agblock_t	xxbno;
2060			xfs_extlen_t	xxlen;
2061
2062			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2063					&i)))
2064				goto error0;
2065			if (XFS_IS_CORRUPT(mp,
2066					   i != 1 ||
2067					   xxbno != ltbno ||
2068					   xxlen != ltlen)) {
2069				error = -EFSCORRUPTED;
2070				goto error0;
2071			}
2072		}
2073#endif
2074		/*
2075		 * Update remaining by-block entry to the new, joined block.
2076		 */
2077		nbno = ltbno;
2078		nlen = len + ltlen + gtlen;
2079		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2080			goto error0;
2081	}
2082	/*
2083	 * Have only a left contiguous neighbor.
2084	 * Merge it together with the new freespace.
2085	 */
2086	else if (haveleft) {
2087		/*
2088		 * Delete the old by-size entry on the left.
2089		 */
2090		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2091			goto error0;
2092		if (XFS_IS_CORRUPT(mp, i != 1)) {
2093			error = -EFSCORRUPTED;
2094			goto error0;
2095		}
2096		if ((error = xfs_btree_delete(cnt_cur, &i)))
2097			goto error0;
2098		if (XFS_IS_CORRUPT(mp, i != 1)) {
2099			error = -EFSCORRUPTED;
2100			goto error0;
2101		}
2102		/*
2103		 * Back up the by-block cursor to the left neighbor, and
2104		 * update its length.
2105		 */
2106		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2107			goto error0;
2108		if (XFS_IS_CORRUPT(mp, i != 1)) {
2109			error = -EFSCORRUPTED;
2110			goto error0;
2111		}
2112		nbno = ltbno;
2113		nlen = len + ltlen;
2114		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2115			goto error0;
2116	}
2117	/*
2118	 * Have only a right contiguous neighbor.
2119	 * Merge it together with the new freespace.
2120	 */
2121	else if (haveright) {
2122		/*
2123		 * Delete the old by-size entry on the right.
2124		 */
2125		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2126			goto error0;
2127		if (XFS_IS_CORRUPT(mp, i != 1)) {
2128			error = -EFSCORRUPTED;
2129			goto error0;
2130		}
2131		if ((error = xfs_btree_delete(cnt_cur, &i)))
2132			goto error0;
2133		if (XFS_IS_CORRUPT(mp, i != 1)) {
2134			error = -EFSCORRUPTED;
2135			goto error0;
2136		}
2137		/*
2138		 * Update the starting block and length of the right
2139		 * neighbor in the by-block tree.
2140		 */
2141		nbno = bno;
2142		nlen = len + gtlen;
2143		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2144			goto error0;
2145	}
2146	/*
2147	 * No contiguous neighbors.
2148	 * Insert the new freespace into the by-block tree.
2149	 */
2150	else {
2151		nbno = bno;
2152		nlen = len;
2153		if ((error = xfs_btree_insert(bno_cur, &i)))
2154			goto error0;
2155		if (XFS_IS_CORRUPT(mp, i != 1)) {
2156			error = -EFSCORRUPTED;
2157			goto error0;
2158		}
2159	}
2160	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2161	bno_cur = NULL;
2162	/*
2163	 * In all cases we need to insert the new freespace in the by-size tree.
2164	 */
2165	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2166		goto error0;
2167	if (XFS_IS_CORRUPT(mp, i != 0)) {
2168		error = -EFSCORRUPTED;
2169		goto error0;
2170	}
2171	if ((error = xfs_btree_insert(cnt_cur, &i)))
2172		goto error0;
2173	if (XFS_IS_CORRUPT(mp, i != 1)) {
2174		error = -EFSCORRUPTED;
2175		goto error0;
2176	}
2177	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2178	cnt_cur = NULL;
2179
2180	/*
2181	 * Update the freespace totals in the ag and superblock.
2182	 */
2183	error = xfs_alloc_update_counters(tp, agbp, len);
2184	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2185	if (error)
2186		goto error0;
2187
2188	XFS_STATS_INC(mp, xs_freex);
2189	XFS_STATS_ADD(mp, xs_freeb, len);
2190
2191	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2192
2193	return 0;
2194
2195 error0:
2196	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2197	if (bno_cur)
2198		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2199	if (cnt_cur)
2200		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2201	return error;
2202}
2203
2204/*
2205 * Visible (exported) allocation/free functions.
2206 * Some of these are used just by xfs_alloc_btree.c and this file.
2207 */
2208
2209/*
2210 * Compute and fill in value of m_alloc_maxlevels.
2211 */
2212void
2213xfs_alloc_compute_maxlevels(
2214	xfs_mount_t	*mp)	/* file system mount structure */
2215{
2216	mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2217			(mp->m_sb.sb_agblocks + 1) / 2);
2218	ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2219}
2220
2221/*
2222 * Find the length of the longest extent in an AG.  The 'need' parameter
2223 * specifies how much space we're going to need for the AGFL and the
2224 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2225 * other callers.
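 *
 * As an illustrative example (hypothetical numbers): with pagf_longest == 100,
 * pagf_freeblks == 150, pagf_flcount == 2, need == 6 and reserved == 60, the
 * AGFL shortfall is 6 - 2 == 4 blocks and the reservation shortfall is
 * 60 - (150 - 100) == 10 blocks, so the longest usable extent is
 * 100 - (4 + 10) == 86 blocks (further capped by m_ag_max_usable).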
2226 */
2227xfs_extlen_t
2228xfs_alloc_longest_free_extent(
2229	struct xfs_perag	*pag,
2230	xfs_extlen_t		need,
2231	xfs_extlen_t		reserved)
2232{
2233	xfs_extlen_t		delta = 0;
2234
2235	/*
2236	 * If the AGFL needs a recharge, we'll have to subtract that from the
2237	 * longest extent.
2238	 */
2239	if (need > pag->pagf_flcount)
2240		delta = need - pag->pagf_flcount;
2241
2242	/*
2243	 * If we cannot maintain others' reservations with space from the
2244	 * not-longest freesp extents, we'll have to subtract /that/ from
2245	 * the longest extent too.
2246	 */
2247	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2248		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2249
2250	/*
2251	 * If the longest extent is long enough to satisfy all the
2252	 * reservations and AGFL rules in place, we can return this extent.
2253	 */
2254	if (pag->pagf_longest > delta)
2255		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2256				pag->pagf_longest - delta);
2257
2258	/* Otherwise, let the caller try for 1 block if there's space. */
2259	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2260}
2261
2262/*
2263 * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2264 * return the largest possible minimum length.
2265 */
2266unsigned int
2267xfs_alloc_min_freelist(
2268	struct xfs_mount	*mp,
2269	struct xfs_perag	*pag)
2270{
2271	/* AG btrees have at least 1 level. */
2272	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2273	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
2274	unsigned int		min_free;
2275
2276	ASSERT(mp->m_alloc_maxlevels > 0);
2277
2278	/*
2279	 * For a btree shorter than the maximum height, the worst case is that
2280	 * every level gets split and a new level is added, then while inserting
2281	 * another entry to refill the AGFL, every level under the old root gets
2282	 * split again. This is:
2283	 *
2284	 *   (full height split reservation) + (AGFL refill split height)
2285	 * = (current height + 1) + (current height - 1)
2286	 * = (new height) + (new height - 2)
2287	 * = 2 * new height - 2
2288	 *
2289	 * For a btree of maximum height, the worst case is that every level
2290	 * under the root gets split, then while inserting another entry to
2291	 * refill the AGFL, every level under the root gets split again. This is
2292	 * also:
2293	 *
2294	 *   2 * (current height - 1)
2295	 * = 2 * (new height - 1)
2296	 * = 2 * new height - 2
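	 *
	 * As an illustrative example, a 2-level bnobt and a 3-level cntbt on a
	 * filesystem with m_alloc_maxlevels == 5 contribute
	 * min(2 + 1, 5) * 2 - 2 == 4 and min(3 + 1, 5) * 2 - 2 == 6 blocks
	 * respectively, i.e. a minimum freelist of 10 blocks before any rmapbt
	 * requirement is added.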
2297	 */
2298
2299	/* space needed by-bno freespace btree */
2300	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2301				       mp->m_alloc_maxlevels) * 2 - 2;
2302	/* space needed by-size freespace btree */
2303	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2304				       mp->m_alloc_maxlevels) * 2 - 2;
2305	/* space needed by the reverse mapping btree */
2306	if (xfs_has_rmapbt(mp))
2307		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2308						mp->m_rmap_maxlevels) * 2 - 2;
2309
2310	return min_free;
2311}
2312
2313/*
2314 * Check if the operation we are fixing up the freelist for should go ahead or
2315 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2316 * is dependent on whether the size and shape of free space available will
2317 * permit the requested allocation to take place.
2318 */
2319static bool
2320xfs_alloc_space_available(
2321	struct xfs_alloc_arg	*args,
2322	xfs_extlen_t		min_free,
2323	int			flags)
2324{
2325	struct xfs_perag	*pag = args->pag;
2326	xfs_extlen_t		alloc_len, longest;
2327	xfs_extlen_t		reservation; /* blocks that are still reserved */
2328	int			available;
2329	xfs_extlen_t		agflcount;
2330
2331	if (flags & XFS_ALLOC_FLAG_FREEING)
2332		return true;
2333
2334	reservation = xfs_ag_resv_needed(pag, args->resv);
2335
2336	/* do we have enough contiguous free space for the allocation? */
2337	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2338	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2339	if (longest < alloc_len)
2340		return false;
2341
2342	/*
2343	 * Do we have enough free space remaining for the allocation? Don't
2344	 * account extra agfl blocks because we are about to defer freeing them,
2345	 * making them unavailable until the current transaction commits.
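	 *
	 * As an illustrative example: with pagf_freeblks == 1000,
	 * pagf_flcount == 2, min_free == 6, a reservation of 100 and
	 * args->minleft == 0, we treat 1000 + 2 - 100 - 6 - 0 == 896 blocks
	 * as available to this allocation.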
2346	 */
2347	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2348	available = (int)(pag->pagf_freeblks + agflcount -
2349			  reservation - min_free - args->minleft);
2350	if (available < (int)max(args->total, alloc_len))
2351		return false;
2352
2353	/*
2354	 * Clamp maxlen to the amount of free space available for the actual
2355	 * extent allocation.
2356	 */
2357	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2358		args->maxlen = available;
2359		ASSERT(args->maxlen > 0);
2360		ASSERT(args->maxlen >= args->minlen);
2361	}
2362
2363	return true;
2364}
2365
2366int
2367xfs_free_agfl_block(
2368	struct xfs_trans	*tp,
2369	xfs_agnumber_t		agno,
2370	xfs_agblock_t		agbno,
2371	struct xfs_buf		*agbp,
2372	struct xfs_owner_info	*oinfo)
2373{
2374	int			error;
2375	struct xfs_buf		*bp;
2376
2377	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2378				   XFS_AG_RESV_AGFL);
2379	if (error)
2380		return error;
2381
2382	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2383			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2384			tp->t_mountp->m_bsize, 0, &bp);
2385	if (error)
2386		return error;
2387	xfs_trans_binval(tp, bp);
2388
2389	return 0;
2390}
2391
2392/*
2393 * Check the agfl fields of the agf for inconsistency or corruption.
2394 *
2395 * The original purpose was to detect an agfl header padding mismatch between
2396 * current and early v5 kernels. This problem manifests as a 1-slot size
2397 * difference between the on-disk flcount and the active [first, last] range of
2398 * a wrapped agfl.
2399 *
2400 * However, we need to use these same checks to catch agfl count corruptions
2401 * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
2402 * way, we need to reset the agfl and warn the user.
2403 *
2404 * Return true if a reset is required before the agfl can be used, false
2405 * otherwise.
2406 */
2407static bool
2408xfs_agfl_needs_reset(
2409	struct xfs_mount	*mp,
2410	struct xfs_agf		*agf)
2411{
2412	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2413	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2414	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2415	int			agfl_size = xfs_agfl_size(mp);
2416	int			active;
2417
2418	/*
2419	 * The agf read verifier catches severe corruption of these fields.
2420	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2421	 * the verifier allows it.
2422	 */
2423	if (f >= agfl_size || l >= agfl_size)
2424		return true;
2425	if (c > agfl_size)
2426		return true;
2427
2428	/*
2429	 * Check consistency between the on-disk count and the active range. An
2430	 * agfl padding mismatch manifests as an inconsistent flcount.
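	 *
	 * As an illustrative example (hypothetical values): with an AGFL of
	 * 118 slots, flfirst == 115 and fllast == 2, the wrapped active range
	 * spans 118 - 115 + 2 + 1 == 6 slots, so any on-disk flcount other
	 * than 6 requires a reset.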
2431	 */
2432	if (c && l >= f)
2433		active = l - f + 1;
2434	else if (c)
2435		active = agfl_size - f + l + 1;
2436	else
2437		active = 0;
2438
2439	return active != c;
2440}
2441
2442/*
2443 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2444 * agfl content cannot be trusted. Warn the user that a repair is required to
2445 * recover leaked blocks.
2446 *
2447 * The purpose of this mechanism is to handle filesystems affected by the agfl
2448 * header padding mismatch problem. A reset keeps the filesystem online with a
2449 * relatively minor free space accounting inconsistency rather than suffer the
2450 * inevitable crash from use of an invalid agfl block.
2451 */
2452static void
2453xfs_agfl_reset(
2454	struct xfs_trans	*tp,
2455	struct xfs_buf		*agbp,
2456	struct xfs_perag	*pag)
2457{
2458	struct xfs_mount	*mp = tp->t_mountp;
2459	struct xfs_agf		*agf = agbp->b_addr;
2460
2461	ASSERT(xfs_perag_agfl_needs_reset(pag));
2462	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2463
2464	xfs_warn(mp,
2465	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2466	       "Please unmount and run xfs_repair.",
2467	         pag->pag_agno, pag->pagf_flcount);
2468
2469	agf->agf_flfirst = 0;
2470	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2471	agf->agf_flcount = 0;
2472	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2473				    XFS_AGF_FLCOUNT);
2474
2475	pag->pagf_flcount = 0;
2476	clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
2477}
2478
2479/*
2480 * Defer an AGFL block free. This is effectively equivalent to
2481 * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2482 *
2483 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2484 * allocation operations in a transaction. AGFL frees are prone to this problem
2485 * because, for one, they are always freed one at a time. Further, an immediate
2486 * AGFL block free can cause a btree join and require another block free before
2487 * the real allocation can proceed. Deferring the free disconnects freeing up
2488 * the AGFL slot from freeing the block.
2489 */
2490static int
2491xfs_defer_agfl_block(
2492	struct xfs_trans		*tp,
2493	xfs_agnumber_t			agno,
2494	xfs_agblock_t			agbno,
2495	struct xfs_owner_info		*oinfo)
2496{
2497	struct xfs_mount		*mp = tp->t_mountp;
2498	struct xfs_extent_free_item	*xefi;
2499	xfs_fsblock_t			fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
2500
2501	ASSERT(xfs_extfree_item_cache != NULL);
2502	ASSERT(oinfo != NULL);
2503
2504	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
2505		return -EFSCORRUPTED;
2506
2507	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2508			       GFP_KERNEL | __GFP_NOFAIL);
2509	xefi->xefi_startblock = fsbno;
2510	xefi->xefi_blockcount = 1;
2511	xefi->xefi_owner = oinfo->oi_owner;
2512	xefi->xefi_agresv = XFS_AG_RESV_AGFL;
2513
2514	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2515
2516	xfs_extent_free_get_group(mp, xefi);
2517	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
2518	return 0;
2519}
2520
2521/*
2522 * Add the extent to the list of extents to be freed at transaction end.
2523 * The list is maintained sorted (by block number).
2524 */
2525int
2526__xfs_free_extent_later(
2527	struct xfs_trans		*tp,
2528	xfs_fsblock_t			bno,
2529	xfs_filblks_t			len,
2530	const struct xfs_owner_info	*oinfo,
2531	enum xfs_ag_resv_type		type,
2532	bool				skip_discard)
2533{
2534	struct xfs_extent_free_item	*xefi;
2535	struct xfs_mount		*mp = tp->t_mountp;
2536#ifdef DEBUG
2537	xfs_agnumber_t			agno;
2538	xfs_agblock_t			agbno;
2539
2540	ASSERT(bno != NULLFSBLOCK);
2541	ASSERT(len > 0);
2542	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2543	ASSERT(!isnullstartblock(bno));
2544	agno = XFS_FSB_TO_AGNO(mp, bno);
2545	agbno = XFS_FSB_TO_AGBNO(mp, bno);
2546	ASSERT(agno < mp->m_sb.sb_agcount);
2547	ASSERT(agbno < mp->m_sb.sb_agblocks);
2548	ASSERT(len < mp->m_sb.sb_agblocks);
2549	ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2550#endif
2551	ASSERT(xfs_extfree_item_cache != NULL);
2552	ASSERT(type != XFS_AG_RESV_AGFL);
2553
2554	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
2555		return -EFSCORRUPTED;
2556
2557	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
2558			       GFP_KERNEL | __GFP_NOFAIL);
2559	xefi->xefi_startblock = bno;
2560	xefi->xefi_blockcount = (xfs_extlen_t)len;
2561	xefi->xefi_agresv = type;
2562	if (skip_discard)
2563		xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2564	if (oinfo) {
2565		ASSERT(oinfo->oi_offset == 0);
2566
2567		if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2568			xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
2569		if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2570			xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2571		xefi->xefi_owner = oinfo->oi_owner;
2572	} else {
2573		xefi->xefi_owner = XFS_RMAP_OWN_NULL;
2574	}
2575	trace_xfs_bmap_free_defer(mp,
2576			XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2577			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2578
2579	xfs_extent_free_get_group(mp, xefi);
2580	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
2581	return 0;
2582}
2583
2584#ifdef DEBUG
2585/*
2586 * Check if an AGF has a free extent record whose length is equal to
2587 * args->minlen.
2588 */
2589STATIC int
2590xfs_exact_minlen_extent_available(
2591	struct xfs_alloc_arg	*args,
2592	struct xfs_buf		*agbp,
2593	int			*stat)
2594{
2595	struct xfs_btree_cur	*cnt_cur;
2596	xfs_agblock_t		fbno;
2597	xfs_extlen_t		flen;
2598	int			error = 0;
2599
2600	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2601					args->pag, XFS_BTNUM_CNT);
2602	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2603	if (error)
2604		goto out;
2605
2606	if (*stat == 0) {
2607		error = -EFSCORRUPTED;
2608		goto out;
2609	}
2610
2611	error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2612	if (error)
2613		goto out;
2614
2615	if (*stat == 1 && flen != args->minlen)
2616		*stat = 0;
2617
2618out:
2619	xfs_btree_del_cursor(cnt_cur, error);
2620
2621	return error;
2622}
2623#endif
2624
2625/*
2626 * Decide whether to use this allocation group for this allocation.
2627 * If so, fix up the btree freelist's size.
2628 */
2629int			/* error */
2630xfs_alloc_fix_freelist(
2631	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2632	uint32_t		alloc_flags)
2633{
2634	struct xfs_mount	*mp = args->mp;
2635	struct xfs_perag	*pag = args->pag;
2636	struct xfs_trans	*tp = args->tp;
2637	struct xfs_buf		*agbp = NULL;
2638	struct xfs_buf		*agflbp = NULL;
2639	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2640	xfs_agblock_t		bno;	/* freelist block */
2641	xfs_extlen_t		need;	/* total blocks needed in freelist */
2642	int			error = 0;
2643
2644	/* deferred ops (AGFL block frees) require permanent transactions */
2645	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2646
2647	if (!xfs_perag_initialised_agf(pag)) {
2648		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2649		if (error) {
2650			/* Couldn't lock the AGF so skip this AG. */
2651			if (error == -EAGAIN)
2652				error = 0;
2653			goto out_no_agbp;
2654		}
2655	}
2656
2657	/*
2658	 * If this is a metadata-preferred pag and we are allocating user data,
2659	 * then try somewhere else if we are not being asked to try harder at
2660	 * this point.
2661	 */
2662	if (xfs_perag_prefers_metadata(pag) &&
2663	    (args->datatype & XFS_ALLOC_USERDATA) &&
2664	    (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2665		ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
2666		goto out_agbp_relse;
2667	}
2668
2669	need = xfs_alloc_min_freelist(mp, pag);
2670	if (!xfs_alloc_space_available(args, need, alloc_flags |
2671			XFS_ALLOC_FLAG_CHECK))
2672		goto out_agbp_relse;
2673
2674	/*
2675	 * Get the a.g. freespace buffer.
2676	 * Can fail if we're not blocking on locks and the buffer is already held.
2677	 */
2678	if (!agbp) {
2679		error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
2680		if (error) {
2681			/* Couldn't lock the AGF so skip this AG. */
2682			if (error == -EAGAIN)
2683				error = 0;
2684			goto out_no_agbp;
2685		}
2686	}
2687
2688	/* reset a padding mismatched agfl before final free space check */
2689	if (xfs_perag_agfl_needs_reset(pag))
2690		xfs_agfl_reset(tp, agbp, pag);
2691
2692	/* If there isn't enough total space or a long enough single extent, reject it. */
2693	need = xfs_alloc_min_freelist(mp, pag);
2694	if (!xfs_alloc_space_available(args, need, alloc_flags))
2695		goto out_agbp_relse;
2696
2697#ifdef DEBUG
2698	if (args->alloc_minlen_only) {
2699		int stat;
2700
2701		error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2702		if (error || !stat)
2703			goto out_agbp_relse;
2704	}
2705#endif
2706	/*
2707	 * Make the freelist shorter if it's too long.
2708	 *
2709	 * Note that from this point onwards, we will always release the agf and
2710	 * agfl buffers on error. This handles the case where we error out and
2711	 * the buffers are clean or may not have been joined to the transaction
2712	 * and hence need to be released manually. If they have been joined to
2713	 * the transaction, then xfs_trans_brelse() will handle them
2714	 * appropriately based on the recursion count and dirty state of the
2715	 * buffer.
2716	 *
2717	 * XXX (dgc): When we have lots of free space, does this buy us
2718	 * anything other than extra overhead when we need to put more blocks
2719	 * back on the free list? Maybe we should only do this when space is
2720	 * getting low or the AGFL is more than half full?
2721	 *
2722	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2723	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2724	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2725	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2726	 * both required to ensure that rmaps are correctly recorded for the
2727	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2728	 * repair/rmap.c in xfsprogs for details.
2729	 */
2730	memset(&targs, 0, sizeof(targs));
2731	/* struct copy below */
2732	if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
2733		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2734	else
2735		targs.oinfo = XFS_RMAP_OINFO_AG;
2736	while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
2737			pag->pagf_flcount > need) {
2738		error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2739		if (error)
2740			goto out_agbp_relse;
2741
2742		/* defer agfl frees */
2743		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2744		if (error)
2745			goto out_agbp_relse;
2746	}
2747
2748	targs.tp = tp;
2749	targs.mp = mp;
2750	targs.agbp = agbp;
2751	targs.agno = args->agno;
2752	targs.alignment = targs.minlen = targs.prod = 1;
2753	targs.pag = pag;
2754	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2755	if (error)
2756		goto out_agbp_relse;
2757
2758	/* Make the freelist longer if it's too short. */
2759	while (pag->pagf_flcount < need) {
2760		targs.agbno = 0;
2761		targs.maxlen = need - pag->pagf_flcount;
2762		targs.resv = XFS_AG_RESV_AGFL;
2763
2764		/* Allocate as many blocks as possible at once. */
2765		error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
2766		if (error)
2767			goto out_agflbp_relse;
2768
2769		/*
2770		 * Stop if we run out.  Won't happen if callers are obeying
2771		 * the restrictions correctly.  Can happen for free calls
2772		 * on a completely full ag.
2773		 */
2774		if (targs.agbno == NULLAGBLOCK) {
2775			if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
2776				break;
2777			goto out_agflbp_relse;
2778		}
2779
2780		if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2781			error = xfs_rmap_alloc(tp, agbp, pag,
2782				       targs.agbno, targs.len, &targs.oinfo);
2783			if (error)
2784				goto out_agflbp_relse;
2785		}
2786		error = xfs_alloc_update_counters(tp, agbp,
2787						  -((long)(targs.len)));
2788		if (error)
2789			goto out_agflbp_relse;
2790
2791		/*
2792		 * Put each allocated block on the list.
2793		 */
2794		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2795			error = xfs_alloc_put_freelist(pag, tp, agbp,
2796							agflbp, bno, 0);
2797			if (error)
2798				goto out_agflbp_relse;
2799		}
2800	}
2801	xfs_trans_brelse(tp, agflbp);
2802	args->agbp = agbp;
2803	return 0;
2804
2805out_agflbp_relse:
2806	xfs_trans_brelse(tp, agflbp);
2807out_agbp_relse:
2808	if (agbp)
2809		xfs_trans_brelse(tp, agbp);
2810out_no_agbp:
2811	args->agbp = NULL;
2812	return error;
2813}
2814
2815/*
2816 * Get a block from the freelist.
2817 * Returns the block number in *bnop, or NULLAGBLOCK if the freelist is empty.
2818 */
2819int
2820xfs_alloc_get_freelist(
2821	struct xfs_perag	*pag,
2822	struct xfs_trans	*tp,
2823	struct xfs_buf		*agbp,
2824	xfs_agblock_t		*bnop,
2825	int			btreeblk)
2826{
2827	struct xfs_agf		*agf = agbp->b_addr;
2828	struct xfs_buf		*agflbp;
2829	xfs_agblock_t		bno;
2830	__be32			*agfl_bno;
2831	int			error;
2832	uint32_t		logflags;
2833	struct xfs_mount	*mp = tp->t_mountp;
2834
2835	/*
2836	 * Freelist is empty, give up.
2837	 */
2838	if (!agf->agf_flcount) {
2839		*bnop = NULLAGBLOCK;
2840		return 0;
2841	}
2842	/*
2843	 * Read the array of free blocks.
2844	 */
2845	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2846	if (error)
2847		return error;
2848
2849
2850	/*
2851	 * Get the block number and update the data structures.
2852	 */
2853	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2854	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2855	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
2856		return -EFSCORRUPTED;
2857
2858	be32_add_cpu(&agf->agf_flfirst, 1);
2859	xfs_trans_brelse(tp, agflbp);
2860	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2861		agf->agf_flfirst = 0;
2862
2863	ASSERT(!xfs_perag_agfl_needs_reset(pag));
2864	be32_add_cpu(&agf->agf_flcount, -1);
2865	pag->pagf_flcount--;
2866
2867	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2868	if (btreeblk) {
2869		be32_add_cpu(&agf->agf_btreeblks, 1);
2870		pag->pagf_btreeblks++;
2871		logflags |= XFS_AGF_BTREEBLKS;
2872	}
2873
2874	xfs_alloc_log_agf(tp, agbp, logflags);
2875	*bnop = bno;
2876
2877	return 0;
2878}
2879
2880/*
2881 * Log the given fields from the agf structure.
2882 */
2883void
2884xfs_alloc_log_agf(
2885	struct xfs_trans	*tp,
2886	struct xfs_buf		*bp,
2887	uint32_t		fields)
2888{
2889	int	first;		/* first byte offset */
2890	int	last;		/* last byte offset */
2891	static const short	offsets[] = {
2892		offsetof(xfs_agf_t, agf_magicnum),
2893		offsetof(xfs_agf_t, agf_versionnum),
2894		offsetof(xfs_agf_t, agf_seqno),
2895		offsetof(xfs_agf_t, agf_length),
2896		offsetof(xfs_agf_t, agf_roots[0]),
2897		offsetof(xfs_agf_t, agf_levels[0]),
2898		offsetof(xfs_agf_t, agf_flfirst),
2899		offsetof(xfs_agf_t, agf_fllast),
2900		offsetof(xfs_agf_t, agf_flcount),
2901		offsetof(xfs_agf_t, agf_freeblks),
2902		offsetof(xfs_agf_t, agf_longest),
2903		offsetof(xfs_agf_t, agf_btreeblks),
2904		offsetof(xfs_agf_t, agf_uuid),
2905		offsetof(xfs_agf_t, agf_rmap_blocks),
2906		offsetof(xfs_agf_t, agf_refcount_blocks),
2907		offsetof(xfs_agf_t, agf_refcount_root),
2908		offsetof(xfs_agf_t, agf_refcount_level),
2909		/* needed so that we don't log the whole rest of the structure: */
2910		offsetof(xfs_agf_t, agf_spare64),
2911		sizeof(xfs_agf_t)
2912	};
2913
2914	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2915
2916	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2917
2918	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2919	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2920}
2921
2922/*
2923 * Put the block on the freelist for the allocation group.
2924 */
2925int
2926xfs_alloc_put_freelist(
2927	struct xfs_perag	*pag,
2928	struct xfs_trans	*tp,
2929	struct xfs_buf		*agbp,
2930	struct xfs_buf		*agflbp,
2931	xfs_agblock_t		bno,
2932	int			btreeblk)
2933{
2934	struct xfs_mount	*mp = tp->t_mountp;
2935	struct xfs_agf		*agf = agbp->b_addr;
2936	__be32			*blockp;
2937	int			error;
2938	uint32_t		logflags;
2939	__be32			*agfl_bno;
2940	int			startoff;
2941
2942	if (!agflbp) {
2943		error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2944		if (error)
2945			return error;
2946	}
2947
2948	be32_add_cpu(&agf->agf_fllast, 1);
2949	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2950		agf->agf_fllast = 0;
2951
2952	ASSERT(!xfs_perag_agfl_needs_reset(pag));
2953	be32_add_cpu(&agf->agf_flcount, 1);
2954	pag->pagf_flcount++;
2955
2956	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2957	if (btreeblk) {
2958		be32_add_cpu(&agf->agf_btreeblks, -1);
2959		pag->pagf_btreeblks--;
2960		logflags |= XFS_AGF_BTREEBLKS;
2961	}
2962
2963	xfs_alloc_log_agf(tp, agbp, logflags);
2964
2965	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2966
2967	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2968	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2969	*blockp = cpu_to_be32(bno);
2970	startoff = (char *)blockp - (char *)agflbp->b_addr;
2971
2972	xfs_alloc_log_agf(tp, agbp, logflags);
2973
2974	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2975	xfs_trans_log_buf(tp, agflbp, startoff,
2976			  startoff + sizeof(xfs_agblock_t) - 1);
2977	return 0;
2978}
2979
2980/*
2981 * Check that this AGF/AGI header's sequence number and length match the AG
2982 * number and size in fsblocks.
2983 */
2984xfs_failaddr_t
2985xfs_validate_ag_length(
2986	struct xfs_buf		*bp,
2987	uint32_t		seqno,
2988	uint32_t		length)
2989{
2990	struct xfs_mount	*mp = bp->b_mount;
2991	/*
2992	 * During growfs operations, the perag is not fully initialised,
2993	 * so we can't use it for any useful checking. growfs ensures we can't
2994	 * use it by using uncached buffers that don't have the perag attached
2995	 * so we can detect and avoid this problem.
2996	 */
2997	if (bp->b_pag && seqno != bp->b_pag->pag_agno)
2998		return __this_address;
2999
3000	/*
3001	 * Only the last AG in the filesystem is allowed to be shorter
3002	 * than the AG size recorded in the superblock.
3003	 */
3004	if (length != mp->m_sb.sb_agblocks) {
3005		/*
3006		 * During growfs, the new last AG can get here before we
3007		 * have updated the superblock. Give it a pass on the seqno
3008		 * check.
3009		 */
3010		if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
3011			return __this_address;
3012		if (length < XFS_MIN_AG_BLOCKS)
3013			return __this_address;
3014		if (length > mp->m_sb.sb_agblocks)
3015			return __this_address;
3016	}
3017
3018	return NULL;
3019}
3020
3021/*
3022 * Verify the AGF is consistent.
3023 *
3024 * We do not verify the AGFL indexes in the AGF are fully consistent here
3025 * because of issues with variable on-disk structure sizes. Instead, we check
3026 * the agfl indexes for consistency when we initialise the perag from the AGF
3027 * information after a read completes.
3028 *
3029 * If the index is inconsistent, then we mark the perag as needing an AGFL
3030 * reset. The first AGFL update performed then resets the AGFL indexes and
3031 * refills the AGFL with known good free blocks, allowing the filesystem to
3032 * continue operating normally at the cost of a few leaked free space blocks.
3033 */
3034static xfs_failaddr_t
3035xfs_agf_verify(
3036	struct xfs_buf		*bp)
3037{
3038	struct xfs_mount	*mp = bp->b_mount;
3039	struct xfs_agf		*agf = bp->b_addr;
3040	xfs_failaddr_t		fa;
3041	uint32_t		agf_seqno = be32_to_cpu(agf->agf_seqno);
3042	uint32_t		agf_length = be32_to_cpu(agf->agf_length);
3043
3044	if (xfs_has_crc(mp)) {
3045		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
3046			return __this_address;
3047		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
3048			return __this_address;
3049	}
3050
3051	if (!xfs_verify_magic(bp, agf->agf_magicnum))
3052		return __this_address;
3053
3054	if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
3055		return __this_address;
3056
3057	/*
3058	 * Both agf_seqno and agf_length need to be validated before anything else
3059	 * block number related in the AGF or AGFL can be checked.
3060	 */
3061	fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
3062	if (fa)
3063		return fa;
3064
3065	if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
3066		return __this_address;
3067	if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
3068		return __this_address;
3069	if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
3070		return __this_address;
3071
3072	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
3073	    be32_to_cpu(agf->agf_freeblks) > agf_length)
3074		return __this_address;
3075
3076	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
3077	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
3078	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
3079						mp->m_alloc_maxlevels ||
3080	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
3081						mp->m_alloc_maxlevels)
3082		return __this_address;
3083
3084	if (xfs_has_lazysbcount(mp) &&
3085	    be32_to_cpu(agf->agf_btreeblks) > agf_length)
3086		return __this_address;
3087
3088	if (xfs_has_rmapbt(mp)) {
3089		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
3090			return __this_address;
3091
3092		if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
3093		    be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
3094							mp->m_rmap_maxlevels)
3095			return __this_address;
3096	}
3097
3098	if (xfs_has_reflink(mp)) {
3099		if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
3100			return __this_address;
3101
3102		if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
3103		    be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
3104			return __this_address;
3105	}
3106
3107	return NULL;
3108}
3109
3110static void
3111xfs_agf_read_verify(
3112	struct xfs_buf	*bp)
3113{
3114	struct xfs_mount *mp = bp->b_mount;
3115	xfs_failaddr_t	fa;
3116
3117	if (xfs_has_crc(mp) &&
3118	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3119		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3120	else {
3121		fa = xfs_agf_verify(bp);
3122		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3123			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3124	}
3125}
3126
3127static void
3128xfs_agf_write_verify(
3129	struct xfs_buf	*bp)
3130{
3131	struct xfs_mount	*mp = bp->b_mount;
3132	struct xfs_buf_log_item	*bip = bp->b_log_item;
3133	struct xfs_agf		*agf = bp->b_addr;
3134	xfs_failaddr_t		fa;
3135
3136	fa = xfs_agf_verify(bp);
3137	if (fa) {
3138		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3139		return;
3140	}
3141
3142	if (!xfs_has_crc(mp))
3143		return;
3144
3145	if (bip)
3146		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3147
3148	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
3149}
3150
3151const struct xfs_buf_ops xfs_agf_buf_ops = {
3152	.name = "xfs_agf",
3153	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3154	.verify_read = xfs_agf_read_verify,
3155	.verify_write = xfs_agf_write_verify,
3156	.verify_struct = xfs_agf_verify,
3157};
3158
3159/*
3160 * Read in the allocation group header (free/alloc section).
3161 */
3162int
3163xfs_read_agf(
3164	struct xfs_perag	*pag,
3165	struct xfs_trans	*tp,
3166	int			flags,
3167	struct xfs_buf		**agfbpp)
3168{
3169	struct xfs_mount	*mp = pag->pag_mount;
3170	int			error;
3171
3172	trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3173
3174	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3175			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3176			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3177	if (error)
3178		return error;
3179
3180	xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3181	return 0;
3182}
3183
3184/*
3185 * Read in the allocation group header (free/alloc section) and initialise the
3186 * perag structure if necessary. If the caller provides @agfbpp, then return the
3187 * locked buffer to the caller, otherwise free it.
3188 */
3189int
3190xfs_alloc_read_agf(
3191	struct xfs_perag	*pag,
3192	struct xfs_trans	*tp,
3193	int			flags,
3194	struct xfs_buf		**agfbpp)
3195{
3196	struct xfs_buf		*agfbp;
3197	struct xfs_agf		*agf;
3198	int			error;
3199	int			allocbt_blks;
3200
3201	trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3202
3203	/* We don't support trylock when freeing. */
3204	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3205			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3206	error = xfs_read_agf(pag, tp,
3207			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3208			&agfbp);
3209	if (error)
3210		return error;
3211
3212	agf = agfbp->b_addr;
3213	if (!xfs_perag_initialised_agf(pag)) {
3214		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3215		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3216		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3217		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3218		pag->pagf_levels[XFS_BTNUM_BNOi] =
3219			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3220		pag->pagf_levels[XFS_BTNUM_CNTi] =
3221			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3222		pag->pagf_levels[XFS_BTNUM_RMAPi] =
3223			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3224		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3225		if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3226			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3227		else
3228			clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
3229
3230		/*
3231		 * Update the in-core allocbt counter. Filter out the rmapbt
3232		 * subset of the btreeblks counter because the rmapbt is managed
3233		 * by perag reservation. Subtract one for the rmapbt root block
3234		 * because the rmap counter includes it while the btreeblks
3235		 * counter only tracks non-root blocks.
3236		 */
3237		allocbt_blks = pag->pagf_btreeblks;
3238		if (xfs_has_rmapbt(pag->pag_mount))
3239			allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3240		if (allocbt_blks > 0)
3241			atomic64_add(allocbt_blks,
3242					&pag->pag_mount->m_allocbt_blks);
3243
3244		set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
3245	}
3246#ifdef DEBUG
3247	else if (!xfs_is_shutdown(pag->pag_mount)) {
3248		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3249		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3250		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3251		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3252		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3253		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3254		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3255		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3256	}
3257#endif
3258	if (agfbpp)
3259		*agfbpp = agfbp;
3260	else
3261		xfs_trans_brelse(tp, agfbp);
3262	return 0;
3263}
3264
3265/*
3266 * Pre-process allocation arguments to set initial state that we don't require
3267 * callers to set up correctly, as well as bounds-check the allocation args
3268 * that are set up.
3269 */
3270static int
3271xfs_alloc_vextent_check_args(
3272	struct xfs_alloc_arg	*args,
3273	xfs_fsblock_t		target,
3274	xfs_agnumber_t		*minimum_agno)
3275{
3276	struct xfs_mount	*mp = args->mp;
3277	xfs_agblock_t		agsize;
3278
3279	args->fsbno = NULLFSBLOCK;
3280
3281	*minimum_agno = 0;
3282	if (args->tp->t_highest_agno != NULLAGNUMBER)
3283		*minimum_agno = args->tp->t_highest_agno;
3284
3285	/*
3286	 * Just fix this up, for the case where the last a.g. is shorter
3287	 * (or there's only one a.g.) and the caller couldn't easily figure
3288	 * that out (xfs_bmap_alloc).
3289	 */
3290	agsize = mp->m_sb.sb_agblocks;
3291	if (args->maxlen > agsize)
3292		args->maxlen = agsize;
3293	if (args->alignment == 0)
3294		args->alignment = 1;
3295
3296	ASSERT(args->minlen > 0);
3297	ASSERT(args->maxlen > 0);
3298	ASSERT(args->alignment > 0);
3299	ASSERT(args->resv != XFS_AG_RESV_AGFL);
3300
3301	ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3302	ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
3303	ASSERT(args->minlen <= args->maxlen);
3304	ASSERT(args->minlen <= agsize);
3305	ASSERT(args->mod < args->prod);
3306
3307	if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3308	    XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
3309	    args->minlen > args->maxlen || args->minlen > agsize ||
3310	    args->mod >= args->prod) {
3311		trace_xfs_alloc_vextent_badargs(args);
3312		return -ENOSPC;
3313	}
3314
3315	if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3316		trace_xfs_alloc_vextent_skip_deadlock(args);
3317		return -ENOSPC;
3318	}
3319	return 0;
3320
3321}
3322
3323/*
3324 * Prepare an AG for allocation. If the AG is not prepared to accept the
3325 * allocation, return failure.
3326 *
3327 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3328 * modified to hold their own perag references.
3329 */
3330static int
3331xfs_alloc_vextent_prepare_ag(
3332	struct xfs_alloc_arg	*args,
3333	uint32_t		alloc_flags)
3334{
3335	bool			need_pag = !args->pag;
3336	int			error;
3337
3338	if (need_pag)
3339		args->pag = xfs_perag_get(args->mp, args->agno);
3340
3341	args->agbp = NULL;
3342	error = xfs_alloc_fix_freelist(args, alloc_flags);
3343	if (error) {
3344		trace_xfs_alloc_vextent_nofix(args);
3345		if (need_pag)
3346			xfs_perag_put(args->pag);
3347		args->agbno = NULLAGBLOCK;
3348		return error;
3349	}
3350	if (!args->agbp) {
3351		/* cannot allocate in this AG at all */
3352		trace_xfs_alloc_vextent_noagbp(args);
3353		args->agbno = NULLAGBLOCK;
3354		return 0;
3355	}
3356	args->wasfromfl = 0;
3357	return 0;
3358}
3359
3360/*
3361 * Post-process allocation results to account for the allocation if it succeeded
3362 * and set the allocated block number correctly for the caller.
3363 *
3364 * XXX: we should really be returning ENOSPC for ENOSPC, not
3365 * hiding it behind a "successful" NULLFSBLOCK allocation.
3366 */
3367static int
3368xfs_alloc_vextent_finish(
3369	struct xfs_alloc_arg	*args,
3370	xfs_agnumber_t		minimum_agno,
3371	int			alloc_error,
3372	bool			drop_perag)
3373{
3374	struct xfs_mount	*mp = args->mp;
3375	int			error = 0;
3376
3377	/*
3378	 * We can end up here with a locked AGF. If we failed, the caller is
3379	 * likely going to try to allocate again with different parameters, and
3380	 * that can widen the AGs that are searched for free space. If we have
3381	 * to do BMBT block allocation, we have to do a new allocation.
3382	 *
3383	 * Hence leaving this function with the AGF locked opens up potential
3384	 * ABBA AGF deadlocks because a future allocation attempt in this
3385	 * transaction may attempt to lock a lower number AGF.
3386	 *
3387	 * We can't release the AGF until the transaction is committed, so at
3388	 * this point we must update the "first allocation" tracker to point at
3389	 * this AG if the tracker is empty or points to a lower AG. This allows
3390	 * the next allocation attempt to be modified appropriately to avoid
3391	 * deadlocks.
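	 *
	 * For example, if this transaction has already locked the AGF of AG 3,
	 * later allocations in the same transaction must only consider AG 3
	 * and higher; dropping back to AG 1 here while another transaction
	 * holds AG 1 and waits for AG 3 would deadlock.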
3392	 */
3393	if (args->agbp &&
3394	    (args->tp->t_highest_agno == NULLAGNUMBER ||
3395	     args->agno > minimum_agno))
3396		args->tp->t_highest_agno = args->agno;
3397
3398	/*
3399	 * If the allocation failed with an error or we had an ENOSPC result,
3400	 * preserve the returned error whilst also marking the allocation result
3401	 * as "no extent allocated". This ensures that callers that fail to
3402	 * capture the error will still treat it as a failed allocation.
3403	 */
3404	if (alloc_error || args->agbno == NULLAGBLOCK) {
3405		args->fsbno = NULLFSBLOCK;
3406		error = alloc_error;
3407		goto out_drop_perag;
3408	}
3409
3410	args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3411
3412	ASSERT(args->len >= args->minlen);
3413	ASSERT(args->len <= args->maxlen);
3414	ASSERT(args->agbno % args->alignment == 0);
3415	XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
3416
3417	/* if not file data, insert new block into the reverse map btree */
3418	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3419		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3420				       args->agbno, args->len, &args->oinfo);
3421		if (error)
3422			goto out_drop_perag;
3423	}
3424
3425	if (!args->wasfromfl) {
3426		error = xfs_alloc_update_counters(args->tp, args->agbp,
3427						  -((long)(args->len)));
3428		if (error)
3429			goto out_drop_perag;
3430
3431		ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
3432				args->len));
3433	}
3434
3435	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3436
3437	XFS_STATS_INC(mp, xs_allocx);
3438	XFS_STATS_ADD(mp, xs_allocb, args->len);
3439
3440	trace_xfs_alloc_vextent_finish(args);
3441
3442out_drop_perag:
3443	if (drop_perag && args->pag) {
3444		xfs_perag_rele(args->pag);
3445		args->pag = NULL;
3446	}
3447	return error;
3448}
3449
3450/*
3451 * Allocate within a single AG only. This uses a best-fit length algorithm so if
3452 * you need an exact sized allocation without locality constraints, this is the
3453 * fastest way to do it.
3454 *
3455 * Caller is expected to hold a perag reference in args->pag.
3456 */
3457int
3458xfs_alloc_vextent_this_ag(
3459	struct xfs_alloc_arg	*args,
3460	xfs_agnumber_t		agno)
3461{
3462	struct xfs_mount	*mp = args->mp;
3463	xfs_agnumber_t		minimum_agno;
3464	uint32_t		alloc_flags = 0;
3465	int			error;
3466
3467	ASSERT(args->pag != NULL);
3468	ASSERT(args->pag->pag_agno == agno);
3469
3470	args->agno = agno;
3471	args->agbno = 0;
3472
3473	trace_xfs_alloc_vextent_this_ag(args);
3474
3475	error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
3476			&minimum_agno);
3477	if (error) {
3478		if (error == -ENOSPC)
3479			return 0;
3480		return error;
3481	}
3482
3483	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3484	if (!error && args->agbp)
3485		error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3486
3487	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3488}
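
/*
 * Example usage (illustrative sketch only, never compiled): a caller that
 * wants a best-fit extent of exactly @len blocks from @pag might fill out
 * the allocation arguments as below.  The owner and reservation values are
 * placeholders; real callers choose them based on what the blocks are for.
 */
#if 0
static int
xfs_alloc_this_ag_example(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_extlen_t		len,
	xfs_fsblock_t		*fsbno)
{
	struct xfs_alloc_arg	args = {
		.tp		= tp,
		.mp		= tp->t_mountp,
		.pag		= pag,
		.oinfo		= XFS_RMAP_OINFO_ANY_OWNER,
		.minlen		= len,
		.maxlen		= len,
		.prod		= 1,
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	/* Best-fit search within this AG only; no locality constraint. */
	error = xfs_alloc_vextent_this_ag(&args, pag->pag_agno);
	if (error)
		return error;

	/* args.fsbno is NULLFSBLOCK if no extent could be allocated. */
	*fsbno = args.fsbno;
	return 0;
}
#endif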
3489
3490/*
3491 * Iterate all AGs trying to allocate an extent starting from @start_agno.
3492 *
3493 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
3494 * allocation attempts in @start_agno have locality information. If we fail to
3495 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
3496 * we attempt to allocate in as there is no locality optimisation possible for
3497 * those allocations.
3498 *
3499 * On return, args->pag may be left referenced if we finish before the "all
3500 * failed" return point. The allocation finish still needs the perag, and
3501 * so the caller will release it once they've finished the allocation.
3502 *
3503 * When we wrap the AG iteration at the end of the filesystem, we have to be
3504 * careful not to wrap into AGs below ones we already have locked in the
3505 * transaction if we are doing a blocking iteration. This will result in an
3506 * out-of-order locking of AGFs and hence can cause deadlocks.
3507 */
3508static int
3509xfs_alloc_vextent_iterate_ags(
3510	struct xfs_alloc_arg	*args,
3511	xfs_agnumber_t		minimum_agno,
3512	xfs_agnumber_t		start_agno,
3513	xfs_agblock_t		target_agbno,
3514	uint32_t		alloc_flags)
3515{
3516	struct xfs_mount	*mp = args->mp;
3517	xfs_agnumber_t		restart_agno = minimum_agno;
3518	xfs_agnumber_t		agno;
3519	int			error = 0;
3520
3521	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
3522		restart_agno = 0;
3523restart:
3524	for_each_perag_wrap_range(mp, start_agno, restart_agno,
3525			mp->m_sb.sb_agcount, agno, args->pag) {
3526		args->agno = agno;
3527		error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3528		if (error)
3529			break;
3530		if (!args->agbp) {
3531			trace_xfs_alloc_vextent_loopfailed(args);
3532			continue;
3533		}
3534
3535		/*
3536		 * Allocation is supposed to succeed now, so break out of the
3537		 * loop regardless of whether we succeed or not.
3538		 */
3539		if (args->agno == start_agno && target_agbno) {
3540			args->agbno = target_agbno;
3541			error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3542		} else {
3543			args->agbno = 0;
3544			error = xfs_alloc_ag_vextent_size(args, alloc_flags);
3545		}
3546		break;
3547	}
3548	if (error) {
3549		xfs_perag_rele(args->pag);
3550		args->pag = NULL;
3551		return error;
3552	}
3553	if (args->agbp)
3554		return 0;
3555
3556	/*
3557	 * We didn't find an AG we can allocate from. If we were given
3558	 * constraining flags by the caller, drop them and retry the allocation
3559	 * without any constraints being set.
3560	 */
3561	if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
3562		alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
3563		restart_agno = minimum_agno;
3564		goto restart;
3565	}
3566
3567	ASSERT(args->pag == NULL);
3568	trace_xfs_alloc_vextent_allfailed(args);
3569	return 0;
3570}
3571
3572/*
3573 * Iterate the AGs from the start AG to the end of the filesystem, trying
3574 * to allocate blocks. It starts with a near allocation attempt in the initial
3575 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
3576 * back to zero if allowed by previous allocations in this transaction,
3577 * otherwise will wrap back to the start AG and run a second blocking pass to
3578 * the end of the filesystem.
3579 */
3580int
3581xfs_alloc_vextent_start_ag(
3582	struct xfs_alloc_arg	*args,
3583	xfs_fsblock_t		target)
3584{
3585	struct xfs_mount	*mp = args->mp;
3586	xfs_agnumber_t		minimum_agno;
3587	xfs_agnumber_t		start_agno;
3588	xfs_agnumber_t		rotorstep = xfs_rotorstep;
3589	bool			bump_rotor = false;
3590	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3591	int			error;
3592
3593	ASSERT(args->pag == NULL);
3594
3595	args->agno = NULLAGNUMBER;
3596	args->agbno = NULLAGBLOCK;
3597
3598	trace_xfs_alloc_vextent_start_ag(args);
3599
3600	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3601	if (error) {
3602		if (error == -ENOSPC)
3603			return 0;
3604		return error;
3605	}
3606
3607	if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3608	    xfs_is_inode32(mp)) {
3609		target = XFS_AGB_TO_FSB(mp,
3610				((mp->m_agfrotor / rotorstep) %
3611				mp->m_sb.sb_agcount), 0);
3612		bump_rotor = 1;
3613	}
3614
3615	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3616	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3617			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3618
3619	if (bump_rotor) {
3620		if (args->agno == start_agno)
3621			mp->m_agfrotor = (mp->m_agfrotor + 1) %
3622				(mp->m_sb.sb_agcount * rotorstep);
3623		else
3624			mp->m_agfrotor = (args->agno * rotorstep + 1) %
3625				(mp->m_sb.sb_agcount * rotorstep);
3626	}
3627
3628	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3629}
3630
3631/*
3632 * Iterate from the agno indicated via @target through to the end of the
3633 * filesystem attempting blocking allocation. This does not wrap or try a second
3634 * pass, so will not recurse into AGs lower than indicated by the target.
3635 */
3636int
3637xfs_alloc_vextent_first_ag(
3638	struct xfs_alloc_arg	*args,
3639	xfs_fsblock_t		target)
3640{
3641	struct xfs_mount	*mp = args->mp;
3642	xfs_agnumber_t		minimum_agno;
3643	xfs_agnumber_t		start_agno;
3644	uint32_t		alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
3645	int			error;
3646
3647	ASSERT(args->pag == NULL);
3648
3649	args->agno = NULLAGNUMBER;
3650	args->agbno = NULLAGBLOCK;
3651
3652	trace_xfs_alloc_vextent_first_ag(args);
3653
3654	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3655	if (error) {
3656		if (error == -ENOSPC)
3657			return 0;
3658		return error;
3659	}
3660
3661	start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
3662	error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
3663			XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
3664	return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
3665}
3666
3667/*
3668 * Allocate at the exact block target or fail. Caller is expected to hold a
3669 * perag reference in args->pag.
3670 */
3671int
3672xfs_alloc_vextent_exact_bno(
3673	struct xfs_alloc_arg	*args,
3674	xfs_fsblock_t		target)
3675{
3676	struct xfs_mount	*mp = args->mp;
3677	xfs_agnumber_t		minimum_agno;
3678	int			error;
3679
3680	ASSERT(args->pag != NULL);
3681	ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3682
3683	args->agno = XFS_FSB_TO_AGNO(mp, target);
3684	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3685
3686	trace_xfs_alloc_vextent_exact_bno(args);
3687
3688	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3689	if (error) {
3690		if (error == -ENOSPC)
3691			return 0;
3692		return error;
3693	}
3694
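	/*
	 * Attempt the allocation in the target AG only - there is no trylock
	 * first pass here and no fallback to other AGs if the exact extent is
	 * not free.
	 */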
3695	error = xfs_alloc_vextent_prepare_ag(args, 0);
3696	if (!error && args->agbp)
3697		error = xfs_alloc_ag_vextent_exact(args);
3698
3699	return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
3700}
3701
3702/*
3703 * Allocate an extent as close to the target as possible. If there are no
3704 * viable candidates in the AG, then fail the allocation.
3705 *
3706 * Caller may or may not have a per-ag reference in args->pag.
3707 */
3708int
3709xfs_alloc_vextent_near_bno(
3710	struct xfs_alloc_arg	*args,
3711	xfs_fsblock_t		target)
3712{
3713	struct xfs_mount	*mp = args->mp;
3714	xfs_agnumber_t		minimum_agno;
3715	bool			needs_perag = args->pag == NULL;
3716	uint32_t		alloc_flags = 0;
3717	int			error;
3718
3719	if (!needs_perag)
3720		ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3721
3722	args->agno = XFS_FSB_TO_AGNO(mp, target);
3723	args->agbno = XFS_FSB_TO_AGBNO(mp, target);
3724
3725	trace_xfs_alloc_vextent_near_bno(args);
3726
3727	error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
3728	if (error) {
3729		if (error == -ENOSPC)
3730			return 0;
3731		return error;
3732	}
3733
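	/*
	 * If the caller did not supply a perag reference for the target AG,
	 * grab one here; xfs_alloc_vextent_finish() will release it again
	 * because we pass needs_perag to it.
	 */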
3734	if (needs_perag)
3735		args->pag = xfs_perag_grab(mp, args->agno);
3736
3737	error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
3738	if (!error && args->agbp)
3739		error = xfs_alloc_ag_vextent_near(args, alloc_flags);
3740
3741	return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
3742}
3743
3744/* Ensure that the freelist is at full capacity. */
3745int
3746xfs_free_extent_fix_freelist(
3747	struct xfs_trans	*tp,
3748	struct xfs_perag	*pag,
3749	struct xfs_buf		**agbp)
3750{
3751	struct xfs_alloc_arg	args;
3752	int			error;
3753
3754	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3755	args.tp = tp;
3756	args.mp = tp->t_mountp;
3757	args.agno = pag->pag_agno;
3758	args.pag = pag;
3759
3760	/*
3761	 * Validate that the AG number is legal - this enables us to detect
3762	 * and handle silent filesystem corruption rather than crashing.
3763	 */
3764	if (args.agno >= args.mp->m_sb.sb_agcount)
3765		return -EFSCORRUPTED;
3766
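	/*
	 * XFS_ALLOC_FLAG_FREEING tells the freelist fixup that we are about to
	 * free space, so failing to refill the AGFL from a full AG is not
	 * treated as ENOSPC here.
	 */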
3767	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3768	if (error)
3769		return error;
3770
3771	*agbp = args.agbp;
3772	return 0;
3773}
3774
3775/*
3776 * Free an extent.
3777 * Just break up the extent address and hand off to xfs_free_ag_extent
3778 * after fixing up the freelist.
3779 */
3780int
3781__xfs_free_extent(
3782	struct xfs_trans		*tp,
3783	struct xfs_perag		*pag,
3784	xfs_agblock_t			agbno,
3785	xfs_extlen_t			len,
3786	const struct xfs_owner_info	*oinfo,
3787	enum xfs_ag_resv_type		type,
3788	bool				skip_discard)
3789{
3790	struct xfs_mount		*mp = tp->t_mountp;
3791	struct xfs_buf			*agbp;
3792	struct xfs_agf			*agf;
3793	int				error;
3794	unsigned int			busy_flags = 0;
3795
3796	ASSERT(len != 0);
3797	ASSERT(type != XFS_AG_RESV_AGFL);
3798
3799	if (XFS_TEST_ERROR(false, mp,
3800			XFS_ERRTAG_FREE_EXTENT))
3801		return -EIO;
3802
3803	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3804	if (error)
3805		return error;
3806	agf = agbp->b_addr;
3807
3808	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3809		error = -EFSCORRUPTED;
3810		goto err_release;
3811	}
3812
3813	/* validate the extent size is legal now that we have the AGF locked */
3814	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3815		error = -EFSCORRUPTED;
3816		goto err_release;
3817	}
3818
3819	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
3820			type);
3821	if (error)
3822		goto err_release;
3823
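	/*
	 * Mark the freed range busy so it cannot be reallocated and
	 * overwritten before the free is committed to the log. Discard of the
	 * range may be suppressed by the caller via skip_discard.
	 */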
3824	if (skip_discard)
3825		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3826	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3827	return 0;
3828
3829err_release:
3830	xfs_trans_brelse(tp, agbp);
3831	return error;
3832}
3833
3834struct xfs_alloc_query_range_info {
3835	xfs_alloc_query_range_fn	fn;
3836	void				*priv;
3837};
3838
3839/* Format btree record and pass to our callback. */
3840STATIC int
3841xfs_alloc_query_range_helper(
3842	struct xfs_btree_cur		*cur,
3843	const union xfs_btree_rec	*rec,
3844	void				*priv)
3845{
3846	struct xfs_alloc_query_range_info	*query = priv;
3847	struct xfs_alloc_rec_incore		irec;
3848	xfs_failaddr_t				fa;
3849
3850	xfs_alloc_btrec_to_irec(rec, &irec);
3851	fa = xfs_alloc_check_irec(cur, &irec);
3852	if (fa)
3853		return xfs_alloc_complain_bad_rec(cur, fa, &irec);
3854
3855	return query->fn(cur, &irec, query->priv);
3856}
3857
3858/* Find all free space within a given range of blocks. */
3859int
3860xfs_alloc_query_range(
3861	struct xfs_btree_cur			*cur,
3862	const struct xfs_alloc_rec_incore	*low_rec,
3863	const struct xfs_alloc_rec_incore	*high_rec,
3864	xfs_alloc_query_range_fn		fn,
3865	void					*priv)
3866{
3867	union xfs_btree_irec			low_brec = { .a = *low_rec };
3868	union xfs_btree_irec			high_brec = { .a = *high_rec };
3869	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
3870
3871	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3872	return xfs_btree_query_range(cur, &low_brec, &high_brec,
3873			xfs_alloc_query_range_helper, &query);
3874}
3875
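/*
 * Example callback (illustrative sketch only, not part of this file): sum the
 * length of every free extent returned, accumulating into the caller-supplied
 * @priv cookie:
 *
 *	static int
 *	xfs_sum_free_blocks(
 *		struct xfs_btree_cur			*cur,
 *		const struct xfs_alloc_rec_incore	*rec,
 *		void					*priv)
 *	{
 *		xfs_extlen_t	*blocks = priv;
 *
 *		*blocks += rec->ar_blockcount;
 *		return 0;
 *	}
 *
 * which could be passed as @fn to xfs_alloc_query_range() or
 * xfs_alloc_query_all().
 */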
3876/* Find all free space records. */
3877int
3878xfs_alloc_query_all(
3879	struct xfs_btree_cur			*cur,
3880	xfs_alloc_query_range_fn		fn,
3881	void					*priv)
3882{
3883	struct xfs_alloc_query_range_info	query;
3884
3885	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3886	query.priv = priv;
3887	query.fn = fn;
3888	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3889}
3890
3891/*
3892 * Scan part of the keyspace of the free space btree and tell us if the area
3893 * has no records, is fully mapped by records, or is partially filled.
3894 */
3895int
3896xfs_alloc_has_records(
3897	struct xfs_btree_cur	*cur,
3898	xfs_agblock_t		bno,
3899	xfs_extlen_t		len,
3900	enum xbtree_recpacking	*outcome)
3901{
3902	union xfs_btree_irec	low;
3903	union xfs_btree_irec	high;
3904
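	/*
	 * Build the lookup keys: only the startblock matters, so fill the high
	 * key with 0xFF bytes to make all other key fields maximal and cover
	 * startblocks bno through bno + len - 1 inclusive.
	 */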
3905	memset(&low, 0, sizeof(low));
3906	low.a.ar_startblock = bno;
3907	memset(&high, 0xFF, sizeof(high));
3908	high.a.ar_startblock = bno + len - 1;
3909
3910	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
3911}
3912
3913/*
3914 * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3915 * error code or XFS_ITER_*.
3916 */
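/*
 * Example walk_fn (illustrative sketch only, not part of this file) that
 * counts the AGFL entries via the opaque @priv cookie:
 *
 *	static int
 *	xfs_count_agfl_blocks(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		unsigned int		*count = priv;
 *
 *		(*count)++;
 *		return 0;
 *	}
 */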
3917int
3918xfs_agfl_walk(
3919	struct xfs_mount	*mp,
3920	struct xfs_agf		*agf,
3921	struct xfs_buf		*agflbp,
3922	xfs_agfl_walk_fn	walk_fn,
3923	void			*priv)
3924{
3925	__be32			*agfl_bno;
3926	unsigned int		i;
3927	int			error;
3928
3929	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3930	i = be32_to_cpu(agf->agf_flfirst);
3931
3932	/* Nothing to walk in an empty AGFL. */
3933	if (agf->agf_flcount == cpu_to_be32(0))
3934		return 0;
3935
3936	/* Otherwise, walk from first to last, wrapping as needed. */
3937	for (;;) {
3938		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3939		if (error)
3940			return error;
3941		if (i == be32_to_cpu(agf->agf_fllast))
3942			break;
3943		if (++i == xfs_agfl_size(mp))
3944			i = 0;
3945	}
3946
3947	return 0;
3948}
3949
3950int __init
3951xfs_extfree_intent_init_cache(void)
3952{
3953	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
3954			sizeof(struct xfs_extent_free_item),
3955			0, 0, NULL);
3956
3957	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
3958}
3959
3960void
3961xfs_extfree_intent_destroy_cache(void)
3962{
3963	kmem_cache_destroy(xfs_extfree_item_cache);
3964	xfs_extfree_item_cache = NULL;
3965}
3966