// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"

extern kmem_zone_t	*xfs_bmap_free_item_zone;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
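/* e.g. XFS_ABSDIFF(3, 7) == XFS_ABSDIFF(7, 3) == 4 */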

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2

STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);

/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots
 * at the beginning of the block for a proper header with the location
 * information and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_sb_version_hascrc(&mp->m_sb))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}
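/*
 * For example, on a CRC-enabled filesystem with 512-byte sectors the 36-byte
 * struct xfs_agfl header leaves (512 - 36) / 4 = 119 AGFL slots per block.
 */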

unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		return xfs_refc_block(mp) + 1;
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}
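/*
 * For example, with reflink enabled the static btree roots occupy
 * consecutive AG blocks, so the first block past the statically
 * allocated metadata is xfs_refc_block(mp) + 1.
 */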

/*
 * In order to avoid an ENOSPC-related deadlock caused by out-of-order locking
 * of the AGF buffer (PV 947395), we place constraints on the relationship
 * among actual allocations for data blocks, freelist blocks, and potential
 * file data bmap btree blocks. However, these restrictions may result in no
 * actual space being allocated for a delayed extent; for example, a data
 * block in a certain AG is allocated but there is no additional block for the
 * additional bmap btree block needed by a split of the file's bmap btree.
 * This can lead to an infinite loop when the file gets flushed to disk and
 * all delayed extents need to be actually allocated. To get around this, we
 * explicitly set aside a few blocks which will not be reserved in delayed
 * allocation.
 *
 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
 * potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
}
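/*
 * For example (hypothetical geometry): with XFS_ALLOC_AGFL_RESERVE == 4 and
 * sb_agcount == 16, we set aside 16 * (4 + 4) = 128 blocks filesystem-wide.
 */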

/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These blocks
 * are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the finobt, rmapbt and refcountbt root blocks
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOC_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		blocks++;		/* finobt root block */
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		blocks++;		/* rmap root block */
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}
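/*
 * For example (hypothetical geometry): with 4k blocks and 512-byte sectors
 * the four sector-sized AG headers round up to a single fsb, so with finobt,
 * rmapbt and reflink all enabled each AG loses
 * 1 + XFS_ALLOC_AGFL_RESERVE + 3 + 3 blocks to metadata.
 */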

/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
STATIC int				/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int				/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && cur->bc_ag.abt.active;
}

/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_alloc_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		*bno,	/* output: starting block of extent */
	xfs_extlen_t		*len,	/* output: length of extent */
	int			*stat)	/* output: success/failure */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agnumber_t		agno = cur->bc_ag.agno;
	union xfs_btree_rec	*rec;
	int			error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || !(*stat))
		return error;

	*bno = be32_to_cpu(rec->alloc.ar_startblock);
	*len = be32_to_cpu(rec->alloc.ar_blockcount);

	if (*len == 0)
		goto out_bad_rec;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbno(mp, agno, *bno))
		goto out_bad_rec;
	if (*bno > *bno + *len)
		goto out_bad_rec;
	if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
		goto out_bad_rec;

	return 0;

out_bad_rec:
	xfs_warn(mp,
		"%s Freespace BTree record corruption in AG %d detected!",
		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
	xfs_warn(mp,
		"start block 0x%x block count 0x%x", *bno, *len);
	return -EFSCORRUPTED;
}

/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
STATIC bool
xfs_alloc_compute_aligned(
	xfs_alloc_arg_t	*args,		/* allocation argument structure */
	xfs_agblock_t	foundbno,	/* starting block in found extent */
	xfs_extlen_t	foundlen,	/* length in found extent */
	xfs_agblock_t	*resbno,	/* result block number */
	xfs_extlen_t	*reslen,	/* result length */
	unsigned	*busy_gen)
{
	xfs_agblock_t	bno = foundbno;
	xfs_extlen_t	len = foundlen;
	xfs_extlen_t	diff;
	bool		busy;

	/* Trim busy sections out of found extent */
	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

	/*
	 * If we have a largish extent that happens to start before min_agbno,
	 * see if we can shift it into range...
	 */
	if (bno < args->min_agbno && bno + len > args->min_agbno) {
		diff = args->min_agbno - bno;
		if (len > diff) {
			bno += diff;
			len -= diff;
		}
	}

	if (args->alignment > 1 && len >= args->minlen) {
		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);

		diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}

	return busy;
}
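/*
 * For example (hypothetical numbers): a found extent [bno = 7, len = 10]
 * with args->alignment == 4 yields *resbno = 8 and *reslen = 9; the single
 * leading block is sacrificed to meet the alignment.
 */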

/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t			/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1=0;	/* length with newbno1 */
	xfs_extlen_t	newlen2=0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before the desired block. The second case is there to
	 * allow for contiguous allocation from the remaining free space if
	 * the file grows in the short term.
	 */
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}
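/*
 * For example (hypothetical numbers): with wantbno = 100, wantlen = 10 and
 * alignment = 1, a free extent [bno = 50, len = 200] covers the target and
 * we return *newbnop = 100 with diff 0, while a free extent [bno = 120,
 * len = 20] lies past the target, so we allocate from its start and
 * return diff 20.
 */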

/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}
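/*
 * For example (hypothetical numbers): with prod = 4, mod = 1 and len = 10,
 * k = 10 % 4 = 2 > mod, so the length is trimmed to 10 - (2 - 1) = 9,
 * which satisfies 9 = 2 * 4 + 1.
 */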

/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	xfs_btree_cur_t	*cnt_cur,	/* cursor for by-size btree */
	xfs_btree_cur_t	*bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1=0;	/* first new free length */
	xfs_extlen_t	nflen2=0;	/* second new free length */
	struct xfs_mount *mp;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs))
			return -EFSCORRUPTED;
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
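	/*
	 * For example (hypothetical numbers): freeing record [fbno = 100,
	 * flen = 10] and taking [rbno = 103, rlen = 4] out of its middle
	 * leaves two new freespace records, [100, 3] and [107, 3].
	 */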
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}
	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1))
		return -EFSCORRUPTED;
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	return 0;
}

static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}

static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	xfs_failaddr_t		fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}

const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};

/*
 * Read in the allocation group free block array.
 */
int					/* error */
xfs_alloc_read_agfl(
	xfs_mount_t	*mp,		/* mount point structure */
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_agnumber_t	agno,		/* allocation group number */
	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
{
	xfs_buf_t	*bp;		/* return value */
	int		error;

	ASSERT(agno != NULLAGNUMBER);
	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}

STATIC int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	xfs_trans_agblocks_delta(tp, len);
	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}

/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur		*cnt;	/* btree cursors */
	struct xfs_btree_cur		*bnolt;
	struct xfs_btree_cur		*bnogt;
	xfs_extlen_t			cur_len;/* current search length */
	xfs_agblock_t			rec_bno;/* extent startblock */
	xfs_extlen_t			rec_len;/* extent length */
	xfs_agblock_t			bno;	/* alloc bno */
	xfs_extlen_t			len;	/* alloc len */
	xfs_extlen_t			diff;	/* diff from search bno */
	unsigned int			busy_gen;/* busy state */
	bool				busy;
};

/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
	 */
	if (!acur->cnt)
		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->agno, XFS_BTNUM_CNT);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->agno, XFS_BTNUM_BNO);
	if (!acur->bnogt)
		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->agno, XFS_BTNUM_BNO);
	return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1))
		return -EFSCORRUPTED;

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation mode.
	 */
	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation. This only deactivates
	 * the current cursor, but this is just an optimization to terminate a
	 * cntbt search that otherwise runs to the edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_ag.abt.active = false;
	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
				  *new);
	return 0;
}

/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}

/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg		*args,
	struct xfs_alloc_cur		*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
		if (error)
			return error;
	}

	/*
	 * Increment the search key by one if we haven't yet found an
	 * allocation candidate, or if the record we just found was already
	 * larger than the doubled key. Otherwise, double the search key to
	 * optimize the search. Efficiency is more important here than
	 * absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;

	return error;
}

/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist if
 * there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;

	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}

/*
 * Allocate a variable extent in the allocation group agno.
 * Type and bno are used to determine where in the allocation group the
 * extent will start.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent(
	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
{
	int		error=0;

	ASSERT(args->minlen > 0);
	ASSERT(args->maxlen > 0);
	ASSERT(args->minlen <= args->maxlen);
	ASSERT(args->mod < args->prod);
	ASSERT(args->alignment > 0);

	/*
	 * Branch to correct routine based on the type.
	 */
	args->wasfromfl = 0;
	switch (args->type) {
	case XFS_ALLOCTYPE_THIS_AG:
		error = xfs_alloc_ag_vextent_size(args);
		break;
	case XFS_ALLOCTYPE_NEAR_BNO:
		error = xfs_alloc_ag_vextent_near(args);
		break;
	case XFS_ALLOCTYPE_THIS_BNO:
		error = xfs_alloc_ag_vextent_exact(args);
		break;
	default:
		ASSERT(0);
		/* NOTREACHED */
	}

	if (error || args->agbno == NULLAGBLOCK)
		return error;

	ASSERT(args->len >= args->minlen);
	ASSERT(args->len <= args->maxlen);
	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
	ASSERT(args->agbno % args->alignment == 0);

	/* if not file data, insert new block into the reverse map btree */
	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
		error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
				       args->agbno, args->len, &args->oinfo);
		if (error)
			return error;
	}

	if (!args->wasfromfl) {
		error = xfs_alloc_update_counters(args->tp, args->agbp,
						  -((long)(args->len)));
		if (error)
			return error;

		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
					      args->agbno, args->len));
	}

	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);

	XFS_STATS_INC(args->mp, xs_allocx);
	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
	return error;
}

/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->agno, XFS_BTNUM_BNO);

	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno; it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be the smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
		args->agno, XFS_BTNUM_CNT);
	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}

/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,    /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;

	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_ag.abt.active = false;

		if (count > 0)
			count--;
	}

	return 0;
}

/*
 * Search the by-bno and by-size btrees in parallel in search of an extent with
 * ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);
	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand, the
	 * bnobt is able to satisfy most smaller allocation requests much more
	 * quickly than the cntbt. The cntbt search can sift through fragmented
	 * free space and sets of free extents for larger allocation requests
	 * more quickly than the bnobt. Since the locality hint is just a hint
	 * and we don't want to scan the entire bnobt for perfect locality, the
	 * cntbt search essentially bounds the bnobt search such that we can
	 * find good enough locality at reasonable performance in most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);

		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_ag.abt.active = true;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}

/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (prandom_u32() & 1)
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks.  If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_ptrs[0] = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1))
				return -EFSCORRUPTED;
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work.  We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}

/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
				&len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * First algorithm.
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree.  If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
				&allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			trace_xfs_alloc_near_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag,
					      acur.busy_gen);
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}

/*
 * Allocate a variable extent anywhere in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int				/* error */
xfs_alloc_ag_vextent_size(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	struct xfs_agf	*agf = args->agbp->b_addr;
	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
	int		error;		/* error result */
	xfs_agblock_t	fbno;		/* start of found freespace */
	xfs_extlen_t	flen;		/* length of found freespace */
	int		i;		/* temp status variable */
	xfs_agblock_t	rbno;		/* returned block number */
	xfs_extlen_t	rlen;		/* length of returned extent */
	bool		busy;
	unsigned	busy_gen;

restart:
	/*
	 * Allocate and initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
		args->agno, XFS_BTNUM_CNT);
	bno_cur = NULL;
	busy = false;

	/*
	 * Look for an entry >= maxlen+alignment-1 blocks.
	 */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
			args->maxlen + args->alignment - 1, &i)))
		goto error0;

	/*
	 * If none then we have to settle for a smaller extent. In the case that
	 * there are no large extents, this will return the last entry in the
	 * tree unless the tree is empty. In the case that there are only busy
	 * large extents, this will return the largest small extent unless there
	 * are no smaller extents available.
	 */
	if (!i) {
		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
						   &fbno, &flen, &i);
		if (error)
			goto error0;
		if (i == 0 || flen == 0) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_noentry(args);
			return 0;
		}
		ASSERT(i == 1);
		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
				&rlen, &busy_gen);
	} else {
		/*
		 * Search for a non-busy extent that is large enough.
		 */
		for (;;) {
			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}

			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);

			if (rlen >= args->maxlen)
				break;

			error = xfs_btree_increment(cnt_cur, 0, &i);
			if (error)
				goto error0;
			if (i == 0) {
				/*
				 * Our only valid extents must have been busy.
				 * Make it unbusy by forcing the log out and
				 * retrying.
				 */
				xfs_btree_del_cursor(cnt_cur,
						     XFS_BTREE_NOERROR);
				trace_xfs_alloc_size_busy(args);
				xfs_extent_busy_flush(args->mp,
							args->pag, busy_gen);
				goto restart;
			}
		}
	}

	/*
	 * In the first case above, we got the last entry in the
	 * by-size btree.  Now we check to see if the space hits maxlen
	 * once aligned; if not, we search left for something better.
	 * This can't happen in the second case above.
	 */
	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
	if (XFS_IS_CORRUPT(args->mp,
			   rlen != 0 &&
			   (rlen > flen ||
			    rbno + rlen > fbno + flen))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (rlen < args->maxlen) {
		xfs_agblock_t	bestfbno;
		xfs_extlen_t	bestflen;
		xfs_agblock_t	bestrbno;
		xfs_extlen_t	bestrlen;

		bestrlen = rlen;
		bestrbno = rbno;
		bestflen = flen;
		bestfbno = fbno;
		for (;;) {
			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
				goto error0;
			if (i == 0)
				break;
			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (flen < bestrlen)
				break;
			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);
			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
			if (XFS_IS_CORRUPT(args->mp,
					   rlen != 0 &&
					   (rlen > flen ||
					    rbno + rlen > fbno + flen))) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (rlen > bestrlen) {
				bestrlen = rlen;
				bestrbno = rbno;
				bestflen = flen;
				bestfbno = fbno;
				if (rlen == args->maxlen)
					break;
			}
		}
		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
				&i)))
			goto error0;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		rlen = bestrlen;
		rbno = bestrbno;
		flen = bestflen;
		fbno = bestfbno;
	}
	args->wasfromfl = 0;
	/*
	 * Fix up the length.
	 */
	args->len = rlen;
	if (rlen < args->minlen) {
		if (busy) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
			goto restart;
		}
		goto out_nominleft;
	}
	xfs_alloc_fix_len(args);

	rlen = args->len;
	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Allocate and initialize a cursor for the by-block tree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
		args->agno, XFS_BTNUM_BNO);
	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
			rbno, rlen, XFSA_FIXUP_CNT_OK)))
		goto error0;
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	cnt_cur = bno_cur = NULL;
	args->len = rlen;
	args->agbno = rbno;
	if (XFS_IS_CORRUPT(args->mp,
			   args->agbno + args->len >
			   be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	trace_xfs_alloc_size_done(args);
	return 0;

error0:
	trace_xfs_alloc_size_error(args);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	return error;

out_nominleft:
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	trace_xfs_alloc_size_nominleft(args);
	args->agbno = NULLAGBLOCK;
	return 0;
}

/*
 * Free the extent starting at agno/bno for length.
 */
STATIC int
xfs_free_ag_extent(
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agnumber_t			agno,
	xfs_agblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xfs_mount		*mp;
	struct xfs_btree_cur		*bno_cur;
	struct xfs_btree_cur		*cnt_cur;
	xfs_agblock_t			gtbno; /* start of right neighbor */
	xfs_extlen_t			gtlen; /* length of right neighbor */
	xfs_agblock_t			ltbno; /* start of left neighbor */
	xfs_extlen_t			ltlen; /* length of left neighbor */
	xfs_agblock_t			nbno; /* new starting block of freesp */
	xfs_extlen_t			nlen; /* new length of freespace */
	int				haveleft; /* have a left neighbor */
	int				haveright; /* have a right neighbor */
	int				i;
	int				error;

	bno_cur = cnt_cur = NULL;
	mp = tp->t_mountp;

	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
		error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
		if (error)
			goto error0;
	}

	/*
	 * Allocate and initialize a cursor for the by-block btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
	/*
	 * Look for a neighboring block on the left (lower block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
		goto error0;
	if (haveleft) {
		/*
		 * There is a block to our left.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (ltbno + ltlen < bno)
			haveleft = 0;
		else {
1936			/*
1937			 * If this failure happens the request to free this
1938			 * space was invalid, it's (partly) already free.
1939			 * Very bad.
1940			 */
1941			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1942				error = -EFSCORRUPTED;
1943				goto error0;
1944			}
1945		}
1946	}
1947	/*
1948	 * Look for a neighboring block on the right (higher block numbers)
1949	 * that is contiguous with this space.
1950	 */
1951	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1952		goto error0;
1953	if (haveright) {
1954		/*
1955		 * There is a block to our right.
1956		 */
1957		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1958			goto error0;
1959		if (XFS_IS_CORRUPT(mp, i != 1)) {
1960			error = -EFSCORRUPTED;
1961			goto error0;
1962		}
1963		/*
1964		 * It's not contiguous, though.
1965		 */
1966		if (bno + len < gtbno)
1967			haveright = 0;
1968		else {
1969			/*
1970			 * If this failure happens the request to free this
1971			 * space was invalid, it's (partly) already free.
1972			 * Very bad.
1973			 */
1974			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1975				error = -EFSCORRUPTED;
1976				goto error0;
1977			}
1978		}
1979	}
1980	/*
1981	 * Now allocate and initialize a cursor for the by-size tree.
1982	 */
1983	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1984	/*
1985	 * Have both left and right contiguous neighbors.
1986	 * Merge all three into a single free block.
1987	 */
1988	if (haveleft && haveright) {
1989		/*
1990		 * Delete the old by-size entry on the left.
1991		 */
1992		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1993			goto error0;
1994		if (XFS_IS_CORRUPT(mp, i != 1)) {
1995			error = -EFSCORRUPTED;
1996			goto error0;
1997		}
1998		if ((error = xfs_btree_delete(cnt_cur, &i)))
1999			goto error0;
2000		if (XFS_IS_CORRUPT(mp, i != 1)) {
2001			error = -EFSCORRUPTED;
2002			goto error0;
2003		}
2004		/*
2005		 * Delete the old by-size entry on the right.
2006		 */
2007		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2008			goto error0;
2009		if (XFS_IS_CORRUPT(mp, i != 1)) {
2010			error = -EFSCORRUPTED;
2011			goto error0;
2012		}
2013		if ((error = xfs_btree_delete(cnt_cur, &i)))
2014			goto error0;
2015		if (XFS_IS_CORRUPT(mp, i != 1)) {
2016			error = -EFSCORRUPTED;
2017			goto error0;
2018		}
2019		/*
2020		 * Delete the old by-block entry for the right block.
2021		 */
2022		if ((error = xfs_btree_delete(bno_cur, &i)))
2023			goto error0;
2024		if (XFS_IS_CORRUPT(mp, i != 1)) {
2025			error = -EFSCORRUPTED;
2026			goto error0;
2027		}
2028		/*
2029		 * Move the by-block cursor back to the left neighbor.
2030		 */
2031		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2032			goto error0;
2033		if (XFS_IS_CORRUPT(mp, i != 1)) {
2034			error = -EFSCORRUPTED;
2035			goto error0;
2036		}
2037#ifdef DEBUG
2038		/*
2039		 * Check that this is the right record: delete didn't
2040		 * mangle the cursor.
2041		 */
2042		{
2043			xfs_agblock_t	xxbno;
2044			xfs_extlen_t	xxlen;
2045
2046			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2047					&i)))
2048				goto error0;
2049			if (XFS_IS_CORRUPT(mp,
2050					   i != 1 ||
2051					   xxbno != ltbno ||
2052					   xxlen != ltlen)) {
2053				error = -EFSCORRUPTED;
2054				goto error0;
2055			}
2056		}
2057#endif
2058		/*
2059		 * Update remaining by-block entry to the new, joined block.
2060		 */
2061		nbno = ltbno;
2062		nlen = len + ltlen + gtlen;
2063		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2064			goto error0;
2065	}
2066	/*
2067	 * Have only a left contiguous neighbor.
2068	 * Merge it together with the new freespace.
2069	 */
2070	else if (haveleft) {
2071		/*
2072		 * Delete the old by-size entry on the left.
2073		 */
2074		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2075			goto error0;
2076		if (XFS_IS_CORRUPT(mp, i != 1)) {
2077			error = -EFSCORRUPTED;
2078			goto error0;
2079		}
2080		if ((error = xfs_btree_delete(cnt_cur, &i)))
2081			goto error0;
2082		if (XFS_IS_CORRUPT(mp, i != 1)) {
2083			error = -EFSCORRUPTED;
2084			goto error0;
2085		}
2086		/*
2087		 * Back up the by-block cursor to the left neighbor, and
2088		 * update its length.
2089		 */
2090		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2091			goto error0;
2092		if (XFS_IS_CORRUPT(mp, i != 1)) {
2093			error = -EFSCORRUPTED;
2094			goto error0;
2095		}
2096		nbno = ltbno;
2097		nlen = len + ltlen;
2098		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2099			goto error0;
2100	}
2101	/*
2102	 * Have only a right contiguous neighbor.
2103	 * Merge it together with the new freespace.
2104	 */
2105	else if (haveright) {
2106		/*
2107		 * Delete the old by-size entry on the right.
2108		 */
2109		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2110			goto error0;
2111		if (XFS_IS_CORRUPT(mp, i != 1)) {
2112			error = -EFSCORRUPTED;
2113			goto error0;
2114		}
2115		if ((error = xfs_btree_delete(cnt_cur, &i)))
2116			goto error0;
2117		if (XFS_IS_CORRUPT(mp, i != 1)) {
2118			error = -EFSCORRUPTED;
2119			goto error0;
2120		}
2121		/*
2122		 * Update the starting block and length of the right
2123		 * neighbor in the by-block tree.
2124		 */
2125		nbno = bno;
2126		nlen = len + gtlen;
2127		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2128			goto error0;
2129	}
2130	/*
2131	 * No contiguous neighbors.
2132	 * Insert the new freespace into the by-block tree.
2133	 */
2134	else {
2135		nbno = bno;
2136		nlen = len;
2137		if ((error = xfs_btree_insert(bno_cur, &i)))
2138			goto error0;
2139		if (XFS_IS_CORRUPT(mp, i != 1)) {
2140			error = -EFSCORRUPTED;
2141			goto error0;
2142		}
2143	}
2144	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2145	bno_cur = NULL;
2146	/*
2147	 * In all cases we need to insert the new freespace in the by-size tree.
2148	 */
2149	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2150		goto error0;
2151	if (XFS_IS_CORRUPT(mp, i != 0)) {
2152		error = -EFSCORRUPTED;
2153		goto error0;
2154	}
2155	if ((error = xfs_btree_insert(cnt_cur, &i)))
2156		goto error0;
2157	if (XFS_IS_CORRUPT(mp, i != 1)) {
2158		error = -EFSCORRUPTED;
2159		goto error0;
2160	}
2161	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2162	cnt_cur = NULL;
2163
2164	/*
2165	 * Update the freespace totals in the ag and superblock.
2166	 */
2167	error = xfs_alloc_update_counters(tp, agbp, len);
2168	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2169	if (error)
2170		goto error0;
2171
2172	XFS_STATS_INC(mp, xs_freex);
2173	XFS_STATS_ADD(mp, xs_freeb, len);
2174
2175	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2176
2177	return 0;
2178
2179 error0:
2180	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2181	if (bno_cur)
2182		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2183	if (cnt_cur)
2184		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2185	return error;
2186}
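
/*
 * Worked example of the merge logic above: freeing agbno 50 with len 10
 * when the bnobt already holds a left neighbor [40, 10] and a right
 * neighbor [60, 20] takes the haveleft && haveright path.  Both old
 * by-size records and the right-hand by-block record are deleted, the
 * left-hand by-block record is rewritten as [40, 40], and a single cntbt
 * record of length 40 is inserted at the end.
 */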
2187
2188/*
2189 * Visible (exported) allocation/free functions.
2190 * Some of these are used just by xfs_alloc_btree.c and this file.
2191 */
2192
2193/*
2194 * Compute and fill in value of m_ag_maxlevels.
2195 */
2196void
2197xfs_alloc_compute_maxlevels(
2198	xfs_mount_t	*mp)	/* file system mount structure */
2199{
2200	mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2201			(mp->m_sb.sb_agblocks + 1) / 2);
2202}
2203
2204/*
2205 * Find the length of the longest extent in an AG.  The 'need' parameter
2206 * specifies how much space we're going to need for the AGFL and the
2207 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2208 * other callers.
2209 */
2210xfs_extlen_t
2211xfs_alloc_longest_free_extent(
2212	struct xfs_perag	*pag,
2213	xfs_extlen_t		need,
2214	xfs_extlen_t		reserved)
2215{
2216	xfs_extlen_t		delta = 0;
2217
2218	/*
2219	 * If the AGFL needs a recharge, we'll have to subtract that from the
2220	 * longest extent.
2221	 */
2222	if (need > pag->pagf_flcount)
2223		delta = need - pag->pagf_flcount;
2224
2225	/*
2226	 * If we cannot maintain others' reservations with space from the
2227	 * not-longest freesp extents, we'll have to subtract /that/ from
2228	 * the longest extent too.
2229	 */
2230	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2231		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2232
2233	/*
2234	 * If the longest extent is long enough to satisfy all the
2235	 * reservations and AGFL rules in place, we can return this extent.
2236	 */
2237	if (pag->pagf_longest > delta)
2238		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2239				pag->pagf_longest - delta);
2240
2241	/* Otherwise, let the caller try for 1 block if there's space. */
2242	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2243}
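
/*
 * Worked example for the function above (values are hypothetical): with
 * pagf_longest = 100, pagf_freeblks = 120, pagf_flcount = 2, need = 6 and
 * reserved = 30, the AGFL recharge costs delta = 6 - 2 = 4 blocks, and the
 * non-longest free space (120 - 100 = 20) falls 10 blocks short of the
 * reservation, growing delta to 14.  The usable longest extent is therefore
 * 100 - 14 = 86 blocks, further capped by m_ag_max_usable.
 */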
2244
2245/*
2246 * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2247 * return the largest possible minimum length.
2248 */
2249unsigned int
2250xfs_alloc_min_freelist(
2251	struct xfs_mount	*mp,
2252	struct xfs_perag	*pag)
2253{
2254	/* AG btrees have at least 1 level. */
2255	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2256	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
2257	unsigned int		min_free;
2258
2259	ASSERT(mp->m_ag_maxlevels > 0);
2260
2261	/* space needed by-bno freespace btree */
2262	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2263				       mp->m_ag_maxlevels);
2264	/* space needed by-size freespace btree */
2265	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2266				       mp->m_ag_maxlevels);
2267	/* space needed reverse mapping used space btree */
2268	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
2269		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2270						mp->m_rmap_maxlevels);
2271
2272	return min_free;
2273}
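
/*
 * Worked example for the sizing above: with single-level bno and cnt
 * btrees and no rmapbt, min_free = (1 + 1) + (1 + 1) = 4 blocks.  The
 * "levels + 1" term (capped at the maximum possible tree height) leaves
 * room for each tree to split all the way up to a new root out of the
 * free list.
 */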
2274
2275/*
2276 * Check if the operation we are fixing up the freelist for should go ahead or
2277 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2278 * is dependent on whether the size and shape of free space available will
2279 * permit the requested allocation to take place.
2280 */
2281static bool
2282xfs_alloc_space_available(
2283	struct xfs_alloc_arg	*args,
2284	xfs_extlen_t		min_free,
2285	int			flags)
2286{
2287	struct xfs_perag	*pag = args->pag;
2288	xfs_extlen_t		alloc_len, longest;
2289	xfs_extlen_t		reservation; /* blocks that are still reserved */
2290	int			available;
2291	xfs_extlen_t		agflcount;
2292
2293	if (flags & XFS_ALLOC_FLAG_FREEING)
2294		return true;
2295
2296	reservation = xfs_ag_resv_needed(pag, args->resv);
2297
2298	/* do we have enough contiguous free space for the allocation? */
2299	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2300	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2301	if (longest < alloc_len)
2302		return false;
2303
2304	/*
2305	 * Do we have enough free space remaining for the allocation? Don't
2306	 * account extra agfl blocks because we are about to defer freeing them,
2307	 * making them unavailable until the current transaction commits.
2308	 */
2309	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2310	available = (int)(pag->pagf_freeblks + agflcount -
2311			  reservation - min_free - args->minleft);
2312	if (available < (int)max(args->total, alloc_len))
2313		return false;
2314
2315	/*
2316	 * Clamp maxlen to the amount of free space available for the actual
2317	 * extent allocation.
2318	 */
2319	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2320		args->maxlen = available;
2321		ASSERT(args->maxlen > 0);
2322		ASSERT(args->maxlen >= args->minlen);
2323	}
2324
2325	return true;
2326}
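
/*
 * Worked example for the checks above (hypothetical numbers): with
 * minlen = 8, alignment = 4 and minalignslop = 0, alloc_len = 8 + 3 = 11,
 * so the longest usable extent must be at least 11 blocks.  With
 * pagf_freeblks = 50, pagf_flcount = 4, min_free = 6, reservation = 10 and
 * minleft = 0, available = 50 + 4 - 10 - 6 - 0 = 38 blocks; the request is
 * rejected only if max(total, alloc_len) exceeds that.
 */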
2327
2328int
2329xfs_free_agfl_block(
2330	struct xfs_trans	*tp,
2331	xfs_agnumber_t		agno,
2332	xfs_agblock_t		agbno,
2333	struct xfs_buf		*agbp,
2334	struct xfs_owner_info	*oinfo)
2335{
2336	int			error;
2337	struct xfs_buf		*bp;
2338
2339	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2340				   XFS_AG_RESV_AGFL);
2341	if (error)
2342		return error;
2343
2344	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2345			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2346			tp->t_mountp->m_bsize, 0, &bp);
2347	if (error)
2348		return error;
2349	xfs_trans_binval(tp, bp);
2350
2351	return 0;
2352}
2353
2354/*
2355 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2356 * is to detect an agfl header padding mismatch between current and early v5
2357 * kernels. This problem manifests as a 1-slot size difference between the
2358 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2359 * may also catch variants of agfl count corruption unrelated to padding. Either
2360 * way, we'll reset the agfl and warn the user.
2361 *
2362 * Return true if a reset is required before the agfl can be used, false
2363 * otherwise.
2364 */
2365static bool
2366xfs_agfl_needs_reset(
2367	struct xfs_mount	*mp,
2368	struct xfs_agf		*agf)
2369{
2370	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
2371	uint32_t		l = be32_to_cpu(agf->agf_fllast);
2372	uint32_t		c = be32_to_cpu(agf->agf_flcount);
2373	int			agfl_size = xfs_agfl_size(mp);
2374	int			active;
2375
2376	/* no agfl header on v4 supers */
2377	if (!xfs_sb_version_hascrc(&mp->m_sb))
2378		return false;
2379
2380	/*
2381	 * The agf read verifier catches severe corruption of these fields.
2382	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2383	 * the verifier allows it.
2384	 */
2385	if (f >= agfl_size || l >= agfl_size)
2386		return true;
2387	if (c > agfl_size)
2388		return true;
2389
2390	/*
2391	 * Check consistency between the on-disk count and the active range. An
2392	 * agfl padding mismatch manifests as an inconsistent flcount.
2393	 */
2394	if (c && l >= f)
2395		active = l - f + 1;
2396	else if (c)
2397		active = agfl_size - f + l + 1;
2398	else
2399		active = 0;
2400
2401	return active != c;
2402}
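
/*
 * Example of the wrapped-range arithmetic above: on a hypothetical 120-slot
 * AGFL with flfirst = 110 and fllast = 3, the active range wraps, so
 * active = 120 - 110 + 3 + 1 = 14.  A v5 padding mismatch shifts the
 * on-disk flcount by one slot relative to this computation, which is
 * exactly the inconsistency we reset on.
 */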
2403
2404/*
2405 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2406 * agfl content cannot be trusted. Warn the user that a repair is required to
2407 * recover leaked blocks.
2408 *
2409 * The purpose of this mechanism is to handle filesystems affected by the agfl
2410 * header padding mismatch problem. A reset keeps the filesystem online with a
2411 * relatively minor free space accounting inconsistency rather than suffer the
2412 * inevitable crash from use of an invalid agfl block.
2413 */
2414static void
2415xfs_agfl_reset(
2416	struct xfs_trans	*tp,
2417	struct xfs_buf		*agbp,
2418	struct xfs_perag	*pag)
2419{
2420	struct xfs_mount	*mp = tp->t_mountp;
2421	struct xfs_agf		*agf = agbp->b_addr;
2422
2423	ASSERT(pag->pagf_agflreset);
2424	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2425
2426	xfs_warn(mp,
2427	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2428	       "Please unmount and run xfs_repair.",
2429	       pag->pag_agno, pag->pagf_flcount);
2430
2431	agf->agf_flfirst = 0;
2432	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2433	agf->agf_flcount = 0;
2434	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2435				    XFS_AGF_FLCOUNT);
2436
2437	pag->pagf_flcount = 0;
2438	pag->pagf_agflreset = false;
2439}
2440
2441/*
2442 * Defer an AGFL block free. This is effectively equivalent to
2443 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2444 *
2445 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2446 * allocation operations in a transaction. AGFL frees are prone to this problem
2447 * because AGFL blocks are always freed one at a time. Further, an immediate
2448 * AGFL block free can cause a btree join and require another block free before
2449 * the real allocation can proceed. Deferring the free disconnects freeing up
2450 * the AGFL slot from freeing the block.
2451 */
2452STATIC void
2453xfs_defer_agfl_block(
2454	struct xfs_trans		*tp,
2455	xfs_agnumber_t			agno,
2456	xfs_fsblock_t			agbno,
2457	struct xfs_owner_info		*oinfo)
2458{
2459	struct xfs_mount		*mp = tp->t_mountp;
2460	struct xfs_extent_free_item	*new;		/* new element */
2461
2462	ASSERT(xfs_bmap_free_item_zone != NULL);
2463	ASSERT(oinfo != NULL);
2464
2465	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
2466			       GFP_KERNEL | __GFP_NOFAIL);
2467	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2468	new->xefi_blockcount = 1;
2469	new->xefi_oinfo = *oinfo;
2470	new->xefi_skip_discard = false;
2471
2472	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2473
2474	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
2475}
2476
2477/*
2478 * Decide whether to use this allocation group for this allocation.
2479 * If so, fix up the btree freelist's size.
2480 */
2481int			/* error */
2482xfs_alloc_fix_freelist(
2483	struct xfs_alloc_arg	*args,	/* allocation argument structure */
2484	int			flags)	/* XFS_ALLOC_FLAG_... */
2485{
2486	struct xfs_mount	*mp = args->mp;
2487	struct xfs_perag	*pag = args->pag;
2488	struct xfs_trans	*tp = args->tp;
2489	struct xfs_buf		*agbp = NULL;
2490	struct xfs_buf		*agflbp = NULL;
2491	struct xfs_alloc_arg	targs;	/* local allocation arguments */
2492	xfs_agblock_t		bno;	/* freelist block */
2493	xfs_extlen_t		need;	/* total blocks needed in freelist */
2494	int			error = 0;
2495
2496	/* deferred ops (AGFL block frees) require permanent transactions */
2497	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2498
2499	if (!pag->pagf_init) {
2500		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2501		if (error) {
2502			/* Couldn't lock the AGF so skip this AG. */
2503			if (error == -EAGAIN)
2504				error = 0;
2505			goto out_no_agbp;
2506		}
2507	}
2508
2509	/*
2510	 * If this is a metadata preferred pag and we are allocating user data,
2511	 * then try somewhere else if we are not being asked to try harder at
2512	 * this point.
2513	 */
2514	if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2515	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2516		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2517		goto out_agbp_relse;
2518	}
2519
2520	need = xfs_alloc_min_freelist(mp, pag);
2521	if (!xfs_alloc_space_available(args, need, flags |
2522			XFS_ALLOC_FLAG_CHECK))
2523		goto out_agbp_relse;
2524
2525	/*
2526	 * Get the a.g. freespace buffer.
2527	 * Can fail if we're not blocking on locks, and it's held.
2528	 */
2529	if (!agbp) {
2530		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2531		if (error) {
2532			/* Couldn't lock the AGF so skip this AG. */
2533			if (error == -EAGAIN)
2534				error = 0;
2535			goto out_no_agbp;
2536		}
2537	}
2538
2539	/* reset a padding mismatched agfl before final free space check */
2540	if (pag->pagf_agflreset)
2541		xfs_agfl_reset(tp, agbp, pag);
2542
2543	/* If there isn't enough total space or a long enough extent, reject it. */
2544	need = xfs_alloc_min_freelist(mp, pag);
2545	if (!xfs_alloc_space_available(args, need, flags))
2546		goto out_agbp_relse;
2547
2548	/*
2549	 * Make the freelist shorter if it's too long.
2550	 *
2551	 * Note that from this point onwards, we will always release the agf and
2552	 * agfl buffers on error. This handles the case where we error out and
2553	 * the buffers are clean or may not have been joined to the transaction
2554	 * and hence need to be released manually. If they have been joined to
2555	 * the transaction, then xfs_trans_brelse() will handle them
2556	 * appropriately based on the recursion count and dirty state of the
2557	 * buffer.
2558	 *
2559	 * XXX (dgc): When we have lots of free space, does this buy us
2560	 * anything other than extra overhead when we need to put more blocks
2561	 * back on the free list? Maybe we should only do this when space is
2562	 * getting low or the AGFL is more than half full?
2563	 *
2564	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2565	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2566	 * updating the rmapbt.  Both flags are used in xfs_repair while we're
2567	 * rebuilding the rmapbt, and neither are used by the kernel.  They're
2568	 * both required to ensure that rmaps are correctly recorded for the
2569	 * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2570	 * repair/rmap.c in xfsprogs for details.
2571	 */
2572	memset(&targs, 0, sizeof(targs));
2573	/* struct copy below */
2574	if (flags & XFS_ALLOC_FLAG_NORMAP)
2575		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2576	else
2577		targs.oinfo = XFS_RMAP_OINFO_AG;
2578	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2579		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2580		if (error)
2581			goto out_agbp_relse;
2582
2583		/* defer agfl frees */
2584		xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2585	}
2586
2587	targs.tp = tp;
2588	targs.mp = mp;
2589	targs.agbp = agbp;
2590	targs.agno = args->agno;
2591	targs.alignment = targs.minlen = targs.prod = 1;
2592	targs.type = XFS_ALLOCTYPE_THIS_AG;
2593	targs.pag = pag;
2594	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2595	if (error)
2596		goto out_agbp_relse;
2597
2598	/* Make the freelist longer if it's too short. */
2599	while (pag->pagf_flcount < need) {
2600		targs.agbno = 0;
2601		targs.maxlen = need - pag->pagf_flcount;
2602		targs.resv = XFS_AG_RESV_AGFL;
2603
2604		/* Allocate as many blocks as possible at once. */
2605		error = xfs_alloc_ag_vextent(&targs);
2606		if (error)
2607			goto out_agflbp_relse;
2608
2609		/*
2610		 * Stop if we run out.  Won't happen if callers are obeying
2611		 * the restrictions correctly.  Can happen for free calls
2612		 * on a completely full ag.
2613		 */
2614		if (targs.agbno == NULLAGBLOCK) {
2615			if (flags & XFS_ALLOC_FLAG_FREEING)
2616				break;
2617			goto out_agflbp_relse;
2618		}
2619		/*
2620		 * Put each allocated block on the list.
2621		 */
2622		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2623			error = xfs_alloc_put_freelist(tp, agbp,
2624							agflbp, bno, 0);
2625			if (error)
2626				goto out_agflbp_relse;
2627		}
2628	}
2629	xfs_trans_brelse(tp, agflbp);
2630	args->agbp = agbp;
2631	return 0;
2632
2633out_agflbp_relse:
2634	xfs_trans_brelse(tp, agflbp);
2635out_agbp_relse:
2636	if (agbp)
2637		xfs_trans_brelse(tp, agbp);
2638out_no_agbp:
2639	args->agbp = NULL;
2640	return error;
2641}
2642
2643/*
2644 * Get a block from the freelist.
2645 * Returns with the buffer for the block gotten.
2646 */
2647int				/* error */
2648xfs_alloc_get_freelist(
2649	xfs_trans_t	*tp,	/* transaction pointer */
2650	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
2651	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
2652	int		btreeblk) /* destination is an AGF btree */
2653{
2654	struct xfs_agf	*agf = agbp->b_addr;
2655	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
2656	xfs_agblock_t	bno;	/* block number returned */
2657	__be32		*agfl_bno;
2658	int		error;
2659	int		logflags;
2660	xfs_mount_t	*mp = tp->t_mountp;
2661	xfs_perag_t	*pag;	/* per allocation group data */
2662
2663	/*
2664	 * Freelist is empty, give up.
2665	 */
2666	if (!agf->agf_flcount) {
2667		*bnop = NULLAGBLOCK;
2668		return 0;
2669	}
2670	/*
2671	 * Read the array of free blocks.
2672	 */
2673	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2674				    &agflbp);
2675	if (error)
2676		return error;
2677
2679	/*
2680	 * Get the block number and update the data structures.
2681	 */
2682	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2683	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2684	be32_add_cpu(&agf->agf_flfirst, 1);
2685	xfs_trans_brelse(tp, agflbp);
2686	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2687		agf->agf_flfirst = 0;
2688
2689	pag = agbp->b_pag;
2690	ASSERT(!pag->pagf_agflreset);
2691	be32_add_cpu(&agf->agf_flcount, -1);
2692	xfs_trans_agflist_delta(tp, -1);
2693	pag->pagf_flcount--;
2694
2695	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2696	if (btreeblk) {
2697		be32_add_cpu(&agf->agf_btreeblks, 1);
2698		pag->pagf_btreeblks++;
2699		logflags |= XFS_AGF_BTREEBLKS;
2700	}
2701
2702	xfs_alloc_log_agf(tp, agbp, logflags);
2703	*bnop = bno;
2704
2705	return 0;
2706}
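
/*
 * The AGFL is managed as a circular array: the code above consumes the
 * slot at agf_flfirst and advances it, wrapping back to slot 0 once it
 * reaches xfs_agfl_size().  E.g. on a hypothetical 120-slot AGFL with
 * flfirst = 119, taking one block wraps flfirst to 0 and drops flcount
 * by one.
 */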
2707
2708/*
2709 * Log the given fields from the agf structure.
2710 */
2711void
2712xfs_alloc_log_agf(
2713	xfs_trans_t	*tp,	/* transaction pointer */
2714	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
2715	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
2716{
2717	int	first;		/* first byte offset */
2718	int	last;		/* last byte offset */
2719	static const short	offsets[] = {
2720		offsetof(xfs_agf_t, agf_magicnum),
2721		offsetof(xfs_agf_t, agf_versionnum),
2722		offsetof(xfs_agf_t, agf_seqno),
2723		offsetof(xfs_agf_t, agf_length),
2724		offsetof(xfs_agf_t, agf_roots[0]),
2725		offsetof(xfs_agf_t, agf_levels[0]),
2726		offsetof(xfs_agf_t, agf_flfirst),
2727		offsetof(xfs_agf_t, agf_fllast),
2728		offsetof(xfs_agf_t, agf_flcount),
2729		offsetof(xfs_agf_t, agf_freeblks),
2730		offsetof(xfs_agf_t, agf_longest),
2731		offsetof(xfs_agf_t, agf_btreeblks),
2732		offsetof(xfs_agf_t, agf_uuid),
2733		offsetof(xfs_agf_t, agf_rmap_blocks),
2734		offsetof(xfs_agf_t, agf_refcount_blocks),
2735		offsetof(xfs_agf_t, agf_refcount_root),
2736		offsetof(xfs_agf_t, agf_refcount_level),
2737		/* needed so that we don't log the whole rest of the structure: */
2738		offsetof(xfs_agf_t, agf_spare64),
2739		sizeof(xfs_agf_t)
2740	};
2741
2742	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2743
2744	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2745
2746	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2747	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2748}
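
/*
 * Note that the region logged above is a single contiguous byte range: for
 * example, fields = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT resolves to the span
 * from offsetof(agf_flfirst) through offsetof(agf_freeblks) - 1, which also
 * carries the agf_fllast field sitting between the two requested bits.
 */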
2749
2750/*
2751 * Interface for inode allocation to force the pag data to be initialized.
2752 */
2753int					/* error */
2754xfs_alloc_pagf_init(
2755	xfs_mount_t		*mp,	/* file system mount structure */
2756	xfs_trans_t		*tp,	/* transaction pointer */
2757	xfs_agnumber_t		agno,	/* allocation group number */
2758	int			flags)	/* XFS_ALLOC_FLAGS_... */
2759{
2760	xfs_buf_t		*bp;
2761	int			error;
2762
2763	error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
2764	if (!error)
2765		xfs_trans_brelse(tp, bp);
2766	return error;
2767}
2768
2769/*
2770 * Put the block on the freelist for the allocation group.
2771 */
2772int					/* error */
2773xfs_alloc_put_freelist(
2774	xfs_trans_t		*tp,	/* transaction pointer */
2775	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
2776	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
2777	xfs_agblock_t		bno,	/* block being freed */
2778	int			btreeblk) /* block came from an AGF btree */
2779{
2780	struct xfs_mount	*mp = tp->t_mountp;
2781	struct xfs_agf		*agf = agbp->b_addr;
2782	__be32			*blockp;/* pointer to array entry */
2783	int			error;
2784	int			logflags;
2785	xfs_perag_t		*pag;	/* per allocation group data */
2786	__be32			*agfl_bno;
2787	int			startoff;
2788
2789	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2790			be32_to_cpu(agf->agf_seqno), &agflbp)))
2791		return error;
2792	be32_add_cpu(&agf->agf_fllast, 1);
2793	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2794		agf->agf_fllast = 0;
2795
2796	pag = agbp->b_pag;
2797	ASSERT(!pag->pagf_agflreset);
2798	be32_add_cpu(&agf->agf_flcount, 1);
2799	xfs_trans_agflist_delta(tp, 1);
2800	pag->pagf_flcount++;
2801
2802	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2803	if (btreeblk) {
2804		be32_add_cpu(&agf->agf_btreeblks, -1);
2805		pag->pagf_btreeblks--;
2806		logflags |= XFS_AGF_BTREEBLKS;
2807	}
2808
2809	xfs_alloc_log_agf(tp, agbp, logflags);
2810
2811	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2812
2813	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2814	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2815	*blockp = cpu_to_be32(bno);
2816	startoff = (char *)blockp - (char *)agflbp->b_addr;
2817
2818	xfs_alloc_log_agf(tp, agbp, logflags);
2819
2820	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2821	xfs_trans_log_buf(tp, agflbp, startoff,
2822			  startoff + sizeof(xfs_agblock_t) - 1);
2823	return 0;
2824}
2825
2826static xfs_failaddr_t
2827xfs_agf_verify(
2828	struct xfs_buf		*bp)
2829{
2830	struct xfs_mount	*mp = bp->b_mount;
2831	struct xfs_agf		*agf = bp->b_addr;
2832
2833	if (xfs_sb_version_hascrc(&mp->m_sb)) {
2834		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2835			return __this_address;
2836		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2837			return __this_address;
2838	}
2839
2840	if (!xfs_verify_magic(bp, agf->agf_magicnum))
2841		return __this_address;
2842
2843	if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2844	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2845	      be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2846	      be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2847	      be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2848		return __this_address;
2849
2850	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2851		return __this_address;
2852
2853	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2854	    be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2855		return __this_address;
2856
2857	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2858	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2859	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
2860	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
2861		return __this_address;
2862
2863	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2864	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2865	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
2866		return __this_address;
2867
2868	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2869	    be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2870		return __this_address;
2871
2872	/*
2873	 * during growfs operations, the perag is not fully initialised,
2874	 * so we can't use it for any useful checking. growfs ensures we can't
2875	 * use it by using uncached buffers that don't have the perag attached
2876	 * so we can detect and avoid this problem.
2877	 */
2878	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2879		return __this_address;
2880
2881	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2882	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2883		return __this_address;
2884
2885	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2886	    be32_to_cpu(agf->agf_refcount_blocks) >
2887	    be32_to_cpu(agf->agf_length))
2888		return __this_address;
2889
2890	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2891	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2892	     be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
2893		return __this_address;
2894
2895	return NULL;
2897}
2898
2899static void
2900xfs_agf_read_verify(
2901	struct xfs_buf	*bp)
2902{
2903	struct xfs_mount *mp = bp->b_mount;
2904	xfs_failaddr_t	fa;
2905
2906	if (xfs_sb_version_hascrc(&mp->m_sb) &&
2907	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2908		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2909	else {
2910		fa = xfs_agf_verify(bp);
2911		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2912			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2913	}
2914}
2915
2916static void
2917xfs_agf_write_verify(
2918	struct xfs_buf	*bp)
2919{
2920	struct xfs_mount	*mp = bp->b_mount;
2921	struct xfs_buf_log_item	*bip = bp->b_log_item;
2922	struct xfs_agf		*agf = bp->b_addr;
2923	xfs_failaddr_t		fa;
2924
2925	fa = xfs_agf_verify(bp);
2926	if (fa) {
2927		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2928		return;
2929	}
2930
2931	if (!xfs_sb_version_hascrc(&mp->m_sb))
2932		return;
2933
2934	if (bip)
2935		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2936
2937	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2938}
2939
2940const struct xfs_buf_ops xfs_agf_buf_ops = {
2941	.name = "xfs_agf",
2942	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2943	.verify_read = xfs_agf_read_verify,
2944	.verify_write = xfs_agf_write_verify,
2945	.verify_struct = xfs_agf_verify,
2946};
2947
2948/*
2949 * Read in the allocation group header (free/alloc section).
2950 */
2951int					/* error */
2952xfs_read_agf(
2953	struct xfs_mount	*mp,	/* mount point structure */
2954	struct xfs_trans	*tp,	/* transaction pointer */
2955	xfs_agnumber_t		agno,	/* allocation group number */
2956	int			flags,	/* XFS_BUF_ */
2957	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
2958{
2959	int		error;
2960
2961	trace_xfs_read_agf(mp, agno);
2962
2963	ASSERT(agno != NULLAGNUMBER);
2964	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2965			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2966			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2967	if (error)
2968		return error;
2969
2970	ASSERT(!(*bpp)->b_error);
2971	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2972	return 0;
2973}
2974
2975/*
2976 * Read in the allocation group header (free/alloc section) and initialise the per-AG data.
2977 */
2978int					/* error */
2979xfs_alloc_read_agf(
2980	struct xfs_mount	*mp,	/* mount point structure */
2981	struct xfs_trans	*tp,	/* transaction pointer */
2982	xfs_agnumber_t		agno,	/* allocation group number */
2983	int			flags,	/* XFS_ALLOC_FLAG_... */
2984	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
2985{
2986	struct xfs_agf		*agf;		/* ag freelist header */
2987	struct xfs_perag	*pag;		/* per allocation group data */
2988	int			error;
2989
2990	trace_xfs_alloc_read_agf(mp, agno);
2991
2992	/* We don't support trylock when freeing. */
2993	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
2994			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
2995	ASSERT(agno != NULLAGNUMBER);
2996	error = xfs_read_agf(mp, tp, agno,
2997			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2998			bpp);
2999	if (error)
3000		return error;
3001	ASSERT(!(*bpp)->b_error);
3002
3003	agf = (*bpp)->b_addr;
3004	pag = (*bpp)->b_pag;
3005	if (!pag->pagf_init) {
3006		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3007		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3008		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3009		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3010		pag->pagf_levels[XFS_BTNUM_BNOi] =
3011			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3012		pag->pagf_levels[XFS_BTNUM_CNTi] =
3013			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3014		pag->pagf_levels[XFS_BTNUM_RMAPi] =
3015			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3016		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3017		pag->pagf_init = 1;
3018		pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
3019	}
3020#ifdef DEBUG
3021	else if (!XFS_FORCED_SHUTDOWN(mp)) {
3022		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3023		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3024		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3025		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3026		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3027		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3028		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3029		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3030	}
3031#endif
3032	return 0;
3033}
3034
3035/*
3036 * Allocate an extent (variable-size).
3037 * Depending on the allocation type, we either look in a single allocation
3038 * group or loop over the allocation groups to find the result.
3039 */
3040int				/* error */
3041xfs_alloc_vextent(
3042	struct xfs_alloc_arg	*args)	/* allocation argument structure */
3043{
3044	xfs_agblock_t		agsize;	/* allocation group size */
3045	int			error;
3046	int			flags;	/* XFS_ALLOC_FLAG_... locking flags */
3047	struct xfs_mount	*mp;	/* mount structure pointer */
3048	xfs_agnumber_t		sagno;	/* starting allocation group number */
3049	xfs_alloctype_t		type;	/* input allocation type */
3050	int			bump_rotor = 0;
3051	xfs_agnumber_t		rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3052
3053	mp = args->mp;
3054	type = args->otype = args->type;
3055	args->agbno = NULLAGBLOCK;
3056	/*
3057	 * Just fix this up, for the case where the last a.g. is shorter
3058	 * (or there's only one a.g.) and the caller couldn't easily figure
3059	 * that out (xfs_bmap_alloc).
3060	 */
3061	agsize = mp->m_sb.sb_agblocks;
3062	if (args->maxlen > agsize)
3063		args->maxlen = agsize;
3064	if (args->alignment == 0)
3065		args->alignment = 1;
3066	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3067	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3068	ASSERT(args->minlen <= args->maxlen);
3069	ASSERT(args->minlen <= agsize);
3070	ASSERT(args->mod < args->prod);
3071	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3072	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3073	    args->minlen > args->maxlen || args->minlen > agsize ||
3074	    args->mod >= args->prod) {
3075		args->fsbno = NULLFSBLOCK;
3076		trace_xfs_alloc_vextent_badargs(args);
3077		return 0;
3078	}
3079
3080	switch (type) {
3081	case XFS_ALLOCTYPE_THIS_AG:
3082	case XFS_ALLOCTYPE_NEAR_BNO:
3083	case XFS_ALLOCTYPE_THIS_BNO:
3084		/*
3085		 * These three force us into a single a.g.
3086		 */
3087		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3088		args->pag = xfs_perag_get(mp, args->agno);
3089		error = xfs_alloc_fix_freelist(args, 0);
3090		if (error) {
3091			trace_xfs_alloc_vextent_nofix(args);
3092			goto error0;
3093		}
3094		if (!args->agbp) {
3095			trace_xfs_alloc_vextent_noagbp(args);
3096			break;
3097		}
3098		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3099		if ((error = xfs_alloc_ag_vextent(args)))
3100			goto error0;
3101		break;
3102	case XFS_ALLOCTYPE_START_BNO:
3103		/*
3104		 * Try near allocation first, then anywhere-in-ag after
3105		 * the first a.g. fails.
3106		 */
3107		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3108		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
3109			args->fsbno = XFS_AGB_TO_FSB(mp,
3110					((mp->m_agfrotor / rotorstep) %
3111					mp->m_sb.sb_agcount), 0);
3112			bump_rotor = 1;
3113		}
3114		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3115		args->type = XFS_ALLOCTYPE_NEAR_BNO;
3116		/* FALLTHROUGH */
3117	case XFS_ALLOCTYPE_FIRST_AG:
3118		/*
3119		 * Rotate through the allocation groups looking for a winner.
3120		 */
3121		if (type == XFS_ALLOCTYPE_FIRST_AG) {
3122			/*
3123			 * Start with allocation group given by bno.
3124			 */
3125			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3126			args->type = XFS_ALLOCTYPE_THIS_AG;
3127			sagno = 0;
3128			flags = 0;
3129		} else {
3130			/*
3131			 * Start with the given allocation group.
3132			 */
3133			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3134			flags = XFS_ALLOC_FLAG_TRYLOCK;
3135		}
3136		/*
3137		 * Loop over allocation groups twice; first time with
3138		 * trylock set, second time without.
3139		 */
3140		for (;;) {
3141			args->pag = xfs_perag_get(mp, args->agno);
3142			error = xfs_alloc_fix_freelist(args, flags);
3143			if (error) {
3144				trace_xfs_alloc_vextent_nofix(args);
3145				goto error0;
3146			}
3147			/*
3148			 * If we get a buffer back then the allocation will fly.
3149			 */
3150			if (args->agbp) {
3151				if ((error = xfs_alloc_ag_vextent(args)))
3152					goto error0;
3153				break;
3154			}
3155
3156			trace_xfs_alloc_vextent_loopfailed(args);
3157
3158			/*
3159			 * Didn't work, figure out the next iteration.
3160			 */
3161			if (args->agno == sagno &&
3162			    type == XFS_ALLOCTYPE_START_BNO)
3163				args->type = XFS_ALLOCTYPE_THIS_AG;
3164			/*
3165			 * For the first allocation, we can try any AG to get
3166			 * space.  However, if we have already allocated a
3167			 * block, we don't want to try AGs whose number is below
3168			 * sagno. Otherwise, we may end up with out-of-order
3169			 * locking of AGF, which might cause deadlock.
3170			 */
3171			if (++(args->agno) == mp->m_sb.sb_agcount) {
3172				if (args->tp->t_firstblock != NULLFSBLOCK)
3173					args->agno = sagno;
3174				else
3175					args->agno = 0;
3176			}
3177			/*
3178			 * Reached the starting a.g., must either be done
3179			 * or switch to non-trylock mode.
3180			 */
3181			if (args->agno == sagno) {
3182				if (flags == 0) {
3183					args->agbno = NULLAGBLOCK;
3184					trace_xfs_alloc_vextent_allfailed(args);
3185					break;
3186				}
3187
3188				flags = 0;
3189				if (type == XFS_ALLOCTYPE_START_BNO) {
3190					args->agbno = XFS_FSB_TO_AGBNO(mp,
3191						args->fsbno);
3192					args->type = XFS_ALLOCTYPE_NEAR_BNO;
3193				}
3194			}
3195			xfs_perag_put(args->pag);
3196		}
3197		if (bump_rotor) {
3198			if (args->agno == sagno)
3199				mp->m_agfrotor = (mp->m_agfrotor + 1) %
3200					(mp->m_sb.sb_agcount * rotorstep);
3201			else
3202				mp->m_agfrotor = (args->agno * rotorstep + 1) %
3203					(mp->m_sb.sb_agcount * rotorstep);
3204		}
3205		break;
3206	default:
3207		ASSERT(0);
3208		/* NOTREACHED */
3209	}
3210	if (args->agbno == NULLAGBLOCK)
3211		args->fsbno = NULLFSBLOCK;
3212	else {
3213		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3214#ifdef DEBUG
3215		ASSERT(args->len >= args->minlen);
3216		ASSERT(args->len <= args->maxlen);
3217		ASSERT(args->agbno % args->alignment == 0);
3218		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3219			args->len);
3220#endif
3221
3222	}
3223	xfs_perag_put(args->pag);
3224	return 0;
3225error0:
3226	xfs_perag_put(args->pag);
3227	return error;
3228}
3229
3230/* Ensure that the freelist is at full capacity. */
3231int
3232xfs_free_extent_fix_freelist(
3233	struct xfs_trans	*tp,
3234	xfs_agnumber_t		agno,
3235	struct xfs_buf		**agbp)
3236{
3237	struct xfs_alloc_arg	args;
3238	int			error;
3239
3240	memset(&args, 0, sizeof(struct xfs_alloc_arg));
3241	args.tp = tp;
3242	args.mp = tp->t_mountp;
3243	args.agno = agno;
3244
3245	/*
3246	 * Validate that the AG number is legal - this enables us to detect
3247	 * and handle silent filesystem corruption rather than crashing.
3248	 */
3249	if (args.agno >= args.mp->m_sb.sb_agcount)
3250		return -EFSCORRUPTED;
3251
3252	args.pag = xfs_perag_get(args.mp, args.agno);
3253	ASSERT(args.pag);
3254
3255	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3256	if (error)
3257		goto out;
3258
3259	*agbp = args.agbp;
3260out:
3261	xfs_perag_put(args.pag);
3262	return error;
3263}
3264
3265/*
3266 * Free an extent.
3267 * Just break up the extent address and hand off to xfs_free_ag_extent
3268 * after fixing up the freelist.
3269 */
3270int
3271__xfs_free_extent(
3272	struct xfs_trans		*tp,
3273	xfs_fsblock_t			bno,
3274	xfs_extlen_t			len,
3275	const struct xfs_owner_info	*oinfo,
3276	enum xfs_ag_resv_type		type,
3277	bool				skip_discard)
3278{
3279	struct xfs_mount		*mp = tp->t_mountp;
3280	struct xfs_buf			*agbp;
3281	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
3282	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
3283	struct xfs_agf			*agf;
3284	int				error;
3285	unsigned int			busy_flags = 0;
3286
3287	ASSERT(len != 0);
3288	ASSERT(type != XFS_AG_RESV_AGFL);
3289
3290	if (XFS_TEST_ERROR(false, mp,
3291			XFS_ERRTAG_FREE_EXTENT))
3292		return -EIO;
3293
3294	error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
3295	if (error)
3296		return error;
3297	agf = agbp->b_addr;
3298
3299	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3300		error = -EFSCORRUPTED;
3301		goto err;
3302	}
3303
3304	/* validate the extent size is legal now we have the agf locked */
3305	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3306		error = -EFSCORRUPTED;
3307		goto err;
3308	}
3309
3310	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3311	if (error)
3312		goto err;
3313
3314	if (skip_discard)
3315		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3316	xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
3317	return 0;
3318
3319err:
3320	xfs_trans_brelse(tp, agbp);
3321	return error;
3322}
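
/*
 * Note that the extent freed above is not immediately reusable:
 * xfs_extent_busy_insert() keeps it marked busy until the freeing
 * transaction reaches the on-disk log, so the space cannot be handed out
 * again while a crash could still resurrect its old owner.
 */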
3323
3324struct xfs_alloc_query_range_info {
3325	xfs_alloc_query_range_fn	fn;
3326	void				*priv;
3327};
3328
3329/* Format btree record and pass to our callback. */
3330STATIC int
3331xfs_alloc_query_range_helper(
3332	struct xfs_btree_cur		*cur,
3333	union xfs_btree_rec		*rec,
3334	void				*priv)
3335{
3336	struct xfs_alloc_query_range_info	*query = priv;
3337	struct xfs_alloc_rec_incore		irec;
3338
3339	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3340	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3341	return query->fn(cur, &irec, query->priv);
3342}
3343
3344/* Find all free space within a given range of blocks. */
3345int
3346xfs_alloc_query_range(
3347	struct xfs_btree_cur			*cur,
3348	struct xfs_alloc_rec_incore		*low_rec,
3349	struct xfs_alloc_rec_incore		*high_rec,
3350	xfs_alloc_query_range_fn		fn,
3351	void					*priv)
3352{
3353	union xfs_btree_irec			low_brec;
3354	union xfs_btree_irec			high_brec;
3355	struct xfs_alloc_query_range_info	query;
3356
3357	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3358	low_brec.a = *low_rec;
3359	high_brec.a = *high_rec;
3360	query.priv = priv;
3361	query.fn = fn;
3362	return xfs_btree_query_range(cur, &low_brec, &high_brec,
3363			xfs_alloc_query_range_helper, &query);
3364}
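
/*
 * Usage sketch for the query interface above (the callback below is
 * hypothetical, for illustration only):
 *
 *	static int
 *	dump_free_extent(
 *		struct xfs_btree_cur		*cur,
 *		struct xfs_alloc_rec_incore	*rec,
 *		void				*priv)
 *	{
 *		xfs_warn(cur->bc_mp, "free: agbno %u len %u",
 *				rec->ar_startblock, rec->ar_blockcount);
 *		return 0;
 *	}
 *
 * Filling low_rec/high_rec with the ar_startblock bounds of interest and
 * passing dump_free_extent to xfs_alloc_query_range() on a bnobt cursor
 * then reports every free extent in that range.
 */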
3365
3366/* Find all free space records. */
3367int
3368xfs_alloc_query_all(
3369	struct xfs_btree_cur			*cur,
3370	xfs_alloc_query_range_fn		fn,
3371	void					*priv)
3372{
3373	struct xfs_alloc_query_range_info	query;
3374
3375	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3376	query.priv = priv;
3377	query.fn = fn;
3378	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3379}
3380
3381/* Is there a record covering a given extent? */
3382int
3383xfs_alloc_has_record(
3384	struct xfs_btree_cur	*cur,
3385	xfs_agblock_t		bno,
3386	xfs_extlen_t		len,
3387	bool			*exists)
3388{
3389	union xfs_btree_irec	low;
3390	union xfs_btree_irec	high;
3391
3392	memset(&low, 0, sizeof(low));
3393	low.a.ar_startblock = bno;
3394	memset(&high, 0xFF, sizeof(high));
3395	high.a.ar_startblock = bno + len - 1;
3396
3397	return xfs_btree_has_record(cur, &low, &high, exists);
3398}
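
/*
 * The 0x00/0xFF memsets above make the key range maximally inclusive in
 * every field other than ar_startblock, so a caller checking whether any
 * part of [bno, bno + len) is free only needs a bnobt cursor, the candidate
 * agbno and the length; *exists comes back true if the bnobt holds any
 * record in that keyspace.
 */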
3399
3400/*
3401 * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3402 * error code or XFS_ITER_*.
3403 */
3404int
3405xfs_agfl_walk(
3406	struct xfs_mount	*mp,
3407	struct xfs_agf		*agf,
3408	struct xfs_buf		*agflbp,
3409	xfs_agfl_walk_fn	walk_fn,
3410	void			*priv)
3411{
3412	__be32			*agfl_bno;
3413	unsigned int		i;
3414	int			error;
3415
3416	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3417	i = be32_to_cpu(agf->agf_flfirst);
3418
3419	/* Nothing to walk in an empty AGFL. */
3420	if (agf->agf_flcount == cpu_to_be32(0))
3421		return 0;
3422
3423	/* Otherwise, walk from first to last, wrapping as needed. */
3424	for (;;) {
3425		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3426		if (error)
3427			return error;
3428		if (i == be32_to_cpu(agf->agf_fllast))
3429			break;
3430		if (++i == xfs_agfl_size(mp))
3431			i = 0;
3432	}
3433
3434	return 0;
3435}
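
/*
 * A minimal walk_fn sketch (hypothetical, for illustration) that counts the
 * AGFL blocks handed out by the walker above:
 *
 *	static int
 *	count_agfl_block(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		(*(unsigned int *)priv)++;
 *		return 0;
 *	}
 *
 * xfs_agfl_walk(mp, agf, agflbp, count_agfl_block, &count) then visits
 * every active slot from agf_flfirst to agf_fllast, wrapping as needed.
 */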
3436