1// SPDX-License-Identifier: GPL-2.0+
2/*
3 * Copyright (C) 2016 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
12#include "xfs_mount.h"
13#include "xfs_alloc.h"
14#include "xfs_errortag.h"
15#include "xfs_error.h"
16#include "xfs_trace.h"
17#include "xfs_trans.h"
18#include "xfs_rmap_btree.h"
19#include "xfs_btree.h"
20#include "xfs_refcount_btree.h"
21#include "xfs_ialloc_btree.h"
22#include "xfs_ag.h"
23#include "xfs_ag_resv.h"
24
25/*
26 * Per-AG Block Reservations
27 *
28 * For some kinds of allocation group metadata structures, it is advantageous
29 * to reserve a small number of blocks in each AG so that future expansions of
30 * that data structure do not encounter ENOSPC because errors during a btree
31 * split cause the filesystem to go offline.
32 *
33 * Prior to the introduction of reflink, this wasn't an issue because the free
34 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
35 * that may be necessary; and allocations of other metadata (inodes, BMBT,
36 * dir/attr) aren't restricted to a single AG.  However, with reflink it is
37 * possible to allocate all the space in an AG, have subsequent reflink/CoW
38 * activity expand the refcount btree, and discover that there's no space left
39 * to handle that expansion.  Since we can calculate the maximum size of the
40 * refcount btree, we can reserve space for it and avoid ENOSPC.
41 *
 * Handling per-AG reservations consists of four changes to the allocator's
43 * behavior:  First, because these reservations are always needed, we decrease
44 * the ag_max_usable counter to reflect the size of the AG after the reserved
45 * blocks are taken.  Second, the reservations must be reflected in the
46 * fdblocks count to maintain proper accounting.  Third, each AG must maintain
47 * its own reserved block counter so that we can calculate the amount of space
48 * that must remain free to maintain the reservations.  Fourth, the "remaining
49 * reserved blocks" count must be used when calculating the length of the
50 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
51 * functions.  In other words, we maintain a virtual allocation via in-core
52 * accounting tricks so that we don't have to clean up after a crash. :)
53 *
54 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
55 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
56 * function.  It might seem a little funny to maintain a reservoir of blocks
57 * to feed another reservoir, but the AGFL only holds enough blocks to get
58 * through the next transaction.  The per-AG reservation is to ensure (we
59 * hope) that each AG never runs out of blocks.  Each data structure wanting
60 * to use the reservation system should update ask/used in xfs_ag_resv_init.
61 */
62
63/*
64 * Are we critically low on blocks?  For now we'll define that as the number
65 * of blocks we can get our hands on being less than 10% of what we reserved
66 * or less than some arbitrary number (maximum btree height).
67 */
68bool
69xfs_ag_resv_critical(
70	struct xfs_perag		*pag,
71	enum xfs_ag_resv_type		type)
72{
73	xfs_extlen_t			avail;
74	xfs_extlen_t			orig;
75
76	switch (type) {
77	case XFS_AG_RESV_METADATA:
78		avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
79		orig = pag->pag_meta_resv.ar_asked;
80		break;
81	case XFS_AG_RESV_RMAPBT:
82		avail = pag->pagf_freeblks + pag->pagf_flcount -
83			pag->pag_meta_resv.ar_reserved;
84		orig = pag->pag_rmapbt_resv.ar_asked;
85		break;
86	default:
87		ASSERT(0);
88		return false;
89	}
90
91	trace_xfs_ag_resv_critical(pag, type, avail);
92
93	/* Critically low if less than 10% or max btree height remains. */
94	return XFS_TEST_ERROR(avail < orig / 10 ||
95			      avail < pag->pag_mount->m_agbtree_maxlevels,
96			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
97}
98
99/*
100 * How many blocks are reserved but not used, and therefore must not be
101 * allocated away?
102 */
103xfs_extlen_t
104xfs_ag_resv_needed(
105	struct xfs_perag		*pag,
106	enum xfs_ag_resv_type		type)
107{
108	xfs_extlen_t			len;
109
110	len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
111	switch (type) {
112	case XFS_AG_RESV_METADATA:
113	case XFS_AG_RESV_RMAPBT:
114		len -= xfs_perag_resv(pag, type)->ar_reserved;
115		break;
116	case XFS_AG_RESV_NONE:
117		/* empty */
118		break;
119	default:
120		ASSERT(0);
121	}
122
123	trace_xfs_ag_resv_needed(pag, type, len);
124
125	return len;
126}
127
128/* Clean out a reservation */
129static int
130__xfs_ag_resv_free(
131	struct xfs_perag		*pag,
132	enum xfs_ag_resv_type		type)
133{
134	struct xfs_ag_resv		*resv;
135	xfs_extlen_t			oldresv;
136	int				error;
137
138	trace_xfs_ag_resv_free(pag, type, 0);
139
140	resv = xfs_perag_resv(pag, type);
141	if (pag->pag_agno == 0)
142		pag->pag_mount->m_ag_max_usable += resv->ar_asked;
143	/*
144	 * RMAPBT blocks come from the AGFL and AGFL blocks are always
145	 * considered "free", so whatever was reserved at mount time must be
146	 * given back at umount.
147	 */
148	if (type == XFS_AG_RESV_RMAPBT)
149		oldresv = resv->ar_orig_reserved;
150	else
151		oldresv = resv->ar_reserved;
152	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
153	resv->ar_reserved = 0;
154	resv->ar_asked = 0;
155	resv->ar_orig_reserved = 0;
156
157	if (error)
158		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
159				error, _RET_IP_);
160	return error;
161}
162
163/* Free a per-AG reservation. */
164int
165xfs_ag_resv_free(
166	struct xfs_perag		*pag)
167{
168	int				error;
169	int				err2;
170
171	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
172	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
173	if (err2 && !error)
174		error = err2;
175	return error;
176}
177
178static int
179__xfs_ag_resv_init(
180	struct xfs_perag		*pag,
181	enum xfs_ag_resv_type		type,
182	xfs_extlen_t			ask,
183	xfs_extlen_t			used)
184{
185	struct xfs_mount		*mp = pag->pag_mount;
186	struct xfs_ag_resv		*resv;
187	int				error;
188	xfs_extlen_t			hidden_space;
189
190	if (used > ask)
191		ask = used;
192
193	switch (type) {
194	case XFS_AG_RESV_RMAPBT:
195		/*
196		 * Space taken by the rmapbt is not subtracted from fdblocks
197		 * because the rmapbt lives in the free space.  Here we must
198		 * subtract the entire reservation from fdblocks so that we
199		 * always have blocks available for rmapbt expansion.
200		 */
201		hidden_space = ask;
202		break;
203	case XFS_AG_RESV_METADATA:
204		/*
205		 * Space taken by all other metadata btrees are accounted
206		 * on-disk as used space.  We therefore only hide the space
207		 * that is reserved but not used by the trees.
208		 */
209		hidden_space = ask - used;
210		break;
211	default:
212		ASSERT(0);
213		return -EINVAL;
214	}
215
216	if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
217		error = -ENOSPC;
218	else
219		error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
220	if (error) {
221		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
222				error, _RET_IP_);
223		xfs_warn(mp,
224"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
225				pag->pag_agno);
226		return error;
227	}
228
229	/*
230	 * Reduce the maximum per-AG allocation length by however much we're
231	 * trying to reserve for an AG.  Since this is a filesystem-wide
232	 * counter, we only make the adjustment for AG 0.  This assumes that
233	 * there aren't any AGs hungrier for per-AG reservation than AG 0.
234	 */
235	if (pag->pag_agno == 0)
236		mp->m_ag_max_usable -= ask;
237
238	resv = xfs_perag_resv(pag, type);
239	resv->ar_asked = ask;
240	resv->ar_orig_reserved = hidden_space;
241	resv->ar_reserved = ask - used;
242
243	trace_xfs_ag_resv_init(pag, type, ask);
244	return 0;
245}
246
247/* Create a per-AG block reservation. */
248int
249xfs_ag_resv_init(
250	struct xfs_perag		*pag,
251	struct xfs_trans		*tp)
252{
253	struct xfs_mount		*mp = pag->pag_mount;
254	xfs_extlen_t			ask;
255	xfs_extlen_t			used;
256	int				error = 0, error2;
257	bool				has_resv = false;
258
259	/* Create the metadata reservation. */
260	if (pag->pag_meta_resv.ar_asked == 0) {
261		ask = used = 0;
262
263		error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
264		if (error)
265			goto out;
266
267		error = xfs_finobt_calc_reserves(pag, tp, &ask, &used);
268		if (error)
269			goto out;
270
271		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
272				ask, used);
273		if (error) {
274			/*
275			 * Because we didn't have per-AG reservations when the
276			 * finobt feature was added we might not be able to
277			 * reserve all needed blocks.  Warn and fall back to the
278			 * old and potentially buggy code in that case, but
279			 * ensure we do have the reservation for the refcountbt.
280			 */
281			ask = used = 0;
282
283			mp->m_finobt_nores = true;
284
285			error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
286					&used);
287			if (error)
288				goto out;
289
290			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
291					ask, used);
292			if (error)
293				goto out;
294		}
295		if (ask)
296			has_resv = true;
297	}
298
299	/* Create the RMAPBT metadata reservation */
300	if (pag->pag_rmapbt_resv.ar_asked == 0) {
301		ask = used = 0;
302
303		error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
304		if (error)
305			goto out;
306
307		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
308		if (error)
309			goto out;
310		if (ask)
311			has_resv = true;
312	}
313
314out:
315	/*
316	 * Initialize the pagf if we have at least one active reservation on the
317	 * AG. This may have occurred already via reservation calculation, but
318	 * fall back to an explicit init to ensure the in-core allocbt usage
319	 * counters are initialized as soon as possible. This is important
320	 * because filesystems with large perag reservations are susceptible to
321	 * free space reservation problems that the allocbt counter is used to
322	 * address.
323	 */
324	if (has_resv) {
325		error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
326		if (error2)
327			return error2;
328
329		/*
330		 * If there isn't enough space in the AG to satisfy the
331		 * reservation, let the caller know that there wasn't enough
332		 * space.  Callers are responsible for deciding what to do
333		 * next, since (in theory) we can stumble along with
334		 * insufficient reservation if data blocks are being freed to
335		 * replenish the AG's free space.
336		 */
337		if (!error &&
338		    xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
339		    xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
340		    pag->pagf_freeblks + pag->pagf_flcount)
341			error = -ENOSPC;
342	}
343
344	return error;
345}
346
347/* Allocate a block from the reservation. */
348void
349xfs_ag_resv_alloc_extent(
350	struct xfs_perag		*pag,
351	enum xfs_ag_resv_type		type,
352	struct xfs_alloc_arg		*args)
353{
354	struct xfs_ag_resv		*resv;
355	xfs_extlen_t			len;
356	uint				field;
357
358	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
359
360	switch (type) {
361	case XFS_AG_RESV_AGFL:
362		return;
363	case XFS_AG_RESV_METADATA:
364	case XFS_AG_RESV_RMAPBT:
365		resv = xfs_perag_resv(pag, type);
366		break;
367	default:
368		ASSERT(0);
369		fallthrough;
370	case XFS_AG_RESV_NONE:
371		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
372				       XFS_TRANS_SB_FDBLOCKS;
373		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
374		return;
375	}
376
377	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
378	resv->ar_reserved -= len;
379	if (type == XFS_AG_RESV_RMAPBT)
380		return;
381	/* Allocations of reserved blocks only need on-disk sb updates... */
382	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
383	/* ...but non-reserved blocks need in-core and on-disk updates. */
384	if (args->len > len)
385		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
386				-((int64_t)args->len - len));
387}
388
389/* Free a block to the reservation. */
390void
391xfs_ag_resv_free_extent(
392	struct xfs_perag		*pag,
393	enum xfs_ag_resv_type		type,
394	struct xfs_trans		*tp,
395	xfs_extlen_t			len)
396{
397	xfs_extlen_t			leftover;
398	struct xfs_ag_resv		*resv;
399
400	trace_xfs_ag_resv_free_extent(pag, type, len);
401
402	switch (type) {
403	case XFS_AG_RESV_AGFL:
404		return;
405	case XFS_AG_RESV_METADATA:
406	case XFS_AG_RESV_RMAPBT:
407		resv = xfs_perag_resv(pag, type);
408		break;
409	default:
410		ASSERT(0);
411		fallthrough;
412	case XFS_AG_RESV_NONE:
413		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
414		return;
415	}
416
417	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
418	resv->ar_reserved += leftover;
419	if (type == XFS_AG_RESV_RMAPBT)
420		return;
421	/* Freeing into the reserved pool only requires on-disk update... */
422	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
423	/* ...but freeing beyond that requires in-core and on-disk update. */
424	if (len > leftover)
425		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
426}
427