xref: /kernel/linux/linux-6.6/fs/xfs/scrub/reap.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_btree.h"
13#include "xfs_log_format.h"
14#include "xfs_trans.h"
15#include "xfs_sb.h"
16#include "xfs_inode.h"
17#include "xfs_alloc.h"
18#include "xfs_alloc_btree.h"
19#include "xfs_ialloc.h"
20#include "xfs_ialloc_btree.h"
21#include "xfs_rmap.h"
22#include "xfs_rmap_btree.h"
23#include "xfs_refcount_btree.h"
24#include "xfs_extent_busy.h"
25#include "xfs_ag.h"
26#include "xfs_ag_resv.h"
27#include "xfs_quota.h"
28#include "xfs_qm.h"
29#include "xfs_bmap.h"
30#include "xfs_da_format.h"
31#include "xfs_da_btree.h"
32#include "xfs_attr.h"
33#include "xfs_attr_remote.h"
34#include "scrub/scrub.h"
35#include "scrub/common.h"
36#include "scrub/trace.h"
37#include "scrub/repair.h"
38#include "scrub/bitmap.h"
39#include "scrub/reap.h"
40
41/*
42 * Disposal of Blocks from Old Metadata
43 *
44 * Now that we've constructed a new btree to replace the damaged one, we want
45 * to dispose of the blocks that (we think) the old btree was using.
46 * Previously, we used the rmapbt to collect the extents (bitmap) with the
47 * rmap owner corresponding to the tree we rebuilt, collected extents for any
48 * blocks with the same rmap owner that are owned by another data structure
49 * (sublist), and subtracted sublist from bitmap.  In theory the extents
50 * remaining in bitmap are the old btree's blocks.
51 *
52 * Unfortunately, it's possible that the btree was crosslinked with other
53 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
54 * if the rmapbt says there is an owner of this block other than @oinfo, then
55 * the block is crosslinked.  Remove the reverse mapping and continue.
56 *
57 * If there is one rmap record, we can free the block, which removes the
58 * reverse mapping but doesn't add the block to the free space.  Our repair
59 * strategy is to hope the other metadata objects crosslinked on this block
60 * will be rebuilt (atop different blocks), thereby removing all the cross
61 * links.
62 *
63 * If there are no rmap records at all, we also free the block.  If the btree
64 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
65 * supposed to be a rmap record and everything is ok.  For other btrees there
66 * had to have been an rmap entry for the block to have ended up on @bitmap,
67 * so if it's gone now there's something wrong and the fs will shut down.
68 *
69 * Note: If there are multiple rmap records with only the same rmap owner as
70 * the btree we're trying to rebuild and the block is indeed owned by another
71 * data structure with the same rmap owner, then the block will be in sublist
72 * and therefore doesn't need disposal.  If there are multiple rmap records
73 * with only the same rmap owner but the block is not owned by something with
74 * the same rmap owner, the block will be freed.
75 *
76 * The caller is responsible for locking the AG headers for the entire rebuild
77 * operation so that nothing else can sneak in and change the AG state while
78 * we're not looking.  We must also invalidate any buffers associated with
79 * @bitmap.
80 */
81
82/* Information about reaping extents after a repair. */
83struct xreap_state {
84	struct xfs_scrub		*sc;
85
86	/* Reverse mapping owner and metadata reservation type. */
87	const struct xfs_owner_info	*oinfo;
88	enum xfs_ag_resv_type		resv;
89
90	/* If true, roll the transaction before reaping the next extent. */
91	bool				force_roll;
92
93	/* Number of deferred reaps attached to the current transaction. */
94	unsigned int			deferred;
95
96	/* Number of invalidated buffers logged to the current transaction. */
97	unsigned int			invalidated;
98
99	/* Number of deferred reaps queued during the whole reap sequence. */
100	unsigned long long		total_deferred;
101};
102
103/* Put a block back on the AGFL. */
104STATIC int
105xreap_put_freelist(
106	struct xfs_scrub	*sc,
107	xfs_agblock_t		agbno)
108{
109	struct xfs_buf		*agfl_bp;
110	int			error;
111
112	/* Make sure there's space on the freelist. */
113	error = xrep_fix_freelist(sc, true);
114	if (error)
115		return error;
116
117	/*
118	 * Since we're "freeing" a lost block onto the AGFL, we have to
119	 * create an rmap for the block prior to merging it or else other
120	 * parts will break.
121	 */
122	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
123			&XFS_RMAP_OINFO_AG);
124	if (error)
125		return error;
126
127	/* Put the block on the AGFL. */
128	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
129	if (error)
130		return error;
131
132	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
133			agfl_bp, agbno, 0);
134	if (error)
135		return error;
136	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
137			XFS_EXTENT_BUSY_SKIP_DISCARD);
138
139	return 0;
140}
141
142/* Are there any uncommitted reap operations? */
143static inline bool xreap_dirty(const struct xreap_state *rs)
144{
145	if (rs->force_roll)
146		return true;
147	if (rs->deferred)
148		return true;
149	if (rs->invalidated)
150		return true;
151	if (rs->total_deferred)
152		return true;
153	return false;
154}
155
156#define XREAP_MAX_BINVAL	(2048)
157
158/*
159 * Decide if we want to roll the transaction after reaping an extent.  We don't
160 * want to overrun the transaction reservation, so we prohibit more than
161 * 128 EFIs per transaction.  For the same reason, we limit the number
162 * of buffer invalidations to 2048.
163 */
164static inline bool xreap_want_roll(const struct xreap_state *rs)
165{
166	if (rs->force_roll)
167		return true;
168	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
169		return true;
170	if (rs->invalidated > XREAP_MAX_BINVAL)
171		return true;
172	return false;
173}
174
175static inline void xreap_reset(struct xreap_state *rs)
176{
177	rs->total_deferred += rs->deferred;
178	rs->deferred = 0;
179	rs->invalidated = 0;
180	rs->force_roll = false;
181}
182
183#define XREAP_MAX_DEFER_CHAIN		(2048)
184
185/*
186 * Decide if we want to finish the deferred ops that are attached to the scrub
187 * transaction.  We don't want to queue huge chains of deferred ops because
188 * that can consume a lot of log space and kernel memory.  Hence we trigger a
189 * xfs_defer_finish if there are more than 2048 deferred reap operations or the
190 * caller did some real work.
191 */
192static inline bool
193xreap_want_defer_finish(const struct xreap_state *rs)
194{
195	if (rs->force_roll)
196		return true;
197	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
198		return true;
199	return false;
200}
201
202static inline void xreap_defer_finish_reset(struct xreap_state *rs)
203{
204	rs->total_deferred = 0;
205	rs->deferred = 0;
206	rs->invalidated = 0;
207	rs->force_roll = false;
208}
209
210/* Try to invalidate the incore buffers for an extent that we're freeing. */
211STATIC void
212xreap_agextent_binval(
213	struct xreap_state	*rs,
214	xfs_agblock_t		agbno,
215	xfs_extlen_t		*aglenp)
216{
217	struct xfs_scrub	*sc = rs->sc;
218	struct xfs_perag	*pag = sc->sa.pag;
219	struct xfs_mount	*mp = sc->mp;
220	xfs_agnumber_t		agno = sc->sa.pag->pag_agno;
221	xfs_agblock_t		agbno_next = agbno + *aglenp;
222	xfs_agblock_t		bno = agbno;
223
224	/*
225	 * Avoid invalidating AG headers and post-EOFS blocks because we never
226	 * own those.
227	 */
228	if (!xfs_verify_agbno(pag, agbno) ||
229	    !xfs_verify_agbno(pag, agbno_next - 1))
230		return;
231
232	/*
233	 * If there are incore buffers for these blocks, invalidate them.  We
234	 * assume that the lack of any other known owners means that the buffer
235	 * can be locked without risk of deadlocking.  The buffer cache cannot
236	 * detect aliasing, so employ nested loops to scan for incore buffers
237	 * of any plausible size.
238	 */
239	while (bno < agbno_next) {
240		xfs_agblock_t	fsbcount;
241		xfs_agblock_t	max_fsbs;
242
243		/*
244		 * Max buffer size is the max remote xattr buffer size, which
245		 * is one fs block larger than 64k.
246		 */
247		max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
248				xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
249
250		for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
251			struct xfs_buf	*bp = NULL;
252			xfs_daddr_t	daddr;
253			int		error;
254
255			daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
256			error = xfs_buf_incore(mp->m_ddev_targp, daddr,
257					XFS_FSB_TO_BB(mp, fsbcount),
258					XBF_LIVESCAN, &bp);
259			if (error)
260				continue;
261
262			xfs_trans_bjoin(sc->tp, bp);
263			xfs_trans_binval(sc->tp, bp);
264			rs->invalidated++;
265
266			/*
267			 * Stop invalidating if we've hit the limit; we should
268			 * still have enough reservation left to free however
269			 * far we've gotten.
270			 */
271			if (rs->invalidated > XREAP_MAX_BINVAL) {
272				*aglenp -= agbno_next - bno;
273				goto out;
274			}
275		}
276
277		bno++;
278	}
279
280out:
281	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
282}
283
284/*
285 * Figure out the longest run of blocks that we can dispose of with a single
286 * call.  Cross-linked blocks should have their reverse mappings removed, but
287 * single-owner extents can be freed.  AGFL blocks can only be put back one at
288 * a time.
289 */
290STATIC int
291xreap_agextent_select(
292	struct xreap_state	*rs,
293	xfs_agblock_t		agbno,
294	xfs_agblock_t		agbno_next,
295	bool			*crosslinked,
296	xfs_extlen_t		*aglenp)
297{
298	struct xfs_scrub	*sc = rs->sc;
299	struct xfs_btree_cur	*cur;
300	xfs_agblock_t		bno = agbno + 1;
301	xfs_extlen_t		len = 1;
302	int			error;
303
304	/*
305	 * Determine if there are any other rmap records covering the first
306	 * block of this extent.  If so, the block is crosslinked.
307	 */
308	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
309			sc->sa.pag);
310	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
311			crosslinked);
312	if (error)
313		goto out_cur;
314
315	/* AGFL blocks can only be deal with one at a time. */
316	if (rs->resv == XFS_AG_RESV_AGFL)
317		goto out_found;
318
319	/*
320	 * Figure out how many of the subsequent blocks have the same crosslink
321	 * status.
322	 */
323	while (bno < agbno_next) {
324		bool		also_crosslinked;
325
326		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
327				&also_crosslinked);
328		if (error)
329			goto out_cur;
330
331		if (*crosslinked != also_crosslinked)
332			break;
333
334		len++;
335		bno++;
336	}
337
338out_found:
339	*aglenp = len;
340	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
341out_cur:
342	xfs_btree_del_cursor(cur, error);
343	return error;
344}
345
346/*
347 * Dispose of as much of the beginning of this AG extent as possible.  The
348 * number of blocks disposed of will be returned in @aglenp.
349 */
350STATIC int
351xreap_agextent_iter(
352	struct xreap_state	*rs,
353	xfs_agblock_t		agbno,
354	xfs_extlen_t		*aglenp,
355	bool			crosslinked)
356{
357	struct xfs_scrub	*sc = rs->sc;
358	xfs_fsblock_t		fsbno;
359	int			error = 0;
360
361	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
362
363	/*
364	 * If there are other rmappings, this block is cross linked and must
365	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
366	 * we were the only owner of the block, so free the extent, which will
367	 * also remove the rmap.
368	 *
369	 * XXX: XFS doesn't support detecting the case where a single block
370	 * metadata structure is crosslinked with a multi-block structure
371	 * because the buffer cache doesn't detect aliasing problems, so we
372	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
373	 * blow on writeout, the filesystem will shut down, and the admin gets
374	 * to run xfs_repair.
375	 */
376	if (crosslinked) {
377		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
378
379		rs->force_roll = true;
380		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
381				*aglenp, rs->oinfo);
382	}
383
384	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
385
386	/*
387	 * Invalidate as many buffers as we can, starting at agbno.  If this
388	 * function sets *aglenp to zero, the transaction is full of logged
389	 * buffer invalidations, so we need to return early so that we can
390	 * roll and retry.
391	 */
392	xreap_agextent_binval(rs, agbno, aglenp);
393	if (*aglenp == 0) {
394		ASSERT(xreap_want_roll(rs));
395		return 0;
396	}
397
398	/* Put blocks back on the AGFL one at a time. */
399	if (rs->resv == XFS_AG_RESV_AGFL) {
400		ASSERT(*aglenp == 1);
401		error = xreap_put_freelist(sc, agbno);
402		if (error)
403			return error;
404
405		rs->force_roll = true;
406		return 0;
407	}
408
409	/*
410	 * Use deferred frees to get rid of the old btree blocks to try to
411	 * minimize the window in which we could crash and lose the old blocks.
412	 */
413	error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
414			rs->resv, true);
415	if (error)
416		return error;
417
418	rs->deferred++;
419	return 0;
420}
421
422/*
423 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
424 * crosslinked), and dispose of each sub-extent separately.
425 */
426STATIC int
427xreap_agmeta_extent(
428	uint64_t		fsbno,
429	uint64_t		len,
430	void			*priv)
431{
432	struct xreap_state	*rs = priv;
433	struct xfs_scrub	*sc = rs->sc;
434	xfs_agblock_t		agbno = fsbno;
435	xfs_agblock_t		agbno_next = agbno + len;
436	int			error = 0;
437
438	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
439	ASSERT(sc->ip == NULL);
440
441	while (agbno < agbno_next) {
442		xfs_extlen_t	aglen;
443		bool		crosslinked;
444
445		error = xreap_agextent_select(rs, agbno, agbno_next,
446				&crosslinked, &aglen);
447		if (error)
448			return error;
449
450		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
451		if (error)
452			return error;
453
454		if (xreap_want_defer_finish(rs)) {
455			error = xrep_defer_finish(sc);
456			if (error)
457				return error;
458			xreap_defer_finish_reset(rs);
459		} else if (xreap_want_roll(rs)) {
460			error = xrep_roll_ag_trans(sc);
461			if (error)
462				return error;
463			xreap_reset(rs);
464		}
465
466		agbno += aglen;
467	}
468
469	return 0;
470}
471
472/* Dispose of every block of every AG metadata extent in the bitmap. */
473int
474xrep_reap_agblocks(
475	struct xfs_scrub		*sc,
476	struct xagb_bitmap		*bitmap,
477	const struct xfs_owner_info	*oinfo,
478	enum xfs_ag_resv_type		type)
479{
480	struct xreap_state		rs = {
481		.sc			= sc,
482		.oinfo			= oinfo,
483		.resv			= type,
484	};
485	int				error;
486
487	ASSERT(xfs_has_rmapbt(sc->mp));
488	ASSERT(sc->ip == NULL);
489
490	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
491	if (error)
492		return error;
493
494	if (xreap_dirty(&rs))
495		return xrep_defer_finish(sc);
496
497	return 0;
498}
499