xref: /kernel/linux/linux-6.6/fs/xfs/scrub/common.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_btree.h"
13#include "xfs_log_format.h"
14#include "xfs_trans.h"
15#include "xfs_inode.h"
16#include "xfs_icache.h"
17#include "xfs_alloc.h"
18#include "xfs_alloc_btree.h"
19#include "xfs_ialloc.h"
20#include "xfs_ialloc_btree.h"
21#include "xfs_refcount_btree.h"
22#include "xfs_rmap.h"
23#include "xfs_rmap_btree.h"
24#include "xfs_log.h"
25#include "xfs_trans_priv.h"
26#include "xfs_da_format.h"
27#include "xfs_da_btree.h"
28#include "xfs_attr.h"
29#include "xfs_reflink.h"
30#include "xfs_ag.h"
31#include "scrub/scrub.h"
32#include "scrub/common.h"
33#include "scrub/trace.h"
34#include "scrub/repair.h"
35#include "scrub/health.h"
36
37/* Common code for the metadata scrubbers. */
38
39/*
40 * Handling operational errors.
41 *
42 * The *_process_error() family of functions are used to process error return
43 * codes from functions called as part of a scrub operation.
44 *
45 * If there's no error, we return true to tell the caller that it's ok
46 * to move on to the next check in its list.
47 *
48 * For non-verifier errors (e.g. ENOMEM) we return false to tell the
49 * caller that something bad happened, and we preserve *error so that
50 * the caller can return the *error up the stack to userspace.
51 *
52 * Verifier errors (EFSBADCRC/EFSCORRUPTED) are recorded by setting
53 * OFLAG_CORRUPT in sm_flags and the *error is cleared.  In other words,
54 * we track verifier errors (and failed scrub checks) via OFLAG_CORRUPT,
55 * not via return codes.  We return false to tell the caller that
56 * something bad happened.  Since the error has been cleared, the caller
57 * will (presumably) return that zero and scrubbing will move on to
58 * whatever's next.
59 *
60 * ftrace can be used to record the precise metadata location and the
61 * approximate code location of the failed operation.
62 */
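
/*
 * Illustrative caller pattern (a sketch only; xchk_read_some_block() is a
 * hypothetical helper and the locals are assumed to be declared by the
 * caller -- this is not a function in this codebase):
 *
 *	error = xchk_read_some_block(sc, agno, agbno, &bp);
 *	if (!xchk_process_error(sc, agno, agbno, &error))
 *		return error;
 *
 * For operational errors, *error is preserved and the early return hands it
 * back up the stack.  For verifier errors, OFLAG_CORRUPT has been set and
 * *error cleared, so the early return hands back zero and scrubbing moves
 * on to the next object.
 */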
63
64/* Check for operational errors. */
65static bool
66__xchk_process_error(
67	struct xfs_scrub	*sc,
68	xfs_agnumber_t		agno,
69	xfs_agblock_t		bno,
70	int			*error,
71	__u32			errflag,
72	void			*ret_ip)
73{
74	switch (*error) {
75	case 0:
76		return true;
77	case -EDEADLOCK:
78	case -ECHRNG:
79		/* Used to restart an op with deadlock avoidance. */
80		trace_xchk_deadlock_retry(
81				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
82				sc->sm, *error);
83		break;
84	case -EFSBADCRC:
85	case -EFSCORRUPTED:
86		/* Note the badness but don't abort. */
87		sc->sm->sm_flags |= errflag;
88		*error = 0;
89		fallthrough;
90	default:
91		trace_xchk_op_error(sc, agno, bno, *error,
92				ret_ip);
93		break;
94	}
95	return false;
96}
97
98bool
99xchk_process_error(
100	struct xfs_scrub	*sc,
101	xfs_agnumber_t		agno,
102	xfs_agblock_t		bno,
103	int			*error)
104{
105	return __xchk_process_error(sc, agno, bno, error,
106			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
107}
108
109bool
110xchk_xref_process_error(
111	struct xfs_scrub	*sc,
112	xfs_agnumber_t		agno,
113	xfs_agblock_t		bno,
114	int			*error)
115{
116	return __xchk_process_error(sc, agno, bno, error,
117			XFS_SCRUB_OFLAG_XFAIL, __return_address);
118}
119
120/* Check for operational errors for a file offset. */
121static bool
122__xchk_fblock_process_error(
123	struct xfs_scrub	*sc,
124	int			whichfork,
125	xfs_fileoff_t		offset,
126	int			*error,
127	__u32			errflag,
128	void			*ret_ip)
129{
130	switch (*error) {
131	case 0:
132		return true;
133	case -EDEADLOCK:
134	case -ECHRNG:
135		/* Used to restart an op with deadlock avoidance. */
136		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
137		break;
138	case -EFSBADCRC:
139	case -EFSCORRUPTED:
140		/* Note the badness but don't abort. */
141		sc->sm->sm_flags |= errflag;
142		*error = 0;
143		fallthrough;
144	default:
145		trace_xchk_file_op_error(sc, whichfork, offset, *error,
146				ret_ip);
147		break;
148	}
149	return false;
150}
151
152bool
153xchk_fblock_process_error(
154	struct xfs_scrub	*sc,
155	int			whichfork,
156	xfs_fileoff_t		offset,
157	int			*error)
158{
159	return __xchk_fblock_process_error(sc, whichfork, offset, error,
160			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
161}
162
163bool
164xchk_fblock_xref_process_error(
165	struct xfs_scrub	*sc,
166	int			whichfork,
167	xfs_fileoff_t		offset,
168	int			*error)
169{
170	return __xchk_fblock_process_error(sc, whichfork, offset, error,
171			XFS_SCRUB_OFLAG_XFAIL, __return_address);
172}
173
174/*
175 * Handling scrub corruption/optimization/warning checks.
176 *
177 * The *_set_{corrupt,preen,warning}() family of functions are used to
178 * record the presence of metadata that is incorrect (corrupt), could be
179 * optimized somehow (preen), or should be flagged for administrative
180 * review but is not incorrect (warn).
181 *
182 * ftrace can be used to record the precise metadata location and
183 * approximate code location of the failed check.
184 */
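
/*
 * Typical check pattern (a sketch; the free space counter comparison below
 * only illustrates the idiom and is not the actual AGF scrubber):
 *
 *	if (be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
 *		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 *
 * Failed checks are recorded in sm_flags and scrubbing continues; they do
 * not produce error return codes.
 */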
185
186/* Record a block which could be optimized. */
187void
188xchk_block_set_preen(
189	struct xfs_scrub	*sc,
190	struct xfs_buf		*bp)
191{
192	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
193	trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address);
194}
195
196/*
197 * Record an inode which could be optimized.  The trace data will
198 * include the inode number and the approximate code location of the
199 * failed check.
200 */
201void
202xchk_ino_set_preen(
203	struct xfs_scrub	*sc,
204	xfs_ino_t		ino)
205{
206	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN;
207	trace_xchk_ino_preen(sc, ino, __return_address);
208}
209
210/* Record something being wrong with the filesystem primary superblock. */
211void
212xchk_set_corrupt(
213	struct xfs_scrub	*sc)
214{
215	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
216	trace_xchk_fs_error(sc, 0, __return_address);
217}
218
219/* Record a corrupt block. */
220void
221xchk_block_set_corrupt(
222	struct xfs_scrub	*sc,
223	struct xfs_buf		*bp)
224{
225	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
226	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
227}
228
229/* Record a corruption while cross-referencing. */
230void
231xchk_block_xref_set_corrupt(
232	struct xfs_scrub	*sc,
233	struct xfs_buf		*bp)
234{
235	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
236	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
237}
238
239/*
240 * Record a corrupt inode.  The trace data will include the inode number
241 * and the approximate code location of the failed check.
243 */
244void
245xchk_ino_set_corrupt(
246	struct xfs_scrub	*sc,
247	xfs_ino_t		ino)
248{
249	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
250	trace_xchk_ino_error(sc, ino, __return_address);
251}
252
253/* Record a corruption while cross-referencing with an inode. */
254void
255xchk_ino_xref_set_corrupt(
256	struct xfs_scrub	*sc,
257	xfs_ino_t		ino)
258{
259	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
260	trace_xchk_ino_error(sc, ino, __return_address);
261}
262
263/* Record corruption in a block indexed by a file fork. */
264void
265xchk_fblock_set_corrupt(
266	struct xfs_scrub	*sc,
267	int			whichfork,
268	xfs_fileoff_t		offset)
269{
270	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
271	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
272}
273
274/* Record a corruption while cross-referencing a fork block. */
275void
276xchk_fblock_xref_set_corrupt(
277	struct xfs_scrub	*sc,
278	int			whichfork,
279	xfs_fileoff_t		offset)
280{
281	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT;
282	trace_xchk_fblock_error(sc, whichfork, offset, __return_address);
283}
284
285/*
286 * Warn about inodes that need administrative review but are not
287 * incorrect.
288 */
289void
290xchk_ino_set_warning(
291	struct xfs_scrub	*sc,
292	xfs_ino_t		ino)
293{
294	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
295	trace_xchk_ino_warning(sc, ino, __return_address);
296}
297
298/* Warn about a block indexed by a file fork that needs review. */
299void
300xchk_fblock_set_warning(
301	struct xfs_scrub	*sc,
302	int			whichfork,
303	xfs_fileoff_t		offset)
304{
305	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING;
306	trace_xchk_fblock_warning(sc, whichfork, offset, __return_address);
307}
308
309/* Signal an incomplete scrub. */
310void
311xchk_set_incomplete(
312	struct xfs_scrub	*sc)
313{
314	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_INCOMPLETE;
315	trace_xchk_incomplete(sc, __return_address);
316}
317
318/*
319 * rmap scrubbing -- compute the number of blocks with a given owner,
320 * at least according to the reverse mapping data.
321 */
322
323struct xchk_rmap_ownedby_info {
324	const struct xfs_owner_info	*oinfo;
325	xfs_filblks_t			*blocks;
326};
327
328STATIC int
329xchk_count_rmap_ownedby_irec(
330	struct xfs_btree_cur		*cur,
331	const struct xfs_rmap_irec	*rec,
332	void				*priv)
333{
334	struct xchk_rmap_ownedby_info	*sroi = priv;
335	bool				irec_attr;
336	bool				oinfo_attr;
337
338	irec_attr = rec->rm_flags & XFS_RMAP_ATTR_FORK;
339	oinfo_attr = sroi->oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK;
340
341	if (rec->rm_owner != sroi->oinfo->oi_owner)
342		return 0;
343
344	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || irec_attr == oinfo_attr)
345		(*sroi->blocks) += rec->rm_blockcount;
346
347	return 0;
348}
349
350/*
351 * Calculate the number of blocks the rmap thinks are owned by something.
352 * The caller should pass us an rmapbt cursor.
353 */
354int
355xchk_count_rmap_ownedby_ag(
356	struct xfs_scrub		*sc,
357	struct xfs_btree_cur		*cur,
358	const struct xfs_owner_info	*oinfo,
359	xfs_filblks_t			*blocks)
360{
361	struct xchk_rmap_ownedby_info	sroi = {
362		.oinfo			= oinfo,
363		.blocks			= blocks,
364	};
365
366	*blocks = 0;
367	return xfs_rmap_query_all(cur, xchk_count_rmap_ownedby_irec,
368			&sroi);
369}
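
/*
 * Sketch of the intended calling pattern (an illustrative, simplified
 * caller; assumes the XFS_RMAP_OINFO_AG owner constant from the rmap code
 * and an rmapbt cursor already set up in sc->sa):
 *
 *	xfs_filblks_t	blocks;
 *
 *	error = xchk_count_rmap_ownedby_ag(sc, sc->sa.rmap_cur,
 *			&XFS_RMAP_OINFO_AG, &blocks);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.rmap_cur))
 *		return;
 */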
370
371/*
372 * AG scrubbing
373 *
374 * These helpers facilitate locking an allocation group's header
375 * buffers, setting up cursors for all btrees that are present, and
376 * cleaning everything up once we're through.
377 */
378
379/* Decide if we want to return an AG header read failure. */
380static inline bool
381want_ag_read_header_failure(
382	struct xfs_scrub	*sc,
383	unsigned int		type)
384{
385	/* Return all AG header read failures when scanning btrees. */
386	if (sc->sm->sm_type != XFS_SCRUB_TYPE_AGF &&
387	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGFL &&
388	    sc->sm->sm_type != XFS_SCRUB_TYPE_AGI)
389		return true;
390	/*
391	 * If we're scanning a given type of AG header, we only want to
392	 * see read failures from that specific header.  We'd like the
393	 * other headers to cross-check them, but this isn't required.
394	 */
395	if (sc->sm->sm_type == type)
396		return true;
397	return false;
398}
399
400/*
401 * Grab the AG header buffers for the attached perag structure.
402 *
403 * The headers should be released by xchk_ag_free, but as a fail safe we attach
404 * all the buffers we grab to the scrub transaction so they'll all be freed
405 * when we cancel it.
406 */
407static inline int
408xchk_perag_read_headers(
409	struct xfs_scrub	*sc,
410	struct xchk_ag		*sa)
411{
412	int			error;
413
414	error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp);
415	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI))
416		return error;
417
418	error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp);
419	if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
420		return error;
421
422	return 0;
423}
424
425/*
426 * Grab the AG headers for the attached perag structure and wait for pending
427 * intents to drain.
428 */
429static int
430xchk_perag_drain_and_lock(
431	struct xfs_scrub	*sc)
432{
433	struct xchk_ag		*sa = &sc->sa;
434	int			error = 0;
435
436	ASSERT(sa->pag != NULL);
437	ASSERT(sa->agi_bp == NULL);
438	ASSERT(sa->agf_bp == NULL);
439
440	do {
441		if (xchk_should_terminate(sc, &error))
442			return error;
443
444		error = xchk_perag_read_headers(sc, sa);
445		if (error)
446			return error;
447
448		/*
449		 * If we've grabbed an inode for scrubbing then we assume that
450		 * holding its ILOCK will suffice to coordinate with any intent
451		 * chains involving this inode.
452		 */
453		if (sc->ip)
454			return 0;
455
456		/*
457		 * Decide if this AG is quiet enough for all metadata to be
458		 * consistent with each other.  XFS allows the AG header buffer
459		 * locks to cycle across transaction rolls while processing
460		 * chains of deferred ops, which means that there could be
461		 * other threads in the middle of processing a chain of
462		 * deferred ops.  For regular operations we are careful about
463		 * ordering operations to prevent collisions between threads
464		 * (which is why we don't need a per-AG lock), but scrub and
465		 * repair have to serialize against chained operations.
466		 *
467		 * We just locked all the AG header buffers; now take a look
468		 * to see if there are any intents in progress.  If there are,
469		 * drop the AG headers and wait for the intents to drain.
470		 * Since we hold all the AG header locks for the duration of
471		 * the scrub, this is the only time we have to sample the
472		 * intents counter; any threads increasing it after this point
473		 * can't possibly be in the middle of a chain of AG metadata
474		 * updates.
475		 *
476		 * Obviously, this should be slanted against scrub and in favor
477		 * of runtime threads.
478		 */
479		if (!xfs_perag_intent_busy(sa->pag))
480			return 0;
481
482		if (sa->agf_bp) {
483			xfs_trans_brelse(sc->tp, sa->agf_bp);
484			sa->agf_bp = NULL;
485		}
486
487		if (sa->agi_bp) {
488			xfs_trans_brelse(sc->tp, sa->agi_bp);
489			sa->agi_bp = NULL;
490		}
491
492		if (!(sc->flags & XCHK_FSGATES_DRAIN))
493			return -ECHRNG;
494		error = xfs_perag_intent_drain(sa->pag);
495		if (error == -ERESTARTSYS)
496			error = -EINTR;
497	} while (!error);
498
499	return error;
500}
501
502/*
503 * Grab the per-AG structure, grab all AG header buffers, and wait until there
504 * aren't any pending intents.  Returns -ENOENT if we can't grab the perag
505 * structure.
506 */
507int
508xchk_ag_read_headers(
509	struct xfs_scrub	*sc,
510	xfs_agnumber_t		agno,
511	struct xchk_ag		*sa)
512{
513	struct xfs_mount	*mp = sc->mp;
514
515	ASSERT(!sa->pag);
516	sa->pag = xfs_perag_get(mp, agno);
517	if (!sa->pag)
518		return -ENOENT;
519
520	return xchk_perag_drain_and_lock(sc);
521}
522
523/* Release all the AG btree cursors. */
524void
525xchk_ag_btcur_free(
526	struct xchk_ag		*sa)
527{
528	if (sa->refc_cur)
529		xfs_btree_del_cursor(sa->refc_cur, XFS_BTREE_ERROR);
530	if (sa->rmap_cur)
531		xfs_btree_del_cursor(sa->rmap_cur, XFS_BTREE_ERROR);
532	if (sa->fino_cur)
533		xfs_btree_del_cursor(sa->fino_cur, XFS_BTREE_ERROR);
534	if (sa->ino_cur)
535		xfs_btree_del_cursor(sa->ino_cur, XFS_BTREE_ERROR);
536	if (sa->cnt_cur)
537		xfs_btree_del_cursor(sa->cnt_cur, XFS_BTREE_ERROR);
538	if (sa->bno_cur)
539		xfs_btree_del_cursor(sa->bno_cur, XFS_BTREE_ERROR);
540
541	sa->refc_cur = NULL;
542	sa->rmap_cur = NULL;
543	sa->fino_cur = NULL;
544	sa->ino_cur = NULL;
545	sa->bno_cur = NULL;
546	sa->cnt_cur = NULL;
547}
548
549/* Initialize all the btree cursors for an AG. */
550void
551xchk_ag_btcur_init(
552	struct xfs_scrub	*sc,
553	struct xchk_ag		*sa)
554{
555	struct xfs_mount	*mp = sc->mp;
556
557	if (sa->agf_bp &&
558	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
559		/* Set up a bnobt cursor for cross-referencing. */
560		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
561				sa->pag, XFS_BTNUM_BNO);
562	}
563
564	if (sa->agf_bp &&
565	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
566		/* Set up a cntbt cursor for cross-referencing. */
567		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
568				sa->pag, XFS_BTNUM_CNT);
569	}
570
571	/* Set up an inobt cursor for cross-referencing. */
572	if (sa->agi_bp &&
573	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
574		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
575				XFS_BTNUM_INO);
576	}
577
578	/* Set up a finobt cursor for cross-referencing. */
579	if (sa->agi_bp && xfs_has_finobt(mp) &&
580	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
581		sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
582				XFS_BTNUM_FINO);
583	}
584
585	/* Set up a rmapbt cursor for cross-referencing. */
586	if (sa->agf_bp && xfs_has_rmapbt(mp) &&
587	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
588		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
589				sa->pag);
590	}
591
592	/* Set up a refcountbt cursor for cross-referencing. */
593	if (sa->agf_bp && xfs_has_reflink(mp) &&
594	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
595		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
596				sa->agf_bp, sa->pag);
597	}
598}
599
600/* Release the AG header context and btree cursors. */
601void
602xchk_ag_free(
603	struct xfs_scrub	*sc,
604	struct xchk_ag		*sa)
605{
606	xchk_ag_btcur_free(sa);
607	if (sa->agf_bp) {
608		xfs_trans_brelse(sc->tp, sa->agf_bp);
609		sa->agf_bp = NULL;
610	}
611	if (sa->agi_bp) {
612		xfs_trans_brelse(sc->tp, sa->agi_bp);
613		sa->agi_bp = NULL;
614	}
615	if (sa->pag) {
616		xfs_perag_put(sa->pag);
617		sa->pag = NULL;
618	}
619}
620
621/*
622 * For scrub, grab the perag structure, the AGI, and the AGF headers, in that
623 * order.  Locking order requires us to get the AGI before the AGF.  We use the
624 * transaction to avoid deadlocking on crosslinked metadata buffers; either the
625 * caller passes one in (bmap scrub) or we have to create a transaction
626 * ourselves.  Returns ENOENT if the perag struct cannot be grabbed.
627 */
628int
629xchk_ag_init(
630	struct xfs_scrub	*sc,
631	xfs_agnumber_t		agno,
632	struct xchk_ag		*sa)
633{
634	int			error;
635
636	error = xchk_ag_read_headers(sc, agno, sa);
637	if (error)
638		return error;
639
640	xchk_ag_btcur_init(sc, sa);
641	return 0;
642}
643
644/* Per-scrubber setup functions */
645
646void
647xchk_trans_cancel(
648	struct xfs_scrub	*sc)
649{
650	xfs_trans_cancel(sc->tp);
651	sc->tp = NULL;
652}
653
654/*
655 * Grab an empty transaction so that we can re-grab locked buffers if
656 * one of our btrees turns out to be cyclic.
657 *
658 * If we're going to repair something, we need to ask for the largest possible
659 * log reservation so that we can handle the worst case scenario for metadata
660 * updates while rebuilding a metadata item.  We also need to reserve as many
661 * blocks in the head transaction as we think we're going to need to rebuild
662 * the metadata object.
663 */
664int
665xchk_trans_alloc(
666	struct xfs_scrub	*sc,
667	uint			resblks)
668{
669	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
670		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
671				resblks, 0, 0, &sc->tp);
672
673	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
674}
675
676/* Set us up with a transaction and an empty context. */
677int
678xchk_setup_fs(
679	struct xfs_scrub	*sc)
680{
681	uint			resblks;
682
683	resblks = xrep_calc_ag_resblks(sc);
684	return xchk_trans_alloc(sc, resblks);
685}
686
687/* Set us up with AG headers and btree cursors. */
688int
689xchk_setup_ag_btree(
690	struct xfs_scrub	*sc,
691	bool			force_log)
692{
693	struct xfs_mount	*mp = sc->mp;
694	int			error;
695
696	/*
697	 * If the caller asks us to checkpont the log, do so.  This
698	 * If the caller asks us to checkpoint the log, do so.  This
699	 * as a last resort.  Any caller that sets force_log should
700	 * document why they need to do so.
701	 */
702	if (force_log) {
703		error = xchk_checkpoint_log(mp);
704		if (error)
705			return error;
706	}
707
708	error = xchk_setup_fs(sc);
709	if (error)
710		return error;
711
712	return xchk_ag_init(sc, sc->sm->sm_agno, &sc->sa);
713}
714
715/* Push everything out of the log onto disk. */
716int
717xchk_checkpoint_log(
718	struct xfs_mount	*mp)
719{
720	int			error;
721
722	error = xfs_log_force(mp, XFS_LOG_SYNC);
723	if (error)
724		return error;
725	xfs_ail_push_all_sync(mp->m_ail);
726	return 0;
727}
728
729/* Verify that an inode is allocated ondisk, then return its cached inode. */
730int
731xchk_iget(
732	struct xfs_scrub	*sc,
733	xfs_ino_t		inum,
734	struct xfs_inode	**ipp)
735{
736	return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
737}
738
739/*
740 * Try to grab an inode in a manner that avoids races with physical inode
741 * allocation.  If we can't, return the locked AGI buffer so that the caller
742 * can single-step the loading process to see where things went wrong.
743 * Callers must have a valid scrub transaction.
744 *
745 * If the iget succeeds, return 0, a NULL AGI, and the inode.
746 *
747 * If the iget fails, return the error, the locked AGI, and a NULL inode.  This
748 * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
749 * no longer allocated; or any other corruption or runtime error.
750 *
751 * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
752 *
753 * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
754 */
755int
756xchk_iget_agi(
757	struct xfs_scrub	*sc,
758	xfs_ino_t		inum,
759	struct xfs_buf		**agi_bpp,
760	struct xfs_inode	**ipp)
761{
762	struct xfs_mount	*mp = sc->mp;
763	struct xfs_trans	*tp = sc->tp;
764	struct xfs_perag	*pag;
765	int			error;
766
767	ASSERT(sc->tp != NULL);
768
769again:
770	*agi_bpp = NULL;
771	*ipp = NULL;
772	error = 0;
773
774	if (xchk_should_terminate(sc, &error))
775		return error;
776
777	/*
778	 * Attach the AGI buffer to the scrub transaction to avoid deadlocks
779	 * in the iget cache miss path.
780	 */
781	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
782	error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
783	xfs_perag_put(pag);
784	if (error)
785		return error;
786
787	error = xfs_iget(mp, tp, inum,
788			XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
789	if (error == -EAGAIN) {
790		/*
791		 * The inode may be in core but temporarily unavailable and may
792		 * require the AGI buffer before it can be returned.  Drop the
793		 * AGI buffer and retry the lookup.
794		 *
795		 * Incore lookup will fail with EAGAIN on a cache hit if the
796		 * inode is queued to the inactivation list.  The inactivation
797		 * worker may remove the inode from the unlinked list and hence
798		 * needs the AGI.
799		 *
800		 * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
801		 * to allow inodegc to make progress and move the inode to
802		 * IRECLAIMABLE state where xfs_iget will be able to return it
803		 * again if it can lock the inode.
804		 */
805		xfs_trans_brelse(tp, *agi_bpp);
806		delay(1);
807		goto again;
808	}
809	if (error)
810		return error;
811
812	/* We got the inode, so we can release the AGI. */
813	ASSERT(*ipp != NULL);
814	xfs_trans_brelse(tp, *agi_bpp);
815	*agi_bpp = NULL;
816	return 0;
817}
818
819/* Install an inode that we opened by handle for scrubbing. */
820int
821xchk_install_handle_inode(
822	struct xfs_scrub	*sc,
823	struct xfs_inode	*ip)
824{
825	if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
826		xchk_irele(sc, ip);
827		return -ENOENT;
828	}
829
830	sc->ip = ip;
831	return 0;
832}
833
834/*
835 * Install an already-referenced inode for scrubbing.  Get our own reference to
836 * the inode to make disposal simpler.  The inode must not be in I_FREEING or
837 * I_WILL_FREE state!
838 */
839int
840xchk_install_live_inode(
841	struct xfs_scrub	*sc,
842	struct xfs_inode	*ip)
843{
844	if (!igrab(VFS_I(ip))) {
845		xchk_ino_set_corrupt(sc, ip->i_ino);
846		return -EFSCORRUPTED;
847	}
848
849	sc->ip = ip;
850	return 0;
851}
852
853/*
854 * In preparation to scrub metadata structures that hang off of an inode,
855 * grab either the inode referenced in the scrub control structure or the
856 * inode passed in.  If the inumber does not reference an allocated inode
857 * record, the function returns ENOENT to end the scrub early.  The inode
858 * is not locked.
859 */
860int
861xchk_iget_for_scrubbing(
862	struct xfs_scrub	*sc)
863{
864	struct xfs_imap		imap;
865	struct xfs_mount	*mp = sc->mp;
866	struct xfs_perag	*pag;
867	struct xfs_buf		*agi_bp;
868	struct xfs_inode	*ip_in = XFS_I(file_inode(sc->file));
869	struct xfs_inode	*ip = NULL;
870	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
871	int			error;
872
873	ASSERT(sc->tp == NULL);
874
875	/* We want to scan the inode we already had opened. */
876	if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
877		return xchk_install_live_inode(sc, ip_in);
878
879	/* Reject internal metadata files and obviously bad inode numbers. */
880	if (xfs_internal_inum(mp, sc->sm->sm_ino))
881		return -ENOENT;
882	if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
883		return -ENOENT;
884
885	/* Try a regular untrusted iget. */
886	error = xchk_iget(sc, sc->sm->sm_ino, &ip);
887	if (!error)
888		return xchk_install_handle_inode(sc, ip);
889	if (error == -ENOENT)
890		return error;
891	if (error != -EINVAL)
892		goto out_error;
893
894	/*
895	 * EINVAL with IGET_UNTRUSTED probably means one of several things:
896	 * userspace gave us an inode number that doesn't correspond to fs
897	 * space; the inode btree lacks a record for this inode; or there is a
898	 * record, and it says this inode is free.
899	 *
900	 * We want to look up this inode in the inobt to distinguish two
901	 * scenarios: (1) the inobt says the inode is free, in which case
902	 * there's nothing to do; and (2) the inobt says the inode is
903	 * allocated, but loading it failed due to corruption.
904	 *
905	 * Allocate a transaction and grab the AGI to prevent inobt activity
906	 * in this AG.  Retry the iget in case someone allocated a new inode
907	 * after the first iget failed.
908	 */
909	error = xchk_trans_alloc(sc, 0);
910	if (error)
911		goto out_error;
912
913	error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
914	if (error == 0) {
915		/* Actually got the inode, so install it. */
916		xchk_trans_cancel(sc);
917		return xchk_install_handle_inode(sc, ip);
918	}
919	if (error == -ENOENT)
920		goto out_gone;
921	if (error != -EINVAL)
922		goto out_cancel;
923
924	/* Ensure that we have protected against inode allocation/freeing. */
925	if (agi_bp == NULL) {
926		ASSERT(agi_bp != NULL);
927		error = -ECANCELED;
928		goto out_cancel;
929	}
930
931	/*
932	 * Untrusted iget failed a second time.  Let's try an inobt lookup.
933	 * If the inobt says that the inode cannot exist inside the
934	 * filesystem or is not allocated, return ENOENT to signal that the
935	 * check can be skipped.
936	 *
937	 * If the lookup returns corruption, we'll mark this inode corrupt and
938	 * exit to userspace.  There's little chance of fixing anything until
939	 * the inobt is straightened out, but there's nothing we can do here.
940	 *
941	 * If the lookup encounters any other error, exit to userspace.
942	 *
943	 * If the lookup succeeds, something else must be very wrong in the fs
944	 * such that setting up the incore inode failed in some strange way.
945	 * Treat those as corruptions.
946	 */
947	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
948	if (!pag) {
949		error = -EFSCORRUPTED;
950		goto out_cancel;
951	}
952
953	error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
954			XFS_IGET_UNTRUSTED);
955	xfs_perag_put(pag);
956	if (error == -EINVAL || error == -ENOENT)
957		goto out_gone;
958	if (!error)
959		error = -EFSCORRUPTED;
960
961out_cancel:
962	xchk_trans_cancel(sc);
963out_error:
964	trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
965			error, __return_address);
966	return error;
967out_gone:
968	/* The file is gone, so there's nothing to check. */
969	xchk_trans_cancel(sc);
970	return -ENOENT;
971}
972
973/* Release an inode, possibly dropping it in the process. */
974void
975xchk_irele(
976	struct xfs_scrub	*sc,
977	struct xfs_inode	*ip)
978{
979	if (current->journal_info != NULL) {
980		ASSERT(current->journal_info == sc->tp);
981
982		/*
983		 * If we are in a transaction, we /cannot/ drop the inode
984		 * ourselves, because the VFS will trigger writeback, which
985		 * can require a transaction.  Clear DONTCACHE to force the
986		 * inode to the LRU, where someone else can take care of
987		 * dropping it.
988		 *
989		 * Note that when we grabbed our reference to the inode, it
990		 * could have had an active ref and DONTCACHE set if a sysadmin
991		 * is trying to coerce a change in file access mode.  icache
992		 * hits do not clear DONTCACHE, so we must do it here.
993		 */
994		spin_lock(&VFS_I(ip)->i_lock);
995		VFS_I(ip)->i_state &= ~I_DONTCACHE;
996		spin_unlock(&VFS_I(ip)->i_lock);
997	} else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
998		/*
999		 * If this is the last reference to the inode and the caller
1000		 * permits it, set DONTCACHE to avoid thrashing.
1001		 */
1002		d_mark_dontcache(VFS_I(ip));
1003	}
1004
1005	xfs_irele(ip);
1006}
1007
1008/*
1009 * Set us up to scrub metadata mapped by a file's fork.  Callers must not use
1010 * this to operate on user-accessible regular file data because the MMAPLOCK is
1011 * not taken.
1012 */
1013int
1014xchk_setup_inode_contents(
1015	struct xfs_scrub	*sc,
1016	unsigned int		resblks)
1017{
1018	int			error;
1019
1020	error = xchk_iget_for_scrubbing(sc);
1021	if (error)
1022		return error;
1023
1024	/* Lock the inode so the VFS cannot touch this file. */
1025	xchk_ilock(sc, XFS_IOLOCK_EXCL);
1026
1027	error = xchk_trans_alloc(sc, resblks);
1028	if (error)
1029		goto out;
1030	xchk_ilock(sc, XFS_ILOCK_EXCL);
1031out:
1032	/* scrub teardown will unlock and release the inode for us */
1033	return error;
1034}
1035
1036void
1037xchk_ilock(
1038	struct xfs_scrub	*sc,
1039	unsigned int		ilock_flags)
1040{
1041	xfs_ilock(sc->ip, ilock_flags);
1042	sc->ilock_flags |= ilock_flags;
1043}
1044
1045bool
1046xchk_ilock_nowait(
1047	struct xfs_scrub	*sc,
1048	unsigned int		ilock_flags)
1049{
1050	if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
1051		sc->ilock_flags |= ilock_flags;
1052		return true;
1053	}
1054
1055	return false;
1056}
1057
1058void
1059xchk_iunlock(
1060	struct xfs_scrub	*sc,
1061	unsigned int		ilock_flags)
1062{
1063	sc->ilock_flags &= ~ilock_flags;
1064	xfs_iunlock(sc->ip, ilock_flags);
1065}
1066
1067/*
1068 * Predicate that decides if we need to evaluate the cross-reference check.
1069 * If there was an error accessing the cross-reference btree, just delete
1070 * the cursor and skip the check.
1071 */
1072bool
1073xchk_should_check_xref(
1074	struct xfs_scrub	*sc,
1075	int			*error,
1076	struct xfs_btree_cur	**curpp)
1077{
1078	/* No point in xref if we already know we're corrupt. */
1079	if (xchk_skip_xref(sc->sm))
1080		return false;
1081
1082	if (*error == 0)
1083		return true;
1084
1085	if (curpp) {
1086		/* If we've already given up on xref, just bail out. */
1087		if (!*curpp)
1088			return false;
1089
1090		/* xref error, delete cursor and bail out. */
1091		xfs_btree_del_cursor(*curpp, XFS_BTREE_ERROR);
1092		*curpp = NULL;
1093	}
1094
1095	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
1096	trace_xchk_xref_error(sc, *error, __return_address);
1097
1098	/*
1099	 * Errors encountered during cross-referencing with another
1100	 * data structure should not cause this scrubber to abort.
1101	 */
1102	*error = 0;
1103	return false;
1104}
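
/*
 * Sketch of a cross-reference check (illustrative; the bnobt lookup below
 * stands in for whatever xref query a scrubber makes, and "expected" and
 * "bp" are hypothetical locals of the caller):
 *
 *	error = xfs_alloc_lookup_le(sc->sa.bno_cur, agbno, len, &has);
 *	if (!xchk_should_check_xref(sc, &error, &sc->sa.bno_cur))
 *		return;
 *	if (has != expected)
 *		xchk_block_xref_set_corrupt(sc, bp);
 */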
1105
1106/* Run the structure verifiers on in-memory buffers to detect bad memory. */
1107void
1108xchk_buffer_recheck(
1109	struct xfs_scrub	*sc,
1110	struct xfs_buf		*bp)
1111{
1112	xfs_failaddr_t		fa;
1113
1114	if (bp->b_ops == NULL) {
1115		xchk_block_set_corrupt(sc, bp);
1116		return;
1117	}
1118	if (bp->b_ops->verify_struct == NULL) {
1119		xchk_set_incomplete(sc);
1120		return;
1121	}
1122	fa = bp->b_ops->verify_struct(bp);
1123	if (!fa)
1124		return;
1125	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
1126	trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
1127}
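
/*
 * Sketch of usage (illustrative): btree scrubbers can re-run the verifier
 * on each buffer backing a block they walk to catch in-memory corruption.
 * Root blocks of inode-rooted btrees have no buffer, hence the NULL check:
 *
 *	block = xfs_btree_get_block(cur, level, &bp);
 *	if (bp)
 *		xchk_buffer_recheck(sc, bp);
 */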
1128
1129static inline int
1130xchk_metadata_inode_subtype(
1131	struct xfs_scrub	*sc,
1132	unsigned int		scrub_type)
1133{
1134	__u32			smtype = sc->sm->sm_type;
1135	int			error;
1136
1137	sc->sm->sm_type = scrub_type;
1138
1139	switch (scrub_type) {
1140	case XFS_SCRUB_TYPE_INODE:
1141		error = xchk_inode(sc);
1142		break;
1143	case XFS_SCRUB_TYPE_BMBTD:
1144		error = xchk_bmap_data(sc);
1145		break;
1146	default:
1147		ASSERT(0);
1148		error = -EFSCORRUPTED;
1149		break;
1150	}
1151
1152	sc->sm->sm_type = smtype;
1153	return error;
1154}
1155
1156/*
1157 * Scrub the attr/data forks of a metadata inode.  The metadata inode must be
1158 * pointed to by sc->ip and the ILOCK must be held.
1159 */
1160int
1161xchk_metadata_inode_forks(
1162	struct xfs_scrub	*sc)
1163{
1164	bool			shared;
1165	int			error;
1166
1167	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1168		return 0;
1169
1170	/* Check the inode record. */
1171	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1172	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1173		return error;
1174
1175	/* Metadata inodes don't live on the rt device. */
1176	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
1177		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1178		return 0;
1179	}
1180
1181	/* They should never participate in reflink. */
1182	if (xfs_is_reflink_inode(sc->ip)) {
1183		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1184		return 0;
1185	}
1186
1187	/* They also should never have extended attributes. */
1188	if (xfs_inode_hasattr(sc->ip)) {
1189		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1190		return 0;
1191	}
1192
1193	/* Invoke the data fork scrubber. */
1194	error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1195	if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
1196		return error;
1197
1198	/* Look for incorrect shared blocks. */
1199	if (xfs_has_reflink(sc->mp)) {
1200		error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
1201				&shared);
1202		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0,
1203				&error))
1204			return error;
1205		if (shared)
1206			xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1207	}
1208
1209	return 0;
1210}
1211
1212/*
1213 * Enable filesystem hooks (i.e. runtime code patching) before starting a scrub
1214 * operation.  Callers must not hold any locks that intersect with the CPU
1215 * hotplug lock (e.g. writeback locks) because code patching must halt the CPUs
1216 * to change kernel code.
1217 */
1218void
1219xchk_fsgates_enable(
1220	struct xfs_scrub	*sc,
1221	unsigned int		scrub_fsgates)
1222{
1223	ASSERT(!(scrub_fsgates & ~XCHK_FSGATES_ALL));
1224	ASSERT(!(sc->flags & scrub_fsgates));
1225
1226	trace_xchk_fsgates_enable(sc, scrub_fsgates);
1227
1228	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
1229		xfs_drain_wait_enable();
1230
1231	sc->flags |= scrub_fsgates;
1232}
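
/*
 * Sketch of the intended pattern (illustrative): when the AG locking
 * helpers above bail out with -ECHRNG because the drain gate is not yet
 * enabled, the scrub front end can tear down, enable the gate, and retry:
 *
 *	xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 *	(then repeat the setup and scrub calls)
 */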
1233
1234/*
1235 * Decide if this is a cached inode that's also allocated.  The caller
1236 * must hold a reference to an AG and the AGI buffer lock to prevent inodes
1237 * from being allocated or freed.
1238 *
1239 * Look up an inode by number in the given file system.  If the inode number
1240 * is invalid, return -EINVAL.  If the inode is not in cache, return -ENODATA.
1241 * If the inode is being reclaimed, return -ENODATA because we know the inode
1242 * cache cannot be updating the ondisk metadata.
1243 *
1244 * Otherwise, the incore inode is the one we want, and it is either live,
1245 * somewhere in the inactivation machinery, or reclaimable.  The inode is
1246 * allocated if i_mode is nonzero.  In all three cases, the cached inode will
1247 * be more up to date than the ondisk inode buffer, so we must use the incore
1248 * i_mode.
1249 */
1250int
1251xchk_inode_is_allocated(
1252	struct xfs_scrub	*sc,
1253	xfs_agino_t		agino,
1254	bool			*inuse)
1255{
1256	struct xfs_mount	*mp = sc->mp;
1257	struct xfs_perag	*pag = sc->sa.pag;
1258	xfs_ino_t		ino;
1259	struct xfs_inode	*ip;
1260	int			error;
1261
1262	/* caller must hold perag reference */
1263	if (pag == NULL) {
1264		ASSERT(pag != NULL);
1265		return -EINVAL;
1266	}
1267
1268	/* caller must have AGI buffer */
1269	if (sc->sa.agi_bp == NULL) {
1270		ASSERT(sc->sa.agi_bp != NULL);
1271		return -EINVAL;
1272	}
1273
1274	/* reject inode numbers outside existing AGs */
1275	ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
1276	if (!xfs_verify_ino(mp, ino))
1277		return -EINVAL;
1278
1279	error = -ENODATA;
1280	rcu_read_lock();
1281	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
1282	if (!ip) {
1283		/* cache miss */
1284		goto out_rcu;
1285	}
1286
1287	/*
1288	 * If the inode number doesn't match, the incore inode got reused
1289	 * during an RCU grace period and the radix tree hasn't been updated.
1290	 * This isn't the inode we want.
1291	 */
1292	spin_lock(&ip->i_flags_lock);
1293	if (ip->i_ino != ino)
1294		goto out_skip;
1295
1296	trace_xchk_inode_is_allocated(ip);
1297
1298	/*
1299	 * We have an incore inode that matches the inode we want, and the
1300	 * caller holds the perag structure and the AGI buffer.  Let's check
1301	 * our assumptions below:
1302	 */
1303
1304#ifdef DEBUG
1305	/*
1306	 * (1) If the incore inode is live (i.e. referenced from the dcache),
1307	 * it will not be INEW, nor will it be in the inactivation or reclaim
1308	 * machinery.  The ondisk inode had better be allocated.  This is the
1309	 * most trivial case.
1310	 */
1311	if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
1312			     XFS_INACTIVATING))) {
1313		/* live inode */
1314		ASSERT(VFS_I(ip)->i_mode != 0);
1315	}
1316
1317	/*
1318	 * If the incore inode is INEW, there are several possibilities:
1319	 *
1320	 * (2) For a file that is being created, note that we allocate the
1321	 * ondisk inode before allocating, initializing, and adding the incore
1322	 * inode to the radix tree.
1323	 *
1324	 * (3) If the incore inode is being recycled, the inode has to be
1325	 * allocated because we don't allow freed inodes to be recycled.
1326	 * Recycling doesn't touch i_mode.
1327	 */
1328	if (ip->i_flags & XFS_INEW) {
1329		/* created on disk already or recycling */
1330		ASSERT(VFS_I(ip)->i_mode != 0);
1331	}
1332
1333	/*
1334	 * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
1335	 * inactivation has not started (!INACTIVATING), it is still allocated.
1336	 */
1337	if ((ip->i_flags & XFS_NEED_INACTIVE) &&
1338	    !(ip->i_flags & XFS_INACTIVATING)) {
1339		/* definitely before difree */
1340		ASSERT(VFS_I(ip)->i_mode != 0);
1341	}
1342#endif
1343
1344	/*
1345	 * If the incore inode is undergoing inactivation (INACTIVATING), there
1346	 * are two possibilities:
1347	 *
1348	 * (5) It is before the point where it would get freed ondisk, in which
1349	 * case i_mode is still nonzero.
1350	 *
1351	 * (6) It has already been freed, in which case i_mode is zero.
1352	 *
1353	 * We don't take the ILOCK here, but difree and dialloc update the AGI,
1354	 * and we've taken the AGI buffer lock, which prevents that from
1355	 * happening.
1356	 */
1357
1358	/*
1359	 * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
1360	 * reclaim (IRECLAIMABLE) could be allocated or free.  i_mode still
1361	 * reflects the ondisk state.
1362	 */
1363
1364	/*
1365	 * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
1366	 * the flush code uses i_mode to format the ondisk inode.
1367	 */
1368
1369	/*
1370	 * (9) If the inode is in IRECLAIM and was reachable via the radix
1371	 * tree, it still has the same i_mode as it did before it entered
1372	 * reclaim.  The inode object is still alive because we hold the RCU
1373	 * read lock.
1374	 */
1375
1376	*inuse = VFS_I(ip)->i_mode != 0;
1377	error = 0;
1378
1379out_skip:
1380	spin_unlock(&ip->i_flags_lock);
1381out_rcu:
1382	rcu_read_unlock();
1383	return error;
1384}
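
/*
 * Sketch of usage (illustrative): a scrubber that already holds the AGI
 * can cross-check an inode btree record against the incore state.
 * "irec_says_free" is a hypothetical flag derived from the ondisk record:
 *
 *	bool	inuse;
 *
 *	error = xchk_inode_is_allocated(sc, agino, &inuse);
 *	if (!error && irec_says_free && inuse)
 *		...mark the inobt record corrupt...
 */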
1385