/* xref: /kernel/linux/linux-5.10/fs/btrfs/scrub.c (revision 8c2ecf20) */
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4 */
5
6#include <linux/blkdev.h>
7#include <linux/ratelimit.h>
8#include <linux/sched/mm.h>
9#include <crypto/hash.h>
10#include "ctree.h"
11#include "discard.h"
12#include "volumes.h"
13#include "disk-io.h"
14#include "ordered-data.h"
15#include "transaction.h"
16#include "backref.h"
17#include "extent_io.h"
18#include "dev-replace.h"
19#include "check-integrity.h"
20#include "rcu-string.h"
21#include "raid56.h"
22#include "block-group.h"
23
24/*
25 * This is only the first step towards a full-featured scrub. It reads all
26 * extents and super blocks and verifies the checksums. In case a bad checksum
27 * is found or the extent cannot be read, good data will be written back if
28 * any can be found.
29 *
30 * Future enhancements:
31 *  - In case an unrepairable extent is encountered, track which files are
32 *    affected and report them
33 *  - track and record media errors, throw out bad devices
34 *  - add a mode to also read unallocated space
35 */
36
37struct scrub_block;
38struct scrub_ctx;
39
40/*
41 * The following three values only influence performance.
42 * The last one configures the number of parallel and outstanding I/O
43 * operations. The first two values configure an upper limit for the number
44 * of (dynamically allocated) pages that are added to a bio.
45 */
46#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
47#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
48#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
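/*
 * The "128k per bio" and "8MB per device in flight" figures above assume a
 * 4KiB PAGE_SIZE: 32 pages * 4KiB = 128KiB per bio, and 64 read bios *
 * 128KiB = 8MiB outstanding per scrub context.
 */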
49
50/*
51 * the following value times PAGE_SIZE needs to be large enough to match the
52 * largest node/leaf/sector size that shall be supported.
53 * Values larger than BTRFS_STRIPE_LEN are not supported.
54 */
55#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
56
57struct scrub_recover {
58	refcount_t		refs;
59	struct btrfs_bio	*bbio;
60	u64			map_length;
61};
62
63struct scrub_page {
64	struct scrub_block	*sblock;
65	struct page		*page;
66	struct btrfs_device	*dev;
67	struct list_head	list;
68	u64			flags;  /* extent flags */
69	u64			generation;
70	u64			logical;
71	u64			physical;
72	u64			physical_for_dev_replace;
73	atomic_t		refs;
74	struct {
75		unsigned int	mirror_num:8;
76		unsigned int	have_csum:1;
77		unsigned int	io_error:1;
78	};
79	u8			csum[BTRFS_CSUM_SIZE];
80
81	struct scrub_recover	*recover;
82};
83
84struct scrub_bio {
85	int			index;
86	struct scrub_ctx	*sctx;
87	struct btrfs_device	*dev;
88	struct bio		*bio;
89	blk_status_t		status;
90	u64			logical;
91	u64			physical;
92#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
93	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
94#else
95	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
96#endif
97	int			page_count;
98	int			next_free;
99	struct btrfs_work	work;
100};
101
102struct scrub_block {
103	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
104	int			page_count;
105	atomic_t		outstanding_pages;
106	refcount_t		refs; /* free mem on transition to zero */
107	struct scrub_ctx	*sctx;
108	struct scrub_parity	*sparity;
109	struct {
110		unsigned int	header_error:1;
111		unsigned int	checksum_error:1;
112		unsigned int	no_io_error_seen:1;
113		unsigned int	generation_error:1; /* also sets header_error */
114
115		/* The following is for the data used to check parity, */
116		/* specifically for data that has a checksum */
117		unsigned int	data_corrected:1;
118	};
119	struct btrfs_work	work;
120};
121
122/* Used for the chunks with parity stripes, such as RAID5/6 */
123struct scrub_parity {
124	struct scrub_ctx	*sctx;
125
126	struct btrfs_device	*scrub_dev;
127
128	u64			logic_start;
129
130	u64			logic_end;
131
132	int			nsectors;
133
134	u64			stripe_len;
135
136	refcount_t		refs;
137
138	struct list_head	spages;
139
140	/* Work of parity check and repair */
141	struct btrfs_work	work;
142
143	/* Mark the parity blocks which have data */
144	unsigned long		*dbitmap;
145
146	/*
147	 * Mark the parity blocks which have data, but where errors happened
148	 * when reading or checking that data
149	 */
150	unsigned long		*ebitmap;
151
152	unsigned long		bitmap[];
153};
154
155struct scrub_ctx {
156	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
157	struct btrfs_fs_info	*fs_info;
158	int			first_free;
159	int			curr;
160	atomic_t		bios_in_flight;
161	atomic_t		workers_pending;
162	spinlock_t		list_lock;
163	wait_queue_head_t	list_wait;
164	u16			csum_size;
165	struct list_head	csum_list;
166	atomic_t		cancel_req;
167	int			readonly;
168	int			pages_per_rd_bio;
169
170	int			is_dev_replace;
171
172	struct scrub_bio        *wr_curr_bio;
173	struct mutex            wr_lock;
174	int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
175	struct btrfs_device     *wr_tgtdev;
176	bool                    flush_all_writes;
177
178	/*
179	 * statistics
180	 */
181	struct btrfs_scrub_progress stat;
182	spinlock_t		stat_lock;
183
184	/*
185	 * Use a ref counter to avoid use-after-free issues. Scrub workers
186	 * decrement bios_in_flight and workers_pending and then do a wakeup
187	 * on the list_wait wait queue. We must ensure the main scrub task
188	 * doesn't free the scrub context before or while the workers are
189	 * doing the wakeup() call.
190	 */
191	refcount_t              refs;
192};
193
194struct scrub_warning {
195	struct btrfs_path	*path;
196	u64			extent_item_size;
197	const char		*errstr;
198	u64			physical;
199	u64			logical;
200	struct btrfs_device	*dev;
201};
202
203struct full_stripe_lock {
204	struct rb_node node;
205	u64 logical;
206	u64 refs;
207	struct mutex mutex;
208};
209
210static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
211static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
212static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
213static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
214				     struct scrub_block *sblocks_for_recheck);
215static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
216				struct scrub_block *sblock,
217				int retry_failed_mirror);
218static void scrub_recheck_block_checksum(struct scrub_block *sblock);
219static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
220					     struct scrub_block *sblock_good);
221static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
222					    struct scrub_block *sblock_good,
223					    int page_num, int force_write);
224static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
225static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
226					   int page_num);
227static int scrub_checksum_data(struct scrub_block *sblock);
228static int scrub_checksum_tree_block(struct scrub_block *sblock);
229static int scrub_checksum_super(struct scrub_block *sblock);
230static void scrub_block_get(struct scrub_block *sblock);
231static void scrub_block_put(struct scrub_block *sblock);
232static void scrub_page_get(struct scrub_page *spage);
233static void scrub_page_put(struct scrub_page *spage);
234static void scrub_parity_get(struct scrub_parity *sparity);
235static void scrub_parity_put(struct scrub_parity *sparity);
236static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
237				    struct scrub_page *spage);
238static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
239		       u64 physical, struct btrfs_device *dev, u64 flags,
240		       u64 gen, int mirror_num, u8 *csum, int force,
241		       u64 physical_for_dev_replace);
242static void scrub_bio_end_io(struct bio *bio);
243static void scrub_bio_end_io_worker(struct btrfs_work *work);
244static void scrub_block_complete(struct scrub_block *sblock);
245static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
246			       u64 extent_logical, u64 extent_len,
247			       u64 *extent_physical,
248			       struct btrfs_device **extent_dev,
249			       int *extent_mirror_num);
250static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
251				    struct scrub_page *spage);
252static void scrub_wr_submit(struct scrub_ctx *sctx);
253static void scrub_wr_bio_end_io(struct bio *bio);
254static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
255static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
256static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
257static void scrub_put_ctx(struct scrub_ctx *sctx);
258
259static inline int scrub_is_page_on_raid56(struct scrub_page *page)
260{
261	return page->recover &&
262	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
263}
264
265static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
266{
267	refcount_inc(&sctx->refs);
268	atomic_inc(&sctx->bios_in_flight);
269}
270
271static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
272{
273	atomic_dec(&sctx->bios_in_flight);
274	wake_up(&sctx->list_wait);
275	scrub_put_ctx(sctx);
276}
277
278static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
279{
280	while (atomic_read(&fs_info->scrub_pause_req)) {
281		mutex_unlock(&fs_info->scrub_lock);
282		wait_event(fs_info->scrub_pause_wait,
283		   atomic_read(&fs_info->scrub_pause_req) == 0);
284		mutex_lock(&fs_info->scrub_lock);
285	}
286}
287
288static void scrub_pause_on(struct btrfs_fs_info *fs_info)
289{
290	atomic_inc(&fs_info->scrubs_paused);
291	wake_up(&fs_info->scrub_pause_wait);
292}
293
294static void scrub_pause_off(struct btrfs_fs_info *fs_info)
295{
296	mutex_lock(&fs_info->scrub_lock);
297	__scrub_blocked_if_needed(fs_info);
298	atomic_dec(&fs_info->scrubs_paused);
299	mutex_unlock(&fs_info->scrub_lock);
300
301	wake_up(&fs_info->scrub_pause_wait);
302}
303
304static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
305{
306	scrub_pause_on(fs_info);
307	scrub_pause_off(fs_info);
308}
309
310/*
311 * Insert a new full stripe lock into the full stripe locks tree
312 *
313 * Return pointer to existing or newly inserted full_stripe_lock structure if
314 * everything works well.
315 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
316 *
317 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
318 * function
319 */
320static struct full_stripe_lock *insert_full_stripe_lock(
321		struct btrfs_full_stripe_locks_tree *locks_root,
322		u64 fstripe_logical)
323{
324	struct rb_node **p;
325	struct rb_node *parent = NULL;
326	struct full_stripe_lock *entry;
327	struct full_stripe_lock *ret;
328
329	lockdep_assert_held(&locks_root->lock);
330
331	p = &locks_root->root.rb_node;
332	while (*p) {
333		parent = *p;
334		entry = rb_entry(parent, struct full_stripe_lock, node);
335		if (fstripe_logical < entry->logical) {
336			p = &(*p)->rb_left;
337		} else if (fstripe_logical > entry->logical) {
338			p = &(*p)->rb_right;
339		} else {
340			entry->refs++;
341			return entry;
342		}
343	}
344
345	/*
346	 * Insert new lock.
347	 */
348	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
349	if (!ret)
350		return ERR_PTR(-ENOMEM);
351	ret->logical = fstripe_logical;
352	ret->refs = 1;
353	mutex_init(&ret->mutex);
354
355	rb_link_node(&ret->node, parent, p);
356	rb_insert_color(&ret->node, &locks_root->root);
357	return ret;
358}
359
360/*
361 * Search for a full stripe lock of a block group
362 *
363 * Return pointer to existing full stripe lock if found
364 * Return NULL if not found
365 */
366static struct full_stripe_lock *search_full_stripe_lock(
367		struct btrfs_full_stripe_locks_tree *locks_root,
368		u64 fstripe_logical)
369{
370	struct rb_node *node;
371	struct full_stripe_lock *entry;
372
373	lockdep_assert_held(&locks_root->lock);
374
375	node = locks_root->root.rb_node;
376	while (node) {
377		entry = rb_entry(node, struct full_stripe_lock, node);
378		if (fstripe_logical < entry->logical)
379			node = node->rb_left;
380		else if (fstripe_logical > entry->logical)
381			node = node->rb_right;
382		else
383			return entry;
384	}
385	return NULL;
386}
387
388/*
389 * Helper to get full stripe logical from a normal bytenr.
390 *
391 * Caller must ensure @cache is a RAID56 block group.
392 */
393static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
394{
395	u64 ret;
396
397	/*
398	 * Due to chunk item size limit, full stripe length should not be
399	 * larger than U32_MAX. Just a sanity check here.
400	 */
401	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
402
403	/*
404	 * round_down() can only handle powers of 2, while the RAID56 full
405	 * stripe length can be 64KiB * n, so we need to manually round down.
406	 */
407	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
408			cache->full_stripe_len + cache->start;
409	return ret;
410}
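/*
 * A worked example of the manual round-down above, with hypothetical values
 * for a 4-device RAID5 chunk (3 data stripes, so full_stripe_len =
 * 3 * 64KiB = 196608, not a power of two):
 *
 *	cache->start           = 1048576	(1MiB)
 *	cache->full_stripe_len = 196608		(192KiB)
 *	bytenr                 = 1507328	(cache->start + 448KiB)
 *
 *	div64_u64(1507328 - 1048576, 196608) = div64_u64(458752, 196608) = 2
 *	2 * 196608 + 1048576                 = 1441792
 *
 * The full stripe containing @bytenr therefore starts at logical 1441792
 * (cache->start + 384KiB), a result plain round_down() could not produce
 * because 196608 is not a power of two.
 */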
411
412/*
413 * Lock a full stripe to avoid concurrent recovery and read
414 *
415 * It's only used for profiles with parity (RAID5/6); for other profiles it
416 * does nothing.
417 *
418 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
419 * The caller must then call unlock_full_stripe() in the same context.
420 *
421 * Return <0 if an error is encountered.
422 */
423static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
424			    bool *locked_ret)
425{
426	struct btrfs_block_group *bg_cache;
427	struct btrfs_full_stripe_locks_tree *locks_root;
428	struct full_stripe_lock *existing;
429	u64 fstripe_start;
430	int ret = 0;
431
432	*locked_ret = false;
433	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
434	if (!bg_cache) {
435		ASSERT(0);
436		return -ENOENT;
437	}
438
439	/* Profiles not based on parity don't need full stripe lock */
440	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
441		goto out;
442	locks_root = &bg_cache->full_stripe_locks_root;
443
444	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
445
446	/* Now insert the full stripe lock */
447	mutex_lock(&locks_root->lock);
448	existing = insert_full_stripe_lock(locks_root, fstripe_start);
449	mutex_unlock(&locks_root->lock);
450	if (IS_ERR(existing)) {
451		ret = PTR_ERR(existing);
452		goto out;
453	}
454	mutex_lock(&existing->mutex);
455	*locked_ret = true;
456out:
457	btrfs_put_block_group(bg_cache);
458	return ret;
459}
460
461/*
462 * Unlock a full stripe.
463 *
464 * NOTE: Caller must ensure this is called in the same context as the
465 * corresponding lock_full_stripe().
466 *
467 * Return 0 if we unlocked the full stripe without problems.
468 * Return <0 for error.
469 */
470static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
471			      bool locked)
472{
473	struct btrfs_block_group *bg_cache;
474	struct btrfs_full_stripe_locks_tree *locks_root;
475	struct full_stripe_lock *fstripe_lock;
476	u64 fstripe_start;
477	bool freeit = false;
478	int ret = 0;
479
480	/* If we didn't acquire full stripe lock, no need to continue */
481	if (!locked)
482		return 0;
483
484	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
485	if (!bg_cache) {
486		ASSERT(0);
487		return -ENOENT;
488	}
489	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
490		goto out;
491
492	locks_root = &bg_cache->full_stripe_locks_root;
493	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
494
495	mutex_lock(&locks_root->lock);
496	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
497	/* Unpaired unlock_full_stripe() detected */
498	if (!fstripe_lock) {
499		WARN_ON(1);
500		ret = -ENOENT;
501		mutex_unlock(&locks_root->lock);
502		goto out;
503	}
504
505	if (fstripe_lock->refs == 0) {
506		WARN_ON(1);
507		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
508			fstripe_lock->logical);
509	} else {
510		fstripe_lock->refs--;
511	}
512
513	if (fstripe_lock->refs == 0) {
514		rb_erase(&fstripe_lock->node, &locks_root->root);
515		freeit = true;
516	}
517	mutex_unlock(&locks_root->lock);
518
519	mutex_unlock(&fstripe_lock->mutex);
520	if (freeit)
521		kfree(fstripe_lock);
522out:
523	btrfs_put_block_group(bg_cache);
524	return ret;
525}
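/*
 * A minimal sketch of the pairing expected by lock_full_stripe() and
 * unlock_full_stripe() above, mirroring their use in
 * scrub_handle_errored_block() (error handling trimmed for brevity):
 *
 *	bool locked;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;	// no lock is held on error
 *
 *	// ... recheck/repair the block covering @logical ...
 *
 *	ret = unlock_full_stripe(fs_info, logical, locked);
 *
 * For non-RAID56 block groups @locked stays false and the unlock is a no-op,
 * so the same call pattern works for every profile.
 */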
526
527static void scrub_free_csums(struct scrub_ctx *sctx)
528{
529	while (!list_empty(&sctx->csum_list)) {
530		struct btrfs_ordered_sum *sum;
531		sum = list_first_entry(&sctx->csum_list,
532				       struct btrfs_ordered_sum, list);
533		list_del(&sum->list);
534		kfree(sum);
535	}
536}
537
538static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
539{
540	int i;
541
542	if (!sctx)
543		return;
544
545	/* this can happen when scrub is cancelled */
546	if (sctx->curr != -1) {
547		struct scrub_bio *sbio = sctx->bios[sctx->curr];
548
549		for (i = 0; i < sbio->page_count; i++) {
550			WARN_ON(!sbio->pagev[i]->page);
551			scrub_block_put(sbio->pagev[i]->sblock);
552		}
553		bio_put(sbio->bio);
554	}
555
556	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
557		struct scrub_bio *sbio = sctx->bios[i];
558
559		if (!sbio)
560			break;
561		kfree(sbio);
562	}
563
564	kfree(sctx->wr_curr_bio);
565	scrub_free_csums(sctx);
566	kfree(sctx);
567}
568
569static void scrub_put_ctx(struct scrub_ctx *sctx)
570{
571	if (refcount_dec_and_test(&sctx->refs))
572		scrub_free_ctx(sctx);
573}
574
575static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
576		struct btrfs_fs_info *fs_info, int is_dev_replace)
577{
578	struct scrub_ctx *sctx;
579	int		i;
580
581	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
582	if (!sctx)
583		goto nomem;
584	refcount_set(&sctx->refs, 1);
585	sctx->is_dev_replace = is_dev_replace;
586	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
587	sctx->curr = -1;
588	sctx->fs_info = fs_info;
589	INIT_LIST_HEAD(&sctx->csum_list);
590	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
591		struct scrub_bio *sbio;
592
593		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
594		if (!sbio)
595			goto nomem;
596		sctx->bios[i] = sbio;
597
598		sbio->index = i;
599		sbio->sctx = sctx;
600		sbio->page_count = 0;
601		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
602				NULL);
603
604		if (i != SCRUB_BIOS_PER_SCTX - 1)
605			sctx->bios[i]->next_free = i + 1;
606		else
607			sctx->bios[i]->next_free = -1;
608	}
609	sctx->first_free = 0;
610	atomic_set(&sctx->bios_in_flight, 0);
611	atomic_set(&sctx->workers_pending, 0);
612	atomic_set(&sctx->cancel_req, 0);
613	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
614
615	spin_lock_init(&sctx->list_lock);
616	spin_lock_init(&sctx->stat_lock);
617	init_waitqueue_head(&sctx->list_wait);
618
619	WARN_ON(sctx->wr_curr_bio != NULL);
620	mutex_init(&sctx->wr_lock);
621	sctx->wr_curr_bio = NULL;
622	if (is_dev_replace) {
623		WARN_ON(!fs_info->dev_replace.tgtdev);
624		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
625		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
626		sctx->flush_all_writes = false;
627	}
628
629	return sctx;
630
631nomem:
632	scrub_free_ctx(sctx);
633	return ERR_PTR(-ENOMEM);
634}
635
636static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
637				     void *warn_ctx)
638{
639	u64 isize;
640	u32 nlink;
641	int ret;
642	int i;
643	unsigned nofs_flag;
644	struct extent_buffer *eb;
645	struct btrfs_inode_item *inode_item;
646	struct scrub_warning *swarn = warn_ctx;
647	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
648	struct inode_fs_paths *ipath = NULL;
649	struct btrfs_root *local_root;
650	struct btrfs_key key;
651
652	local_root = btrfs_get_fs_root(fs_info, root, true);
653	if (IS_ERR(local_root)) {
654		ret = PTR_ERR(local_root);
655		goto err;
656	}
657
658	/*
659	 * this makes the path point to (inum INODE_ITEM ioff)
660	 */
661	key.objectid = inum;
662	key.type = BTRFS_INODE_ITEM_KEY;
663	key.offset = 0;
664
665	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
666	if (ret) {
667		btrfs_put_root(local_root);
668		btrfs_release_path(swarn->path);
669		goto err;
670	}
671
672	eb = swarn->path->nodes[0];
673	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
674					struct btrfs_inode_item);
675	isize = btrfs_inode_size(eb, inode_item);
676	nlink = btrfs_inode_nlink(eb, inode_item);
677	btrfs_release_path(swarn->path);
678
679	/*
680	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
681	 * uses GFP_NOFS in this context, so we keep it consistent but it does
682	 * not seem to be strictly necessary.
683	 */
684	nofs_flag = memalloc_nofs_save();
685	ipath = init_ipath(4096, local_root, swarn->path);
686	memalloc_nofs_restore(nofs_flag);
687	if (IS_ERR(ipath)) {
688		btrfs_put_root(local_root);
689		ret = PTR_ERR(ipath);
690		ipath = NULL;
691		goto err;
692	}
693	ret = paths_from_inode(inum, ipath);
694
695	if (ret < 0)
696		goto err;
697
698	/*
699	 * We deliberately ignore the fact that ipath might have been too small
700	 * to hold all of the paths here.
701	 */
702	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
703		btrfs_warn_in_rcu(fs_info,
704"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
705				  swarn->errstr, swarn->logical,
706				  rcu_str_deref(swarn->dev->name),
707				  swarn->physical,
708				  root, inum, offset,
709				  min(isize - offset, (u64)PAGE_SIZE), nlink,
710				  (char *)(unsigned long)ipath->fspath->val[i]);
711
712	btrfs_put_root(local_root);
713	free_ipath(ipath);
714	return 0;
715
716err:
717	btrfs_warn_in_rcu(fs_info,
718			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
719			  swarn->errstr, swarn->logical,
720			  rcu_str_deref(swarn->dev->name),
721			  swarn->physical,
722			  root, inum, offset, ret);
723
724	free_ipath(ipath);
725	return 0;
726}
727
728static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
729{
730	struct btrfs_device *dev;
731	struct btrfs_fs_info *fs_info;
732	struct btrfs_path *path;
733	struct btrfs_key found_key;
734	struct extent_buffer *eb;
735	struct btrfs_extent_item *ei;
736	struct scrub_warning swarn;
737	unsigned long ptr = 0;
738	u64 extent_item_pos;
739	u64 flags = 0;
740	u64 ref_root;
741	u32 item_size;
742	u8 ref_level = 0;
743	int ret;
744
745	WARN_ON(sblock->page_count < 1);
746	dev = sblock->pagev[0]->dev;
747	fs_info = sblock->sctx->fs_info;
748
749	path = btrfs_alloc_path();
750	if (!path)
751		return;
752
753	swarn.physical = sblock->pagev[0]->physical;
754	swarn.logical = sblock->pagev[0]->logical;
755	swarn.errstr = errstr;
756	swarn.dev = NULL;
757
758	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
759				  &flags);
760	if (ret < 0)
761		goto out;
762
763	extent_item_pos = swarn.logical - found_key.objectid;
764	swarn.extent_item_size = found_key.offset;
765
766	eb = path->nodes[0];
767	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
768	item_size = btrfs_item_size_nr(eb, path->slots[0]);
769
770	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
771		do {
772			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
773						      item_size, &ref_root,
774						      &ref_level);
775			btrfs_warn_in_rcu(fs_info,
776"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
777				errstr, swarn.logical,
778				rcu_str_deref(dev->name),
779				swarn.physical,
780				ref_level ? "node" : "leaf",
781				ret < 0 ? -1 : ref_level,
782				ret < 0 ? -1 : ref_root);
783		} while (ret != 1);
784		btrfs_release_path(path);
785	} else {
786		btrfs_release_path(path);
787		swarn.path = path;
788		swarn.dev = dev;
789		iterate_extent_inodes(fs_info, found_key.objectid,
790					extent_item_pos, 1,
791					scrub_print_warning_inode, &swarn, false);
792	}
793
794out:
795	btrfs_free_path(path);
796}
797
798static inline void scrub_get_recover(struct scrub_recover *recover)
799{
800	refcount_inc(&recover->refs);
801}
802
803static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
804				     struct scrub_recover *recover)
805{
806	if (refcount_dec_and_test(&recover->refs)) {
807		btrfs_bio_counter_dec(fs_info);
808		btrfs_put_bbio(recover->bbio);
809		kfree(recover);
810	}
811}
812
813/*
814 * scrub_handle_errored_block gets called when either verification of the
815 * pages failed or the bio failed to read, e.g. with EIO. In the latter
816 * case, this function handles all pages in the bio, even though only one
817 * may be bad.
818 * The goal of this function is to repair the errored block by using the
819 * contents of one of the mirrors.
820 */
821static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
822{
823	struct scrub_ctx *sctx = sblock_to_check->sctx;
824	struct btrfs_device *dev;
825	struct btrfs_fs_info *fs_info;
826	u64 logical;
827	unsigned int failed_mirror_index;
828	unsigned int is_metadata;
829	unsigned int have_csum;
830	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
831	struct scrub_block *sblock_bad;
832	int ret;
833	int mirror_index;
834	int page_num;
835	int success;
836	bool full_stripe_locked;
837	unsigned int nofs_flag;
838	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
839				      DEFAULT_RATELIMIT_BURST);
840
841	BUG_ON(sblock_to_check->page_count < 1);
842	fs_info = sctx->fs_info;
843	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
844		/*
845		 * If we find an error in a super block, we just report it.
846		 * Super blocks will get rewritten with the next transaction
847		 * commit anyway.
848		 */
849		spin_lock(&sctx->stat_lock);
850		++sctx->stat.super_errors;
851		spin_unlock(&sctx->stat_lock);
852		return 0;
853	}
854	logical = sblock_to_check->pagev[0]->logical;
855	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
856	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
857	is_metadata = !(sblock_to_check->pagev[0]->flags &
858			BTRFS_EXTENT_FLAG_DATA);
859	have_csum = sblock_to_check->pagev[0]->have_csum;
860	dev = sblock_to_check->pagev[0]->dev;
861
862	/*
863	 * We must use GFP_NOFS because the scrub task might be waiting for a
864	 * worker task executing this function and in turn a transaction commit
865	 * might be waiting for the scrub task to pause (which needs to wait for
866	 * all the worker tasks to complete before pausing).
867	 * We do allocations in the workers through insert_full_stripe_lock()
868	 * and scrub_add_page_to_wr_bio(), which happen down the call chain of
869	 * this function.
870	 */
871	nofs_flag = memalloc_nofs_save();
872	/*
873	 * For RAID5/6, a race can happen with a scrub thread on a different
874	 * device. On data corruption, the parity and data threads will both
875	 * try to recover the data.
876	 * The race can lead to a doubly added csum error, or even an
877	 * unrecoverable error.
878	 */
879	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
880	if (ret < 0) {
881		memalloc_nofs_restore(nofs_flag);
882		spin_lock(&sctx->stat_lock);
883		if (ret == -ENOMEM)
884			sctx->stat.malloc_errors++;
885		sctx->stat.read_errors++;
886		sctx->stat.uncorrectable_errors++;
887		spin_unlock(&sctx->stat_lock);
888		return ret;
889	}
890
891	/*
892	 * Read all mirrors one after the other. This includes re-reading
893	 * the extent or metadata block that failed (that was
894	 * the cause that this fixup code is called) another time,
895	 * page by page this time in order to know which pages
896	 * caused I/O errors and which ones are good (for all mirrors).
897	 * It is the goal to handle the situation when more than one
898	 * mirror contains I/O errors, but the errors do not
899	 * overlap, i.e. the data can be repaired by selecting the
900	 * pages from those mirrors without I/O error on the
901	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
902	 * would be that mirror #1 has an I/O error on the first page,
903	 * the second page is good, and mirror #2 has an I/O error on
904	 * the second page, but the first page is good.
905	 * Then the first page of the first mirror can be repaired by
906	 * taking the first page of the second mirror, and the
907	 * second page of the second mirror can be repaired by
908	 * copying the contents of the 2nd page of the 1st mirror.
909	 * One more note: if the pages of one mirror contain I/O
910	 * errors, the checksum cannot be verified. In order to get
911	 * the best data for repairing, the first attempt is to find
912	 * a mirror without I/O errors and with a validated checksum.
913	 * Only if this is not possible, the pages are picked from
914	 * mirrors with I/O errors without considering the checksum.
915	 * If the latter is the case, at the end, the checksum of the
916	 * repaired area is verified in order to correctly maintain
917	 * the statistics.
918	 */
919
920	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
921				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
922	if (!sblocks_for_recheck) {
923		spin_lock(&sctx->stat_lock);
924		sctx->stat.malloc_errors++;
925		sctx->stat.read_errors++;
926		sctx->stat.uncorrectable_errors++;
927		spin_unlock(&sctx->stat_lock);
928		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
929		goto out;
930	}
931
932	/* setup the context, map the logical blocks and alloc the pages */
933	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
934	if (ret) {
935		spin_lock(&sctx->stat_lock);
936		sctx->stat.read_errors++;
937		sctx->stat.uncorrectable_errors++;
938		spin_unlock(&sctx->stat_lock);
939		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
940		goto out;
941	}
942	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
943	sblock_bad = sblocks_for_recheck + failed_mirror_index;
944
945	/* build and submit the bios for the failed mirror, check checksums */
946	scrub_recheck_block(fs_info, sblock_bad, 1);
947
948	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
949	    sblock_bad->no_io_error_seen) {
950		/*
951		 * the error disappeared after reading page by page, or
952		 * the area was part of a huge bio and other parts of the
953		 * bio caused I/O errors, or the block layer merged several
954		 * read requests into one and the error is caused by a
955		 * different bio (usually one of the two latter cases is
956		 * the cause)
957		 */
958		spin_lock(&sctx->stat_lock);
959		sctx->stat.unverified_errors++;
960		sblock_to_check->data_corrected = 1;
961		spin_unlock(&sctx->stat_lock);
962
963		if (sctx->is_dev_replace)
964			scrub_write_block_to_dev_replace(sblock_bad);
965		goto out;
966	}
967
968	if (!sblock_bad->no_io_error_seen) {
969		spin_lock(&sctx->stat_lock);
970		sctx->stat.read_errors++;
971		spin_unlock(&sctx->stat_lock);
972		if (__ratelimit(&rs))
973			scrub_print_warning("i/o error", sblock_to_check);
974		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
975	} else if (sblock_bad->checksum_error) {
976		spin_lock(&sctx->stat_lock);
977		sctx->stat.csum_errors++;
978		spin_unlock(&sctx->stat_lock);
979		if (__ratelimit(&rs))
980			scrub_print_warning("checksum error", sblock_to_check);
981		btrfs_dev_stat_inc_and_print(dev,
982					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
983	} else if (sblock_bad->header_error) {
984		spin_lock(&sctx->stat_lock);
985		sctx->stat.verify_errors++;
986		spin_unlock(&sctx->stat_lock);
987		if (__ratelimit(&rs))
988			scrub_print_warning("checksum/header error",
989					    sblock_to_check);
990		if (sblock_bad->generation_error)
991			btrfs_dev_stat_inc_and_print(dev,
992				BTRFS_DEV_STAT_GENERATION_ERRS);
993		else
994			btrfs_dev_stat_inc_and_print(dev,
995				BTRFS_DEV_STAT_CORRUPTION_ERRS);
996	}
997
998	if (sctx->readonly) {
999		ASSERT(!sctx->is_dev_replace);
1000		goto out;
1001	}
1002
1003	/*
1004	 * now build and submit the bios for the other mirrors, check
1005	 * checksums.
1006	 * First try to pick the mirror which is completely without I/O
1007	 * errors and also does not have a checksum error.
1008	 * If one is found, and if a checksum is present, the full block
1009	 * that is known to contain an error is rewritten. Afterwards
1010	 * the block is known to be corrected.
1011	 * If a mirror is found which is completely correct, and no
1012	 * checksum is present, only those pages are rewritten that had
1013	 * an I/O error in the block to be repaired, since it cannot be
1014	 * determined which copy of the other pages is better (and it
1015	 * could happen otherwise that a correct page would be
1016	 * overwritten by a bad one).
1017	 */
1018	for (mirror_index = 0; ;mirror_index++) {
1019		struct scrub_block *sblock_other;
1020
1021		if (mirror_index == failed_mirror_index)
1022			continue;
1023
1024		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1025		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1026			if (mirror_index >= BTRFS_MAX_MIRRORS)
1027				break;
1028			if (!sblocks_for_recheck[mirror_index].page_count)
1029				break;
1030
1031			sblock_other = sblocks_for_recheck + mirror_index;
1032		} else {
1033			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1034			int max_allowed = r->bbio->num_stripes -
1035						r->bbio->num_tgtdevs;
1036
1037			if (mirror_index >= max_allowed)
1038				break;
1039			if (!sblocks_for_recheck[1].page_count)
1040				break;
1041
1042			ASSERT(failed_mirror_index == 0);
1043			sblock_other = sblocks_for_recheck + 1;
1044			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1045		}
1046
1047		/* build and submit the bios, check checksums */
1048		scrub_recheck_block(fs_info, sblock_other, 0);
1049
1050		if (!sblock_other->header_error &&
1051		    !sblock_other->checksum_error &&
1052		    sblock_other->no_io_error_seen) {
1053			if (sctx->is_dev_replace) {
1054				scrub_write_block_to_dev_replace(sblock_other);
1055				goto corrected_error;
1056			} else {
1057				ret = scrub_repair_block_from_good_copy(
1058						sblock_bad, sblock_other);
1059				if (!ret)
1060					goto corrected_error;
1061			}
1062		}
1063	}
1064
1065	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1066		goto did_not_correct_error;
1067
1068	/*
1069	 * In case of I/O errors in the area that is supposed to be
1070	 * repaired, continue by picking good copies of those pages.
1071	 * Select the good pages from mirrors to rewrite bad pages from
1072	 * the area to fix. Afterwards verify the checksum of the block
1073	 * that is supposed to be repaired. This verification step is
1074	 * only done for the purpose of statistics counting and for the
1075	 * final scrub report on whether errors remain.
1076	 * A perfect algorithm could make use of the checksum and try
1077	 * all possible combinations of pages from the different mirrors
1078	 * until the checksum verification succeeds. For example, when
1079	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1080	 * of mirror #2 is readable but the final checksum test fails,
1081	 * then the 2nd page of mirror #3 could be tried, whether now
1082	 * then the 2nd page of mirror #3 could be tried, to see whether
1083	 * the final checksum then succeeds. But this would be a rare
1084	 * exception and is therefore not implemented. At least it is
1085	 * ensured that the good copy is not overwritten.
1086	 * A more useful improvement would be to pick the sectors
1087	 * without I/O error based on sector sizes (512 bytes on legacy
1088	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1089	 * mirror could be repaired by taking 512 bytes of a different
1090	 * area are unreadable.
1091	 */
1092	success = 1;
1093	for (page_num = 0; page_num < sblock_bad->page_count;
1094	     page_num++) {
1095		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1096		struct scrub_block *sblock_other = NULL;
1097
1098		/* skip no-io-error page in scrub */
1099		if (!page_bad->io_error && !sctx->is_dev_replace)
1100			continue;
1101
1102		if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1103			/*
1104			 * In case of dev replace, if the raid56 rebuild process
1105			 * did not produce correct data, then copy the content
1106			 * of sblock_bad to make sure the target device is identical
1107			 * to the source device, instead of writing garbage data from
1108			 * the sblock_for_recheck array to the target device.
1109			 */
1110			sblock_other = NULL;
1111		} else if (page_bad->io_error) {
1112			/* try to find no-io-error page in mirrors */
1113			for (mirror_index = 0;
1114			     mirror_index < BTRFS_MAX_MIRRORS &&
1115			     sblocks_for_recheck[mirror_index].page_count > 0;
1116			     mirror_index++) {
1117				if (!sblocks_for_recheck[mirror_index].
1118				    pagev[page_num]->io_error) {
1119					sblock_other = sblocks_for_recheck +
1120						       mirror_index;
1121					break;
1122				}
1123			}
1124			if (!sblock_other)
1125				success = 0;
1126		}
1127
1128		if (sctx->is_dev_replace) {
1129			/*
1130			 * did not find a mirror to fetch the page
1131			 * from. scrub_write_page_to_dev_replace()
1132			 * handles this case (page->io_error), by
1133			 * filling the block with zeros before
1134			 * submitting the write request
1135			 */
1136			if (!sblock_other)
1137				sblock_other = sblock_bad;
1138
1139			if (scrub_write_page_to_dev_replace(sblock_other,
1140							    page_num) != 0) {
1141				atomic64_inc(
1142					&fs_info->dev_replace.num_write_errors);
1143				success = 0;
1144			}
1145		} else if (sblock_other) {
1146			ret = scrub_repair_page_from_good_copy(sblock_bad,
1147							       sblock_other,
1148							       page_num, 0);
1149			if (0 == ret)
1150				page_bad->io_error = 0;
1151			else
1152				success = 0;
1153		}
1154	}
1155
1156	if (success && !sctx->is_dev_replace) {
1157		if (is_metadata || have_csum) {
1158			/*
1159			 * need to verify the checksum now that all
1160			 * sectors on disk are repaired (the write
1161			 * request for data to be repaired is on its way).
1162			 * Just be lazy and use scrub_recheck_block()
1163			 * which re-reads the data before the checksum
1164			 * is verified, but most likely the data comes out
1165			 * of the page cache.
1166			 */
1167			scrub_recheck_block(fs_info, sblock_bad, 1);
1168			if (!sblock_bad->header_error &&
1169			    !sblock_bad->checksum_error &&
1170			    sblock_bad->no_io_error_seen)
1171				goto corrected_error;
1172			else
1173				goto did_not_correct_error;
1174		} else {
1175corrected_error:
1176			spin_lock(&sctx->stat_lock);
1177			sctx->stat.corrected_errors++;
1178			sblock_to_check->data_corrected = 1;
1179			spin_unlock(&sctx->stat_lock);
1180			btrfs_err_rl_in_rcu(fs_info,
1181				"fixed up error at logical %llu on dev %s",
1182				logical, rcu_str_deref(dev->name));
1183		}
1184	} else {
1185did_not_correct_error:
1186		spin_lock(&sctx->stat_lock);
1187		sctx->stat.uncorrectable_errors++;
1188		spin_unlock(&sctx->stat_lock);
1189		btrfs_err_rl_in_rcu(fs_info,
1190			"unable to fixup (regular) error at logical %llu on dev %s",
1191			logical, rcu_str_deref(dev->name));
1192	}
1193
1194out:
1195	if (sblocks_for_recheck) {
1196		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1197		     mirror_index++) {
1198			struct scrub_block *sblock = sblocks_for_recheck +
1199						     mirror_index;
1200			struct scrub_recover *recover;
1201			int page_index;
1202
1203			for (page_index = 0; page_index < sblock->page_count;
1204			     page_index++) {
1205				sblock->pagev[page_index]->sblock = NULL;
1206				recover = sblock->pagev[page_index]->recover;
1207				if (recover) {
1208					scrub_put_recover(fs_info, recover);
1209					sblock->pagev[page_index]->recover =
1210									NULL;
1211				}
1212				scrub_page_put(sblock->pagev[page_index]);
1213			}
1214		}
1215		kfree(sblocks_for_recheck);
1216	}
1217
1218	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1219	memalloc_nofs_restore(nofs_flag);
1220	if (ret < 0)
1221		return ret;
1222	return 0;
1223}
1224
1225static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1226{
1227	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1228		return 2;
1229	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1230		return 3;
1231	else
1232		return (int)bbio->num_stripes;
1233}
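/*
 * The 2 and 3 above reflect the number of ways a sector can be produced on
 * RAID5/6: read directly, or rebuilt from the remaining data plus the P
 * parity (and, for RAID6, alternatively via the Q parity). For the other
 * profiles each stripe returned by the mapping represents one mirror of the
 * block, so num_stripes is used directly.
 */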
1234
1235static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1236						 u64 *raid_map,
1237						 u64 mapped_length,
1238						 int nstripes, int mirror,
1239						 int *stripe_index,
1240						 u64 *stripe_offset)
1241{
1242	int i;
1243
1244	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1245		/* RAID5/6 */
1246		for (i = 0; i < nstripes; i++) {
1247			if (raid_map[i] == RAID6_Q_STRIPE ||
1248			    raid_map[i] == RAID5_P_STRIPE)
1249				continue;
1250
1251			if (logical >= raid_map[i] &&
1252			    logical < raid_map[i] + mapped_length)
1253				break;
1254		}
1255
1256		*stripe_index = i;
1257		*stripe_offset = logical - raid_map[i];
1258	} else {
1259		/* The other RAID type */
1260		*stripe_index = mirror;
1261		*stripe_offset = 0;
1262	}
1263}
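/*
 * A short trace of the RAID56 branch above with hypothetical inputs:
 *
 *	raid_map[]    = { 1048576, 1114112, RAID5_P_STRIPE }
 *	mapped_length = 65536
 *	logical       = 1130496
 *
 * At i = 0 the range check fails (1130496 is not below 1048576 + 65536 =
 * 1114112); at i = 1 it holds (1114112 <= 1130496 < 1179648), so the loop
 * breaks with *stripe_index = 1 and *stripe_offset = 1130496 - 1114112 =
 * 16384. The P stripe at i = 2 would have been skipped outright.
 */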
1264
1265static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1266				     struct scrub_block *sblocks_for_recheck)
1267{
1268	struct scrub_ctx *sctx = original_sblock->sctx;
1269	struct btrfs_fs_info *fs_info = sctx->fs_info;
1270	u64 length = original_sblock->page_count * PAGE_SIZE;
1271	u64 logical = original_sblock->pagev[0]->logical;
1272	u64 generation = original_sblock->pagev[0]->generation;
1273	u64 flags = original_sblock->pagev[0]->flags;
1274	u64 have_csum = original_sblock->pagev[0]->have_csum;
1275	struct scrub_recover *recover;
1276	struct btrfs_bio *bbio;
1277	u64 sublen;
1278	u64 mapped_length;
1279	u64 stripe_offset;
1280	int stripe_index;
1281	int page_index = 0;
1282	int mirror_index;
1283	int nmirrors;
1284	int ret;
1285
1286	/*
1287	 * note: the two members refs and outstanding_pages
1288	 * are not used (and not set) in the blocks that are used for
1289	 * the recheck procedure
1290	 */
1291
1292	while (length > 0) {
1293		sublen = min_t(u64, length, PAGE_SIZE);
1294		mapped_length = sublen;
1295		bbio = NULL;
1296
1297		/*
1298		 * with a length of PAGE_SIZE, each returned stripe
1299		 * represents one mirror
1300		 */
1301		btrfs_bio_counter_inc_blocked(fs_info);
1302		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1303				logical, &mapped_length, &bbio);
1304		if (ret || !bbio || mapped_length < sublen) {
1305			btrfs_put_bbio(bbio);
1306			btrfs_bio_counter_dec(fs_info);
1307			return -EIO;
1308		}
1309
1310		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1311		if (!recover) {
1312			btrfs_put_bbio(bbio);
1313			btrfs_bio_counter_dec(fs_info);
1314			return -ENOMEM;
1315		}
1316
1317		refcount_set(&recover->refs, 1);
1318		recover->bbio = bbio;
1319		recover->map_length = mapped_length;
1320
1321		BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1322
1323		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1324
1325		for (mirror_index = 0; mirror_index < nmirrors;
1326		     mirror_index++) {
1327			struct scrub_block *sblock;
1328			struct scrub_page *page;
1329
1330			sblock = sblocks_for_recheck + mirror_index;
1331			sblock->sctx = sctx;
1332
1333			page = kzalloc(sizeof(*page), GFP_NOFS);
1334			if (!page) {
1335leave_nomem:
1336				spin_lock(&sctx->stat_lock);
1337				sctx->stat.malloc_errors++;
1338				spin_unlock(&sctx->stat_lock);
1339				scrub_put_recover(fs_info, recover);
1340				return -ENOMEM;
1341			}
1342			scrub_page_get(page);
1343			sblock->pagev[page_index] = page;
1344			page->sblock = sblock;
1345			page->flags = flags;
1346			page->generation = generation;
1347			page->logical = logical;
1348			page->have_csum = have_csum;
1349			if (have_csum)
1350				memcpy(page->csum,
1351				       original_sblock->pagev[0]->csum,
1352				       sctx->csum_size);
1353
1354			scrub_stripe_index_and_offset(logical,
1355						      bbio->map_type,
1356						      bbio->raid_map,
1357						      mapped_length,
1358						      bbio->num_stripes -
1359						      bbio->num_tgtdevs,
1360						      mirror_index,
1361						      &stripe_index,
1362						      &stripe_offset);
1363			page->physical = bbio->stripes[stripe_index].physical +
1364					 stripe_offset;
1365			page->dev = bbio->stripes[stripe_index].dev;
1366
1367			BUG_ON(page_index >= original_sblock->page_count);
1368			page->physical_for_dev_replace =
1369				original_sblock->pagev[page_index]->
1370				physical_for_dev_replace;
1371			/* for missing devices, dev->bdev is NULL */
1372			page->mirror_num = mirror_index + 1;
1373			sblock->page_count++;
1374			page->page = alloc_page(GFP_NOFS);
1375			if (!page->page)
1376				goto leave_nomem;
1377
1378			scrub_get_recover(recover);
1379			page->recover = recover;
1380		}
1381		scrub_put_recover(fs_info, recover);
1382		length -= sublen;
1383		logical += sublen;
1384		page_index++;
1385	}
1386
1387	return 0;
1388}
1389
1390static void scrub_bio_wait_endio(struct bio *bio)
1391{
1392	complete(bio->bi_private);
1393}
1394
1395static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1396					struct bio *bio,
1397					struct scrub_page *page)
1398{
1399	DECLARE_COMPLETION_ONSTACK(done);
1400	int ret;
1401	int mirror_num;
1402
1403	bio->bi_iter.bi_sector = page->logical >> 9;
1404	bio->bi_private = &done;
1405	bio->bi_end_io = scrub_bio_wait_endio;
1406
1407	mirror_num = page->sblock->pagev[0]->mirror_num;
1408	ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1409				    page->recover->map_length,
1410				    mirror_num, 0);
1411	if (ret)
1412		return ret;
1413
1414	wait_for_completion_io(&done);
1415	return blk_status_to_errno(bio->bi_status);
1416}
1417
1418static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1419					  struct scrub_block *sblock)
1420{
1421	struct scrub_page *first_page = sblock->pagev[0];
1422	struct bio *bio;
1423	int page_num;
1424
1425	/* All pages in sblock belong to the same stripe on the same device. */
1426	ASSERT(first_page->dev);
1427	if (!first_page->dev->bdev)
1428		goto out;
1429
1430	bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1431	bio_set_dev(bio, first_page->dev->bdev);
1432
1433	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1434		struct scrub_page *page = sblock->pagev[page_num];
1435
1436		WARN_ON(!page->page);
1437		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1438	}
1439
1440	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1441		bio_put(bio);
1442		goto out;
1443	}
1444
1445	bio_put(bio);
1446
1447	scrub_recheck_block_checksum(sblock);
1448
1449	return;
1450out:
1451	for (page_num = 0; page_num < sblock->page_count; page_num++)
1452		sblock->pagev[page_num]->io_error = 1;
1453
1454	sblock->no_io_error_seen = 0;
1455}
1456
1457/*
1458 * this function will check the on disk data for checksum errors, header
1459 * errors and read I/O errors. If any I/O errors happen, the exact pages
1460 * which are errored are marked as being bad. The goal is to enable scrub
1461 * to take those pages that are not errored from all the mirrors so that
1462 * the pages that are errored in the just handled mirror can be repaired.
1463 */
1464static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1465				struct scrub_block *sblock,
1466				int retry_failed_mirror)
1467{
1468	int page_num;
1469
1470	sblock->no_io_error_seen = 1;
1471
1472	/* short cut for raid56 */
1473	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1474		return scrub_recheck_block_on_raid56(fs_info, sblock);
1475
1476	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1477		struct bio *bio;
1478		struct scrub_page *page = sblock->pagev[page_num];
1479
1480		if (page->dev->bdev == NULL) {
1481			page->io_error = 1;
1482			sblock->no_io_error_seen = 0;
1483			continue;
1484		}
1485
1486		WARN_ON(!page->page);
1487		bio = btrfs_io_bio_alloc(1);
1488		bio_set_dev(bio, page->dev->bdev);
1489
1490		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1491		bio->bi_iter.bi_sector = page->physical >> 9;
1492		bio->bi_opf = REQ_OP_READ;
1493
1494		if (btrfsic_submit_bio_wait(bio)) {
1495			page->io_error = 1;
1496			sblock->no_io_error_seen = 0;
1497		}
1498
1499		bio_put(bio);
1500	}
1501
1502	if (sblock->no_io_error_seen)
1503		scrub_recheck_block_checksum(sblock);
1504}
1505
1506static inline int scrub_check_fsid(u8 fsid[],
1507				   struct scrub_page *spage)
1508{
1509	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1510	int ret;
1511
1512	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1513	return !ret;
1514}
1515
1516static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1517{
1518	sblock->header_error = 0;
1519	sblock->checksum_error = 0;
1520	sblock->generation_error = 0;
1521
1522	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1523		scrub_checksum_data(sblock);
1524	else
1525		scrub_checksum_tree_block(sblock);
1526}
1527
1528static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1529					     struct scrub_block *sblock_good)
1530{
1531	int page_num;
1532	int ret = 0;
1533
1534	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1535		int ret_sub;
1536
1537		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1538							   sblock_good,
1539							   page_num, 1);
1540		if (ret_sub)
1541			ret = ret_sub;
1542	}
1543
1544	return ret;
1545}
1546
1547static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1548					    struct scrub_block *sblock_good,
1549					    int page_num, int force_write)
1550{
1551	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1552	struct scrub_page *page_good = sblock_good->pagev[page_num];
1553	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1554
1555	BUG_ON(page_bad->page == NULL);
1556	BUG_ON(page_good->page == NULL);
1557	if (force_write || sblock_bad->header_error ||
1558	    sblock_bad->checksum_error || page_bad->io_error) {
1559		struct bio *bio;
1560		int ret;
1561
1562		if (!page_bad->dev->bdev) {
1563			btrfs_warn_rl(fs_info,
1564				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1565			return -EIO;
1566		}
1567
1568		bio = btrfs_io_bio_alloc(1);
1569		bio_set_dev(bio, page_bad->dev->bdev);
1570		bio->bi_iter.bi_sector = page_bad->physical >> 9;
1571		bio->bi_opf = REQ_OP_WRITE;
1572
1573		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1574		if (PAGE_SIZE != ret) {
1575			bio_put(bio);
1576			return -EIO;
1577		}
1578
1579		if (btrfsic_submit_bio_wait(bio)) {
1580			btrfs_dev_stat_inc_and_print(page_bad->dev,
1581				BTRFS_DEV_STAT_WRITE_ERRS);
1582			atomic64_inc(&fs_info->dev_replace.num_write_errors);
1583			bio_put(bio);
1584			return -EIO;
1585		}
1586		bio_put(bio);
1587	}
1588
1589	return 0;
1590}
1591
1592static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1593{
1594	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1595	int page_num;
1596
1597	/*
1598	 * This block is used for checking the parity on the source device,
1599	 * so the data need not be written to the destination device.
1600	 */
1601	if (sblock->sparity)
1602		return;
1603
1604	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1605		int ret;
1606
1607		ret = scrub_write_page_to_dev_replace(sblock, page_num);
1608		if (ret)
1609			atomic64_inc(&fs_info->dev_replace.num_write_errors);
1610	}
1611}
1612
1613static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1614					   int page_num)
1615{
1616	struct scrub_page *spage = sblock->pagev[page_num];
1617
1618	BUG_ON(spage->page == NULL);
1619	if (spage->io_error)
1620		clear_page(page_address(spage->page));
1621
1622	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1623}
1624
1625static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1626				    struct scrub_page *spage)
1627{
1628	struct scrub_bio *sbio;
1629	int ret;
1630
1631	mutex_lock(&sctx->wr_lock);
1632again:
1633	if (!sctx->wr_curr_bio) {
1634		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1635					      GFP_KERNEL);
1636		if (!sctx->wr_curr_bio) {
1637			mutex_unlock(&sctx->wr_lock);
1638			return -ENOMEM;
1639		}
1640		sctx->wr_curr_bio->sctx = sctx;
1641		sctx->wr_curr_bio->page_count = 0;
1642	}
1643	sbio = sctx->wr_curr_bio;
1644	if (sbio->page_count == 0) {
1645		struct bio *bio;
1646
1647		sbio->physical = spage->physical_for_dev_replace;
1648		sbio->logical = spage->logical;
1649		sbio->dev = sctx->wr_tgtdev;
1650		bio = sbio->bio;
1651		if (!bio) {
1652			bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1653			sbio->bio = bio;
1654		}
1655
1656		bio->bi_private = sbio;
1657		bio->bi_end_io = scrub_wr_bio_end_io;
1658		bio_set_dev(bio, sbio->dev->bdev);
1659		bio->bi_iter.bi_sector = sbio->physical >> 9;
1660		bio->bi_opf = REQ_OP_WRITE;
1661		sbio->status = 0;
1662	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1663		   spage->physical_for_dev_replace ||
1664		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1665		   spage->logical) {
1666		scrub_wr_submit(sctx);
1667		goto again;
1668	}
1669
1670	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1671	if (ret != PAGE_SIZE) {
1672		if (sbio->page_count < 1) {
1673			bio_put(sbio->bio);
1674			sbio->bio = NULL;
1675			mutex_unlock(&sctx->wr_lock);
1676			return -EIO;
1677		}
1678		scrub_wr_submit(sctx);
1679		goto again;
1680	}
1681
1682	sbio->pagev[sbio->page_count] = spage;
1683	scrub_page_get(spage);
1684	sbio->page_count++;
1685	if (sbio->page_count == sctx->pages_per_wr_bio)
1686		scrub_wr_submit(sctx);
1687	mutex_unlock(&sctx->wr_lock);
1688
1689	return 0;
1690}
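/*
 * Note on the "again:" path above: pages are appended to wr_curr_bio only
 * while they remain physically and logically contiguous with what the bio
 * already holds. Any discontinuity, or a failed bio_add_page() on a
 * non-empty bio, triggers scrub_wr_submit() and the page is retried on a
 * fresh bio; a bio that reaches pages_per_wr_bio pages is submitted right
 * after its last page is added.
 */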
1691
1692static void scrub_wr_submit(struct scrub_ctx *sctx)
1693{
1694	struct scrub_bio *sbio;
1695
1696	if (!sctx->wr_curr_bio)
1697		return;
1698
1699	sbio = sctx->wr_curr_bio;
1700	sctx->wr_curr_bio = NULL;
1701	WARN_ON(!sbio->bio->bi_disk);
1702	scrub_pending_bio_inc(sctx);
1703	/* Process all writes in a single worker thread. Then the block layer
1704	 * orders the requests before sending them to the driver, which
1705	 * doubled the write performance on spinning disks when measured
1706	 * with Linux 3.5. */
1707	btrfsic_submit_bio(sbio->bio);
1708}
1709
1710static void scrub_wr_bio_end_io(struct bio *bio)
1711{
1712	struct scrub_bio *sbio = bio->bi_private;
1713	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1714
1715	sbio->status = bio->bi_status;
1716	sbio->bio = bio;
1717
1718	btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1719	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1720}
1721
1722static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1723{
1724	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1725	struct scrub_ctx *sctx = sbio->sctx;
1726	int i;
1727
1728	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1729	if (sbio->status) {
1730		struct btrfs_dev_replace *dev_replace =
1731			&sbio->sctx->fs_info->dev_replace;
1732
1733		for (i = 0; i < sbio->page_count; i++) {
1734			struct scrub_page *spage = sbio->pagev[i];
1735
1736			spage->io_error = 1;
1737			atomic64_inc(&dev_replace->num_write_errors);
1738		}
1739	}
1740
1741	for (i = 0; i < sbio->page_count; i++)
1742		scrub_page_put(sbio->pagev[i]);
1743
1744	bio_put(sbio->bio);
1745	kfree(sbio);
1746	scrub_pending_bio_dec(sctx);
1747}
1748
1749static int scrub_checksum(struct scrub_block *sblock)
1750{
1751	u64 flags;
1752	int ret;
1753
1754	/*
1755	 * No need to initialize these stats currently,
1756	 * because this function only uses the return value
1757	 * instead of these stat values.
1758	 *
1759	 * Todo:
1760	 * always use stats
1761	 */
1762	sblock->header_error = 0;
1763	sblock->generation_error = 0;
1764	sblock->checksum_error = 0;
1765
1766	WARN_ON(sblock->page_count < 1);
1767	flags = sblock->pagev[0]->flags;
1768	ret = 0;
1769	if (flags & BTRFS_EXTENT_FLAG_DATA)
1770		ret = scrub_checksum_data(sblock);
1771	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1772		ret = scrub_checksum_tree_block(sblock);
1773	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1774		(void)scrub_checksum_super(sblock);
1775	else
1776		WARN_ON(1);
1777	if (ret)
1778		scrub_handle_errored_block(sblock);
1779
1780	return ret;
1781}
1782
1783static int scrub_checksum_data(struct scrub_block *sblock)
1784{
1785	struct scrub_ctx *sctx = sblock->sctx;
1786	struct btrfs_fs_info *fs_info = sctx->fs_info;
1787	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1788	u8 csum[BTRFS_CSUM_SIZE];
1789	struct scrub_page *spage;
1790	char *kaddr;
1791
1792	BUG_ON(sblock->page_count < 1);
1793	spage = sblock->pagev[0];
1794	if (!spage->have_csum)
1795		return 0;
1796
1797	kaddr = page_address(spage->page);
1798
1799	shash->tfm = fs_info->csum_shash;
1800	crypto_shash_init(shash);
1801	crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
1802
1803	if (memcmp(csum, spage->csum, sctx->csum_size))
1804		sblock->checksum_error = 1;
1805
1806	return sblock->checksum_error;
1807}
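/*
 * sctx->csum_size used above comes from btrfs_super_csum_size() (see
 * scrub_setup_ctx()) and matches the filesystem's checksum type, e.g.
 * 4 bytes for crc32c or 32 bytes for sha256, which is why the memcmp()
 * compares csum_size bytes rather than the full BTRFS_CSUM_SIZE buffer.
 */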
1808
1809static int scrub_checksum_tree_block(struct scrub_block *sblock)
1810{
1811	struct scrub_ctx *sctx = sblock->sctx;
1812	struct btrfs_header *h;
1813	struct btrfs_fs_info *fs_info = sctx->fs_info;
1814	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1815	u8 calculated_csum[BTRFS_CSUM_SIZE];
1816	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1817	const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
1818	int i;
1819	struct scrub_page *spage;
1820	char *kaddr;
1821
1822	BUG_ON(sblock->page_count < 1);
1823	spage = sblock->pagev[0];
1824	kaddr = page_address(spage->page);
1825	h = (struct btrfs_header *)kaddr;
1826	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1827
1828	/*
1829	 * we don't use the getter functions here, as we
1830	 * a) don't have an extent buffer and
1831	 * b) the page is already kmapped
1832	 */
1833	if (spage->logical != btrfs_stack_header_bytenr(h))
1834		sblock->header_error = 1;
1835
1836	if (spage->generation != btrfs_stack_header_generation(h)) {
1837		sblock->header_error = 1;
1838		sblock->generation_error = 1;
1839	}
1840
1841	if (!scrub_check_fsid(h->fsid, spage))
1842		sblock->header_error = 1;
1843
1844	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1845		   BTRFS_UUID_SIZE))
1846		sblock->header_error = 1;
1847
1848	shash->tfm = fs_info->csum_shash;
1849	crypto_shash_init(shash);
1850	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1851			    PAGE_SIZE - BTRFS_CSUM_SIZE);
1852
1853	for (i = 1; i < num_pages; i++) {
1854		kaddr = page_address(sblock->pagev[i]->page);
1855		crypto_shash_update(shash, kaddr, PAGE_SIZE);
1856	}
1857
1858	crypto_shash_final(shash, calculated_csum);
1859	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1860		sblock->checksum_error = 1;
1861
1862	return sblock->header_error || sblock->checksum_error;
1863}
1864
1865static int scrub_checksum_super(struct scrub_block *sblock)
1866{
1867	struct btrfs_super_block *s;
1868	struct scrub_ctx *sctx = sblock->sctx;
1869	struct btrfs_fs_info *fs_info = sctx->fs_info;
1870	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1871	u8 calculated_csum[BTRFS_CSUM_SIZE];
1872	struct scrub_page *spage;
1873	char *kaddr;
1874	int fail_gen = 0;
1875	int fail_cor = 0;
1876
1877	BUG_ON(sblock->page_count < 1);
1878	spage = sblock->pagev[0];
1879	kaddr = page_address(spage->page);
1880	s = (struct btrfs_super_block *)kaddr;
1881
1882	if (spage->logical != btrfs_super_bytenr(s))
1883		++fail_cor;
1884
1885	if (spage->generation != btrfs_super_generation(s))
1886		++fail_gen;
1887
1888	if (!scrub_check_fsid(s->fsid, spage))
1889		++fail_cor;
1890
1891	shash->tfm = fs_info->csum_shash;
1892	crypto_shash_init(shash);
1893	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1894			BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1895
1896	if (memcmp(calculated_csum, s->csum, sctx->csum_size))
1897		++fail_cor;
1898
1899	if (fail_cor + fail_gen) {
1900		/*
1901		 * If we find an error in a super block, we just report it.
1902		 * The super blocks will get rewritten with the next
1903		 * transaction commit anyway.
1904		 */
1905		spin_lock(&sctx->stat_lock);
1906		++sctx->stat.super_errors;
1907		spin_unlock(&sctx->stat_lock);
1908		if (fail_cor)
1909			btrfs_dev_stat_inc_and_print(spage->dev,
1910				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1911		else
1912			btrfs_dev_stat_inc_and_print(spage->dev,
1913				BTRFS_DEV_STAT_GENERATION_ERRS);
1914	}
1915
1916	return fail_cor + fail_gen;
1917}
1918
1919static void scrub_block_get(struct scrub_block *sblock)
1920{
1921	refcount_inc(&sblock->refs);
1922}
1923
1924static void scrub_block_put(struct scrub_block *sblock)
1925{
1926	if (refcount_dec_and_test(&sblock->refs)) {
1927		int i;
1928
1929		if (sblock->sparity)
1930			scrub_parity_put(sblock->sparity);
1931
1932		for (i = 0; i < sblock->page_count; i++)
1933			scrub_page_put(sblock->pagev[i]);
1934		kfree(sblock);
1935	}
1936}
1937
1938static void scrub_page_get(struct scrub_page *spage)
1939{
1940	atomic_inc(&spage->refs);
1941}
1942
1943static void scrub_page_put(struct scrub_page *spage)
1944{
1945	if (atomic_dec_and_test(&spage->refs)) {
1946		if (spage->page)
1947			__free_page(spage->page);
1948		kfree(spage);
1949	}
1950}
1951
1952static void scrub_submit(struct scrub_ctx *sctx)
1953{
1954	struct scrub_bio *sbio;
1955
1956	if (sctx->curr == -1)
1957		return;
1958
1959	sbio = sctx->bios[sctx->curr];
1960	sctx->curr = -1;
1961	scrub_pending_bio_inc(sctx);
1962	btrfsic_submit_bio(sbio->bio);
1963}
1964
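/*
 * Append one page to the current read bio. A free scrub_bio is taken from
 * (or waited for on) the per-context list when none is in use, and the
 * current bio is submitted once it is full or when the new page is not
 * physically/logically contiguous with it or belongs to a different device.
 */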
1965static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1966				    struct scrub_page *spage)
1967{
1968	struct scrub_block *sblock = spage->sblock;
1969	struct scrub_bio *sbio;
1970	int ret;
1971
1972again:
1973	/*
1974	 * grab a fresh bio or wait for one to become available
1975	 */
1976	while (sctx->curr == -1) {
1977		spin_lock(&sctx->list_lock);
1978		sctx->curr = sctx->first_free;
1979		if (sctx->curr != -1) {
1980			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1981			sctx->bios[sctx->curr]->next_free = -1;
1982			sctx->bios[sctx->curr]->page_count = 0;
1983			spin_unlock(&sctx->list_lock);
1984		} else {
1985			spin_unlock(&sctx->list_lock);
1986			wait_event(sctx->list_wait, sctx->first_free != -1);
1987		}
1988	}
1989	sbio = sctx->bios[sctx->curr];
1990	if (sbio->page_count == 0) {
1991		struct bio *bio;
1992
1993		sbio->physical = spage->physical;
1994		sbio->logical = spage->logical;
1995		sbio->dev = spage->dev;
1996		bio = sbio->bio;
1997		if (!bio) {
1998			bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
1999			sbio->bio = bio;
2000		}
2001
2002		bio->bi_private = sbio;
2003		bio->bi_end_io = scrub_bio_end_io;
2004		bio_set_dev(bio, sbio->dev->bdev);
2005		bio->bi_iter.bi_sector = sbio->physical >> 9;
2006		bio->bi_opf = REQ_OP_READ;
2007		sbio->status = 0;
2008	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2009		   spage->physical ||
2010		   sbio->logical + sbio->page_count * PAGE_SIZE !=
2011		   spage->logical ||
2012		   sbio->dev != spage->dev) {
2013		scrub_submit(sctx);
2014		goto again;
2015	}
2016
2017	sbio->pagev[sbio->page_count] = spage;
2018	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2019	if (ret != PAGE_SIZE) {
2020		if (sbio->page_count < 1) {
2021			bio_put(sbio->bio);
2022			sbio->bio = NULL;
2023			return -EIO;
2024		}
2025		scrub_submit(sctx);
2026		goto again;
2027	}
2028
2029	scrub_block_get(sblock); /* one for the page added to the bio */
2030	atomic_inc(&sblock->outstanding_pages);
2031	sbio->page_count++;
2032	if (sbio->page_count == sctx->pages_per_rd_bio)
2033		scrub_submit(sctx);
2034
2035	return 0;
2036}
2037
2038static void scrub_missing_raid56_end_io(struct bio *bio)
2039{
2040	struct scrub_block *sblock = bio->bi_private;
2041	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2042
2043	if (bio->bi_status)
2044		sblock->no_io_error_seen = 0;
2045
2046	bio_put(bio);
2047
2048	btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2049}
2050
2051static void scrub_missing_raid56_worker(struct btrfs_work *work)
2052{
2053	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2054	struct scrub_ctx *sctx = sblock->sctx;
2055	struct btrfs_fs_info *fs_info = sctx->fs_info;
2056	u64 logical;
2057	struct btrfs_device *dev;
2058
2059	logical = sblock->pagev[0]->logical;
2060	dev = sblock->pagev[0]->dev;
2061
2062	if (sblock->no_io_error_seen)
2063		scrub_recheck_block_checksum(sblock);
2064
2065	if (!sblock->no_io_error_seen) {
2066		spin_lock(&sctx->stat_lock);
2067		sctx->stat.read_errors++;
2068		spin_unlock(&sctx->stat_lock);
2069		btrfs_err_rl_in_rcu(fs_info,
2070			"IO error rebuilding logical %llu for dev %s",
2071			logical, rcu_str_deref(dev->name));
2072	} else if (sblock->header_error || sblock->checksum_error) {
2073		spin_lock(&sctx->stat_lock);
2074		sctx->stat.uncorrectable_errors++;
2075		spin_unlock(&sctx->stat_lock);
2076		btrfs_err_rl_in_rcu(fs_info,
2077			"failed to rebuild valid logical %llu for dev %s",
2078			logical, rcu_str_deref(dev->name));
2079	} else {
2080		scrub_write_block_to_dev_replace(sblock);
2081	}
2082
2083	if (sctx->is_dev_replace && sctx->flush_all_writes) {
2084		mutex_lock(&sctx->wr_lock);
2085		scrub_wr_submit(sctx);
2086		mutex_unlock(&sctx->wr_lock);
2087	}
2088
2089	scrub_block_put(sblock);
2090	scrub_pending_bio_dec(sctx);
2091}
2092
2093static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2094{
2095	struct scrub_ctx *sctx = sblock->sctx;
2096	struct btrfs_fs_info *fs_info = sctx->fs_info;
2097	u64 length = sblock->page_count * PAGE_SIZE;
2098	u64 logical = sblock->pagev[0]->logical;
2099	struct btrfs_bio *bbio = NULL;
2100	struct bio *bio;
2101	struct btrfs_raid_bio *rbio;
2102	int ret;
2103	int i;
2104
2105	btrfs_bio_counter_inc_blocked(fs_info);
2106	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2107			&length, &bbio);
2108	if (ret || !bbio || !bbio->raid_map)
2109		goto bbio_out;
2110
2111	if (WARN_ON(!sctx->is_dev_replace ||
2112		    !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2113		/*
2114		 * We shouldn't be scrubbing a missing device. Even for dev
2115		 * replace, we should only get here for RAID 5/6. We either
2116		 * managed to mount something with no mirrors remaining or
2117		 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2118		 */
2119		goto bbio_out;
2120	}
2121
2122	bio = btrfs_io_bio_alloc(0);
2123	bio->bi_iter.bi_sector = logical >> 9;
2124	bio->bi_private = sblock;
2125	bio->bi_end_io = scrub_missing_raid56_end_io;
2126
2127	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2128	if (!rbio)
2129		goto rbio_out;
2130
2131	for (i = 0; i < sblock->page_count; i++) {
2132		struct scrub_page *spage = sblock->pagev[i];
2133
2134		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2135	}
2136
2137	btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2138	scrub_block_get(sblock);
2139	scrub_pending_bio_inc(sctx);
2140	raid56_submit_missing_rbio(rbio);
2141	return;
2142
2143rbio_out:
2144	bio_put(bio);
2145bbio_out:
2146	btrfs_bio_counter_dec(fs_info);
2147	btrfs_put_bbio(bbio);
2148	spin_lock(&sctx->stat_lock);
2149	sctx->stat.malloc_errors++;
2150	spin_unlock(&sctx->stat_lock);
2151}
2152
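/*
 * Split the range [logical, logical + len) into PAGE_SIZE pieces, collect
 * them in a newly allocated scrub_block and queue each page for reading via
 * scrub_add_page_to_rd_bio(). Blocks on a missing device are handed to the
 * RAID56 code instead, which can only happen for RAID 5/6 dev-replace.
 */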
2153static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2154		       u64 physical, struct btrfs_device *dev, u64 flags,
2155		       u64 gen, int mirror_num, u8 *csum, int force,
2156		       u64 physical_for_dev_replace)
2157{
2158	struct scrub_block *sblock;
2159	int index;
2160
2161	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2162	if (!sblock) {
2163		spin_lock(&sctx->stat_lock);
2164		sctx->stat.malloc_errors++;
2165		spin_unlock(&sctx->stat_lock);
2166		return -ENOMEM;
2167	}
2168
2169	/* one ref inside this function, plus one for each page added to
2170	 * a bio later on */
2171	refcount_set(&sblock->refs, 1);
2172	sblock->sctx = sctx;
2173	sblock->no_io_error_seen = 1;
2174
2175	for (index = 0; len > 0; index++) {
2176		struct scrub_page *spage;
2177		u64 l = min_t(u64, len, PAGE_SIZE);
2178
2179		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2180		if (!spage) {
2181leave_nomem:
2182			spin_lock(&sctx->stat_lock);
2183			sctx->stat.malloc_errors++;
2184			spin_unlock(&sctx->stat_lock);
2185			scrub_block_put(sblock);
2186			return -ENOMEM;
2187		}
2188		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2189		scrub_page_get(spage);
2190		sblock->pagev[index] = spage;
2191		spage->sblock = sblock;
2192		spage->dev = dev;
2193		spage->flags = flags;
2194		spage->generation = gen;
2195		spage->logical = logical;
2196		spage->physical = physical;
2197		spage->physical_for_dev_replace = physical_for_dev_replace;
2198		spage->mirror_num = mirror_num;
2199		if (csum) {
2200			spage->have_csum = 1;
2201			memcpy(spage->csum, csum, sctx->csum_size);
2202		} else {
2203			spage->have_csum = 0;
2204		}
2205		sblock->page_count++;
2206		spage->page = alloc_page(GFP_KERNEL);
2207		if (!spage->page)
2208			goto leave_nomem;
2209		len -= l;
2210		logical += l;
2211		physical += l;
2212		physical_for_dev_replace += l;
2213	}
2214
2215	WARN_ON(sblock->page_count == 0);
2216	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2217		/*
2218		 * This case should only be hit for RAID 5/6 device replace. See
2219		 * the comment in scrub_missing_raid56_pages() for details.
2220		 */
2221		scrub_missing_raid56_pages(sblock);
2222	} else {
2223		for (index = 0; index < sblock->page_count; index++) {
2224			struct scrub_page *spage = sblock->pagev[index];
2225			int ret;
2226
2227			ret = scrub_add_page_to_rd_bio(sctx, spage);
2228			if (ret) {
2229				scrub_block_put(sblock);
2230				return ret;
2231			}
2232		}
2233
2234		if (force)
2235			scrub_submit(sctx);
2236	}
2237
2238	/* last one frees, either here or in bio completion for last page */
2239	scrub_block_put(sblock);
2240	return 0;
2241}
2242
2243static void scrub_bio_end_io(struct bio *bio)
2244{
2245	struct scrub_bio *sbio = bio->bi_private;
2246	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2247
2248	sbio->status = bio->bi_status;
2249	sbio->bio = bio;
2250
2251	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2252}
2253
2254static void scrub_bio_end_io_worker(struct btrfs_work *work)
2255{
2256	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2257	struct scrub_ctx *sctx = sbio->sctx;
2258	int i;
2259
2260	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2261	if (sbio->status) {
2262		for (i = 0; i < sbio->page_count; i++) {
2263			struct scrub_page *spage = sbio->pagev[i];
2264
2265			spage->io_error = 1;
2266			spage->sblock->no_io_error_seen = 0;
2267		}
2268	}
2269
2270	/* now complete the scrub_block items that have all pages completed */
2271	for (i = 0; i < sbio->page_count; i++) {
2272		struct scrub_page *spage = sbio->pagev[i];
2273		struct scrub_block *sblock = spage->sblock;
2274
2275		if (atomic_dec_and_test(&sblock->outstanding_pages))
2276			scrub_block_complete(sblock);
2277		scrub_block_put(sblock);
2278	}
2279
2280	bio_put(sbio->bio);
2281	sbio->bio = NULL;
2282	spin_lock(&sctx->list_lock);
2283	sbio->next_free = sctx->first_free;
2284	sctx->first_free = sbio->index;
2285	spin_unlock(&sctx->list_lock);
2286
2287	if (sctx->is_dev_replace && sctx->flush_all_writes) {
2288		mutex_lock(&sctx->wr_lock);
2289		scrub_wr_submit(sctx);
2290		mutex_unlock(&sctx->wr_lock);
2291	}
2292
2293	scrub_pending_bio_dec(sctx);
2294}
2295
2296static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2297				       unsigned long *bitmap,
2298				       u64 start, u64 len)
2299{
2300	u64 offset;
2301	u64 nsectors64;
2302	u32 nsectors;
2303	int sectorsize = sparity->sctx->fs_info->sectorsize;
2304
2305	if (len >= sparity->stripe_len) {
2306		bitmap_set(bitmap, 0, sparity->nsectors);
2307		return;
2308	}
2309
2310	start -= sparity->logic_start;
2311	start = div64_u64_rem(start, sparity->stripe_len, &offset);
2312	offset = div_u64(offset, sectorsize);
2313	nsectors64 = div_u64(len, sectorsize);
2314
2315	ASSERT(nsectors64 < UINT_MAX);
2316	nsectors = (u32)nsectors64;
2317
2318	if (offset + nsectors <= sparity->nsectors) {
2319		bitmap_set(bitmap, offset, nsectors);
2320		return;
2321	}
2322
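	/*
	 * The range wraps around the end of the stripe: set the bits up to
	 * the end of the bitmap first, then the remainder from the start.
	 */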
2323	bitmap_set(bitmap, offset, sparity->nsectors - offset);
2324	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2325}
2326
2327static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2328						   u64 start, u64 len)
2329{
2330	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2331}
2332
2333static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2334						  u64 start, u64 len)
2335{
2336	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2337}
2338
2339static void scrub_block_complete(struct scrub_block *sblock)
2340{
2341	int corrupted = 0;
2342
2343	if (!sblock->no_io_error_seen) {
2344		corrupted = 1;
2345		scrub_handle_errored_block(sblock);
2346	} else {
2347		/*
2348		 * In the dev replace case: if the block has a checksum
2349		 * error it is written via the repair mechanism, otherwise
2350		 * it is written to the replace target here.
2351		 */
2352		corrupted = scrub_checksum(sblock);
2353		if (!corrupted && sblock->sctx->is_dev_replace)
2354			scrub_write_block_to_dev_replace(sblock);
2355	}
2356
2357	if (sblock->sparity && corrupted && !sblock->data_corrected) {
2358		u64 start = sblock->pagev[0]->logical;
2359		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2360			  PAGE_SIZE;
2361
2362		scrub_parity_mark_sectors_error(sblock->sparity,
2363						start, end - start);
2364	}
2365}
2366
2367static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
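/*
 * Look up the data checksum for @logical in sctx->csum_list. Entries that
 * end at or before @logical are dropped from the list on the way; the
 * matching entry is freed once its last sector has been consumed. Returns 1
 * and copies the checksum into @csum if one is found, 0 otherwise.
 */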
2368{
2369	struct btrfs_ordered_sum *sum = NULL;
2370	unsigned long index;
2371	unsigned long num_sectors;
2372
2373	while (!list_empty(&sctx->csum_list)) {
2374		sum = list_first_entry(&sctx->csum_list,
2375				       struct btrfs_ordered_sum, list);
2376		if (sum->bytenr > logical)
2377			return 0;
2378		if (sum->bytenr + sum->len > logical)
2379			break;
2380
2381		++sctx->stat.csum_discards;
2382		list_del(&sum->list);
2383		kfree(sum);
2384		sum = NULL;
2385	}
2386	if (!sum)
2387		return 0;
2388
2389	index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2390	ASSERT(index < UINT_MAX);
2391
2392	num_sectors = sum->len / sctx->fs_info->sectorsize;
2393	memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
2394	if (index == num_sectors - 1) {
2395		list_del(&sum->list);
2396		kfree(sum);
2397	}
2398	return 1;
2399}
2400
2401/* scrub extent tries to collect up to 64 kB for each bio */
2402static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2403			u64 logical, u64 len,
2404			u64 physical, struct btrfs_device *dev, u64 flags,
2405			u64 gen, int mirror_num, u64 physical_for_dev_replace)
2406{
2407	int ret;
2408	u8 csum[BTRFS_CSUM_SIZE];
2409	u32 blocksize;
2410
2411	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2412		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2413			blocksize = map->stripe_len;
2414		else
2415			blocksize = sctx->fs_info->sectorsize;
2416		spin_lock(&sctx->stat_lock);
2417		sctx->stat.data_extents_scrubbed++;
2418		sctx->stat.data_bytes_scrubbed += len;
2419		spin_unlock(&sctx->stat_lock);
2420	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2421		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2422			blocksize = map->stripe_len;
2423		else
2424			blocksize = sctx->fs_info->nodesize;
2425		spin_lock(&sctx->stat_lock);
2426		sctx->stat.tree_extents_scrubbed++;
2427		sctx->stat.tree_bytes_scrubbed += len;
2428		spin_unlock(&sctx->stat_lock);
2429	} else {
2430		blocksize = sctx->fs_info->sectorsize;
2431		WARN_ON(1);
2432	}
2433
2434	while (len) {
2435		u64 l = min_t(u64, len, blocksize);
2436		int have_csum = 0;
2437
2438		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2439			/* push csums to sbio */
2440			have_csum = scrub_find_csum(sctx, logical, csum);
2441			if (have_csum == 0)
2442				++sctx->stat.no_csum;
2443		}
2444		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2445				  mirror_num, have_csum ? csum : NULL, 0,
2446				  physical_for_dev_replace);
2447		if (ret)
2448			return ret;
2449		len -= l;
2450		logical += l;
2451		physical += l;
2452		physical_for_dev_replace += l;
2453	}
2454	return 0;
2455}
2456
2457static int scrub_pages_for_parity(struct scrub_parity *sparity,
2458				  u64 logical, u64 len,
2459				  u64 physical, struct btrfs_device *dev,
2460				  u64 flags, u64 gen, int mirror_num, u8 *csum)
2461{
2462	struct scrub_ctx *sctx = sparity->sctx;
2463	struct scrub_block *sblock;
2464	int index;
2465
2466	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2467	if (!sblock) {
2468		spin_lock(&sctx->stat_lock);
2469		sctx->stat.malloc_errors++;
2470		spin_unlock(&sctx->stat_lock);
2471		return -ENOMEM;
2472	}
2473
2474	/* one ref inside this function, plus one for each page added to
2475	 * a bio later on */
2476	refcount_set(&sblock->refs, 1);
2477	sblock->sctx = sctx;
2478	sblock->no_io_error_seen = 1;
2479	sblock->sparity = sparity;
2480	scrub_parity_get(sparity);
2481
2482	for (index = 0; len > 0; index++) {
2483		struct scrub_page *spage;
2484		u64 l = min_t(u64, len, PAGE_SIZE);
2485
2486		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2487		if (!spage) {
2488leave_nomem:
2489			spin_lock(&sctx->stat_lock);
2490			sctx->stat.malloc_errors++;
2491			spin_unlock(&sctx->stat_lock);
2492			scrub_block_put(sblock);
2493			return -ENOMEM;
2494		}
2495		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2496		/* For scrub block */
2497		scrub_page_get(spage);
2498		sblock->pagev[index] = spage;
2499		/* For scrub parity */
2500		scrub_page_get(spage);
2501		list_add_tail(&spage->list, &sparity->spages);
2502		spage->sblock = sblock;
2503		spage->dev = dev;
2504		spage->flags = flags;
2505		spage->generation = gen;
2506		spage->logical = logical;
2507		spage->physical = physical;
2508		spage->mirror_num = mirror_num;
2509		if (csum) {
2510			spage->have_csum = 1;
2511			memcpy(spage->csum, csum, sctx->csum_size);
2512		} else {
2513			spage->have_csum = 0;
2514		}
2515		sblock->page_count++;
2516		spage->page = alloc_page(GFP_KERNEL);
2517		if (!spage->page)
2518			goto leave_nomem;
2519		len -= l;
2520		logical += l;
2521		physical += l;
2522	}
2523
2524	WARN_ON(sblock->page_count == 0);
2525	for (index = 0; index < sblock->page_count; index++) {
2526		struct scrub_page *spage = sblock->pagev[index];
2527		int ret;
2528
2529		ret = scrub_add_page_to_rd_bio(sctx, spage);
2530		if (ret) {
2531			scrub_block_put(sblock);
2532			return ret;
2533		}
2534	}
2535
2536	/* last one frees, either here or in bio completion for last page */
2537	scrub_block_put(sblock);
2538	return 0;
2539}
2540
2541static int scrub_extent_for_parity(struct scrub_parity *sparity,
2542				   u64 logical, u64 len,
2543				   u64 physical, struct btrfs_device *dev,
2544				   u64 flags, u64 gen, int mirror_num)
2545{
2546	struct scrub_ctx *sctx = sparity->sctx;
2547	int ret;
2548	u8 csum[BTRFS_CSUM_SIZE];
2549	u32 blocksize;
2550
2551	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2552		scrub_parity_mark_sectors_error(sparity, logical, len);
2553		return 0;
2554	}
2555
2556	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2557		blocksize = sparity->stripe_len;
2558	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2559		blocksize = sparity->stripe_len;
2560	} else {
2561		blocksize = sctx->fs_info->sectorsize;
2562		WARN_ON(1);
2563	}
2564
2565	while (len) {
2566		u64 l = min_t(u64, len, blocksize);
2567		int have_csum = 0;
2568
2569		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2570			/* push csums to sbio */
2571			have_csum = scrub_find_csum(sctx, logical, csum);
2572			if (have_csum == 0)
2573				goto skip;
2574		}
2575		ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2576					     flags, gen, mirror_num,
2577					     have_csum ? csum : NULL);
2578		if (ret)
2579			return ret;
2580skip:
2581		len -= l;
2582		logical += l;
2583		physical += l;
2584	}
2585	return 0;
2586}
2587
2588/*
2589 * Given a physical address, this will calculate its
2590 * logical offset. If this is a parity stripe, it will return
2591 * the leftmost data stripe's logical offset.
2592 *
2593 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2594 */
2595static int get_raid56_logic_offset(u64 physical, int num,
2596				   struct map_lookup *map, u64 *offset,
2597				   u64 *stripe_start)
2598{
2599	int i;
2600	int j = 0;
2601	u64 stripe_nr;
2602	u64 last_offset;
2603	u32 stripe_index;
2604	u32 rot;
2605	const int data_stripes = nr_data_stripes(map);
2606
2607	last_offset = (physical - map->stripes[num].physical) * data_stripes;
2608	if (stripe_start)
2609		*stripe_start = last_offset;
2610
2611	*offset = last_offset;
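	/*
	 * Walk the data stripes of the full stripe starting at last_offset
	 * and check whether one of them, after applying the rotation, lands
	 * on this device. If so, the physical address belongs to a data
	 * stripe and its logical offset is returned in *offset.
	 */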
2612	for (i = 0; i < data_stripes; i++) {
2613		*offset = last_offset + i * map->stripe_len;
2614
2615		stripe_nr = div64_u64(*offset, map->stripe_len);
2616		stripe_nr = div_u64(stripe_nr, data_stripes);
2617
2618		/* Work out the disk rotation on this stripe-set */
2619		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2620		/* calculate which stripe this data is located on */
2621		rot += i;
2622		stripe_index = rot % map->num_stripes;
2623		if (stripe_index == num)
2624			return 0;
2625		if (stripe_index < num)
2626			j++;
2627	}
2628	*offset = last_offset + j * map->stripe_len;
2629	return 1;
2630}
2631
2632static void scrub_free_parity(struct scrub_parity *sparity)
2633{
2634	struct scrub_ctx *sctx = sparity->sctx;
2635	struct scrub_page *curr, *next;
2636	int nbits;
2637
2638	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2639	if (nbits) {
2640		spin_lock(&sctx->stat_lock);
2641		sctx->stat.read_errors += nbits;
2642		sctx->stat.uncorrectable_errors += nbits;
2643		spin_unlock(&sctx->stat_lock);
2644	}
2645
2646	list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2647		list_del_init(&curr->list);
2648		scrub_page_put(curr);
2649	}
2650
2651	kfree(sparity);
2652}
2653
2654static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2655{
2656	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2657						    work);
2658	struct scrub_ctx *sctx = sparity->sctx;
2659
2660	scrub_free_parity(sparity);
2661	scrub_pending_bio_dec(sctx);
2662}
2663
2664static void scrub_parity_bio_endio(struct bio *bio)
2665{
2666	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2667	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2668
2669	if (bio->bi_status)
2670		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2671			  sparity->nsectors);
2672
2673	bio_put(bio);
2674
2675	btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2676			NULL);
2677	btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2678}
2679
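/*
 * Called once the last reference to the scrub_parity is dropped: build a
 * scrub rbio over the data sectors that were read without error (dbitmap
 * with the error bits cleared) and submit it to the RAID56 layer, which
 * verifies and, if needed, rewrites the parity of this stripe.
 */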
2680static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2681{
2682	struct scrub_ctx *sctx = sparity->sctx;
2683	struct btrfs_fs_info *fs_info = sctx->fs_info;
2684	struct bio *bio;
2685	struct btrfs_raid_bio *rbio;
2686	struct btrfs_bio *bbio = NULL;
2687	u64 length;
2688	int ret;
2689
2690	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2691			   sparity->nsectors))
2692		goto out;
2693
2694	length = sparity->logic_end - sparity->logic_start;
2695
2696	btrfs_bio_counter_inc_blocked(fs_info);
2697	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2698			       &length, &bbio);
2699	if (ret || !bbio || !bbio->raid_map)
2700		goto bbio_out;
2701
2702	bio = btrfs_io_bio_alloc(0);
2703	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2704	bio->bi_private = sparity;
2705	bio->bi_end_io = scrub_parity_bio_endio;
2706
2707	rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2708					      length, sparity->scrub_dev,
2709					      sparity->dbitmap,
2710					      sparity->nsectors);
2711	if (!rbio)
2712		goto rbio_out;
2713
2714	scrub_pending_bio_inc(sctx);
2715	raid56_parity_submit_scrub_rbio(rbio);
2716	return;
2717
2718rbio_out:
2719	bio_put(bio);
2720bbio_out:
2721	btrfs_bio_counter_dec(fs_info);
2722	btrfs_put_bbio(bbio);
2723	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2724		  sparity->nsectors);
2725	spin_lock(&sctx->stat_lock);
2726	sctx->stat.malloc_errors++;
2727	spin_unlock(&sctx->stat_lock);
2728out:
2729	scrub_free_parity(sparity);
2730}
2731
2732static inline int scrub_calc_parity_bitmap_len(int nsectors)
2733{
2734	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2735}
2736
2737static void scrub_parity_get(struct scrub_parity *sparity)
2738{
2739	refcount_inc(&sparity->refs);
2740}
2741
2742static void scrub_parity_put(struct scrub_parity *sparity)
2743{
2744	if (!refcount_dec_and_test(&sparity->refs))
2745		return;
2746
2747	scrub_parity_check_and_repair(sparity);
2748}
2749
2750static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2751						  struct map_lookup *map,
2752						  struct btrfs_device *sdev,
2753						  struct btrfs_path *path,
2754						  u64 logic_start,
2755						  u64 logic_end)
2756{
2757	struct btrfs_fs_info *fs_info = sctx->fs_info;
2758	struct btrfs_root *root = fs_info->extent_root;
2759	struct btrfs_root *csum_root = fs_info->csum_root;
2760	struct btrfs_extent_item *extent;
2761	struct btrfs_bio *bbio = NULL;
2762	u64 flags;
2763	int ret;
2764	int slot;
2765	struct extent_buffer *l;
2766	struct btrfs_key key;
2767	u64 generation;
2768	u64 extent_logical;
2769	u64 extent_physical;
2770	u64 extent_len;
2771	u64 mapped_length;
2772	struct btrfs_device *extent_dev;
2773	struct scrub_parity *sparity;
2774	int nsectors;
2775	int bitmap_len;
2776	int extent_mirror_num;
2777	int stop_loop = 0;
2778
2779	nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
2780	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2781	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2782			  GFP_NOFS);
2783	if (!sparity) {
2784		spin_lock(&sctx->stat_lock);
2785		sctx->stat.malloc_errors++;
2786		spin_unlock(&sctx->stat_lock);
2787		return -ENOMEM;
2788	}
2789
2790	sparity->stripe_len = map->stripe_len;
2791	sparity->nsectors = nsectors;
2792	sparity->sctx = sctx;
2793	sparity->scrub_dev = sdev;
2794	sparity->logic_start = logic_start;
2795	sparity->logic_end = logic_end;
2796	refcount_set(&sparity->refs, 1);
2797	INIT_LIST_HEAD(&sparity->spages);
2798	sparity->dbitmap = sparity->bitmap;
2799	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2800
2801	ret = 0;
2802	while (logic_start < logic_end) {
2803		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2804			key.type = BTRFS_METADATA_ITEM_KEY;
2805		else
2806			key.type = BTRFS_EXTENT_ITEM_KEY;
2807		key.objectid = logic_start;
2808		key.offset = (u64)-1;
2809
2810		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2811		if (ret < 0)
2812			goto out;
2813
2814		if (ret > 0) {
2815			ret = btrfs_previous_extent_item(root, path, 0);
2816			if (ret < 0)
2817				goto out;
2818			if (ret > 0) {
2819				btrfs_release_path(path);
2820				ret = btrfs_search_slot(NULL, root, &key,
2821							path, 0, 0);
2822				if (ret < 0)
2823					goto out;
2824			}
2825		}
2826
2827		stop_loop = 0;
2828		while (1) {
2829			u64 bytes;
2830
2831			l = path->nodes[0];
2832			slot = path->slots[0];
2833			if (slot >= btrfs_header_nritems(l)) {
2834				ret = btrfs_next_leaf(root, path);
2835				if (ret == 0)
2836					continue;
2837				if (ret < 0)
2838					goto out;
2839
2840				stop_loop = 1;
2841				break;
2842			}
2843			btrfs_item_key_to_cpu(l, &key, slot);
2844
2845			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2846			    key.type != BTRFS_METADATA_ITEM_KEY)
2847				goto next;
2848
2849			if (key.type == BTRFS_METADATA_ITEM_KEY)
2850				bytes = fs_info->nodesize;
2851			else
2852				bytes = key.offset;
2853
2854			if (key.objectid + bytes <= logic_start)
2855				goto next;
2856
2857			if (key.objectid >= logic_end) {
2858				stop_loop = 1;
2859				break;
2860			}
2861
2862			while (key.objectid >= logic_start + map->stripe_len)
2863				logic_start += map->stripe_len;
2864
2865			extent = btrfs_item_ptr(l, slot,
2866						struct btrfs_extent_item);
2867			flags = btrfs_extent_flags(l, extent);
2868			generation = btrfs_extent_generation(l, extent);
2869
2870			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2871			    (key.objectid < logic_start ||
2872			     key.objectid + bytes >
2873			     logic_start + map->stripe_len)) {
2874				btrfs_err(fs_info,
2875					  "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2876					  key.objectid, logic_start);
2877				spin_lock(&sctx->stat_lock);
2878				sctx->stat.uncorrectable_errors++;
2879				spin_unlock(&sctx->stat_lock);
2880				goto next;
2881			}
2882again:
2883			extent_logical = key.objectid;
2884			extent_len = bytes;
2885
2886			if (extent_logical < logic_start) {
2887				extent_len -= logic_start - extent_logical;
2888				extent_logical = logic_start;
2889			}
2890
2891			if (extent_logical + extent_len >
2892			    logic_start + map->stripe_len)
2893				extent_len = logic_start + map->stripe_len -
2894					     extent_logical;
2895
2896			scrub_parity_mark_sectors_data(sparity, extent_logical,
2897						       extent_len);
2898
2899			mapped_length = extent_len;
2900			bbio = NULL;
2901			ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2902					extent_logical, &mapped_length, &bbio,
2903					0);
2904			if (!ret) {
2905				if (!bbio || mapped_length < extent_len)
2906					ret = -EIO;
2907			}
2908			if (ret) {
2909				btrfs_put_bbio(bbio);
2910				goto out;
2911			}
2912			extent_physical = bbio->stripes[0].physical;
2913			extent_mirror_num = bbio->mirror_num;
2914			extent_dev = bbio->stripes[0].dev;
2915			btrfs_put_bbio(bbio);
2916
2917			ret = btrfs_lookup_csums_range(csum_root,
2918						extent_logical,
2919						extent_logical + extent_len - 1,
2920						&sctx->csum_list, 1);
2921			if (ret)
2922				goto out;
2923
2924			ret = scrub_extent_for_parity(sparity, extent_logical,
2925						      extent_len,
2926						      extent_physical,
2927						      extent_dev, flags,
2928						      generation,
2929						      extent_mirror_num);
2930
2931			scrub_free_csums(sctx);
2932
2933			if (ret)
2934				goto out;
2935
2936			if (extent_logical + extent_len <
2937			    key.objectid + bytes) {
2938				logic_start += map->stripe_len;
2939
2940				if (logic_start >= logic_end) {
2941					stop_loop = 1;
2942					break;
2943				}
2944
2945				if (logic_start < key.objectid + bytes) {
2946					cond_resched();
2947					goto again;
2948				}
2949			}
2950next:
2951			path->slots[0]++;
2952		}
2953
2954		btrfs_release_path(path);
2955
2956		if (stop_loop)
2957			break;
2958
2959		logic_start += map->stripe_len;
2960	}
2961out:
2962	if (ret < 0)
2963		scrub_parity_mark_sectors_error(sparity, logic_start,
2964						logic_end - logic_start);
2965	scrub_parity_put(sparity);
2966	scrub_submit(sctx);
2967	mutex_lock(&sctx->wr_lock);
2968	scrub_wr_submit(sctx);
2969	mutex_unlock(&sctx->wr_lock);
2970
2971	btrfs_release_path(path);
2972	return ret < 0 ? ret : 0;
2973}
2974
2975static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2976					   struct map_lookup *map,
2977					   struct btrfs_device *scrub_dev,
2978					   int num, u64 base, u64 length,
2979					   struct btrfs_block_group *cache)
2980{
2981	struct btrfs_path *path, *ppath;
2982	struct btrfs_fs_info *fs_info = sctx->fs_info;
2983	struct btrfs_root *root = fs_info->extent_root;
2984	struct btrfs_root *csum_root = fs_info->csum_root;
2985	struct btrfs_extent_item *extent;
2986	struct blk_plug plug;
2987	u64 flags;
2988	int ret;
2989	int slot;
2990	u64 nstripes;
2991	struct extent_buffer *l;
2992	u64 physical;
2993	u64 logical;
2994	u64 logic_end;
2995	u64 physical_end;
2996	u64 generation;
2997	int mirror_num;
2998	struct reada_control *reada1;
2999	struct reada_control *reada2;
3000	struct btrfs_key key;
3001	struct btrfs_key key_end;
3002	u64 increment = map->stripe_len;
3003	u64 offset;
3004	u64 extent_logical;
3005	u64 extent_physical;
3006	u64 extent_len;
3007	u64 stripe_logical;
3008	u64 stripe_end;
3009	struct btrfs_device *extent_dev;
3010	int extent_mirror_num;
3011	int stop_loop = 0;
3012
3013	physical = map->stripes[num].physical;
3014	offset = 0;
3015	nstripes = div64_u64(length, map->stripe_len);
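	/*
	 * Per RAID profile, work out where the first stripe of this device
	 * sits inside the chunk (offset), how far the logical address
	 * advances from one of this device's stripes to the next (increment)
	 * and which mirror this copy corresponds to (mirror_num).
	 */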
3016	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3017		offset = map->stripe_len * num;
3018		increment = map->stripe_len * map->num_stripes;
3019		mirror_num = 1;
3020	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3021		int factor = map->num_stripes / map->sub_stripes;
3022		offset = map->stripe_len * (num / map->sub_stripes);
3023		increment = map->stripe_len * factor;
3024		mirror_num = num % map->sub_stripes + 1;
3025	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3026		increment = map->stripe_len;
3027		mirror_num = num % map->num_stripes + 1;
3028	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3029		increment = map->stripe_len;
3030		mirror_num = num % map->num_stripes + 1;
3031	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3032		get_raid56_logic_offset(physical, num, map, &offset, NULL);
3033		increment = map->stripe_len * nr_data_stripes(map);
3034		mirror_num = 1;
3035	} else {
3036		increment = map->stripe_len;
3037		mirror_num = 1;
3038	}
3039
3040	path = btrfs_alloc_path();
3041	if (!path)
3042		return -ENOMEM;
3043
3044	ppath = btrfs_alloc_path();
3045	if (!ppath) {
3046		btrfs_free_path(path);
3047		return -ENOMEM;
3048	}
3049
3050	/*
3051	 * work on commit root. The related disk blocks are static as
3052	 * long as COW is applied. This means it is safe to rewrite
3053	 * them to repair disk errors without any race conditions.
3054	 */
3055	path->search_commit_root = 1;
3056	path->skip_locking = 1;
3057
3058	ppath->search_commit_root = 1;
3059	ppath->skip_locking = 1;
3060	/*
3061	 * trigger the readahead for the extent tree and csum tree and wait
3062	 * for completion. During readahead, the scrub is officially paused
3063	 * to not hold off transaction commits.
3064	 */
3065	logical = base + offset;
3066	physical_end = physical + nstripes * map->stripe_len;
3067	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3068		get_raid56_logic_offset(physical_end, num,
3069					map, &logic_end, NULL);
3070		logic_end += base;
3071	} else {
3072		logic_end = logical + increment * nstripes;
3073	}
3074	wait_event(sctx->list_wait,
3075		   atomic_read(&sctx->bios_in_flight) == 0);
3076	scrub_blocked_if_needed(fs_info);
3077
3078	/* FIXME it might be better to start readahead at commit root */
3079	key.objectid = logical;
3080	key.type = BTRFS_EXTENT_ITEM_KEY;
3081	key.offset = (u64)0;
3082	key_end.objectid = logic_end;
3083	key_end.type = BTRFS_METADATA_ITEM_KEY;
3084	key_end.offset = (u64)-1;
3085	reada1 = btrfs_reada_add(root, &key, &key_end);
3086
3087	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3088	key.type = BTRFS_EXTENT_CSUM_KEY;
3089	key.offset = logical;
3090	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3091	key_end.type = BTRFS_EXTENT_CSUM_KEY;
3092	key_end.offset = logic_end;
3093	reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3094
3095	if (!IS_ERR(reada1))
3096		btrfs_reada_wait(reada1);
3097	if (!IS_ERR(reada2))
3098		btrfs_reada_wait(reada2);
3099
3100
3101	/*
3102	 * collect all data csums for the stripe to avoid seeking during
3103	 * the scrub. This might currently (crc32) end up being about 1MB.
3104	 */
3105	blk_start_plug(&plug);
3106
3107	/*
3108	 * now find all extents for each stripe and scrub them
3109	 */
3110	ret = 0;
3111	while (physical < physical_end) {
3112		/*
3113		 * canceled?
3114		 */
3115		if (atomic_read(&fs_info->scrub_cancel_req) ||
3116		    atomic_read(&sctx->cancel_req)) {
3117			ret = -ECANCELED;
3118			goto out;
3119		}
3120		/*
3121		 * check to see if we have to pause
3122		 */
3123		if (atomic_read(&fs_info->scrub_pause_req)) {
3124			/* push queued extents */
3125			sctx->flush_all_writes = true;
3126			scrub_submit(sctx);
3127			mutex_lock(&sctx->wr_lock);
3128			scrub_wr_submit(sctx);
3129			mutex_unlock(&sctx->wr_lock);
3130			wait_event(sctx->list_wait,
3131				   atomic_read(&sctx->bios_in_flight) == 0);
3132			sctx->flush_all_writes = false;
3133			scrub_blocked_if_needed(fs_info);
3134		}
3135
3136		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3137			ret = get_raid56_logic_offset(physical, num, map,
3138						      &logical,
3139						      &stripe_logical);
3140			logical += base;
3141			if (ret) {
3142				/* it is a parity stripe */
3143				stripe_logical += base;
3144				stripe_end = stripe_logical + increment;
3145				ret = scrub_raid56_parity(sctx, map, scrub_dev,
3146							  ppath, stripe_logical,
3147							  stripe_end);
3148				if (ret)
3149					goto out;
3150				goto skip;
3151			}
3152		}
3153
3154		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3155			key.type = BTRFS_METADATA_ITEM_KEY;
3156		else
3157			key.type = BTRFS_EXTENT_ITEM_KEY;
3158		key.objectid = logical;
3159		key.offset = (u64)-1;
3160
3161		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3162		if (ret < 0)
3163			goto out;
3164
3165		if (ret > 0) {
3166			ret = btrfs_previous_extent_item(root, path, 0);
3167			if (ret < 0)
3168				goto out;
3169			if (ret > 0) {
3170				/* there's no smaller item, so stick with the
3171				 * larger one */
3172				btrfs_release_path(path);
3173				ret = btrfs_search_slot(NULL, root, &key,
3174							path, 0, 0);
3175				if (ret < 0)
3176					goto out;
3177			}
3178		}
3179
3180		stop_loop = 0;
3181		while (1) {
3182			u64 bytes;
3183
3184			l = path->nodes[0];
3185			slot = path->slots[0];
3186			if (slot >= btrfs_header_nritems(l)) {
3187				ret = btrfs_next_leaf(root, path);
3188				if (ret == 0)
3189					continue;
3190				if (ret < 0)
3191					goto out;
3192
3193				stop_loop = 1;
3194				break;
3195			}
3196			btrfs_item_key_to_cpu(l, &key, slot);
3197
3198			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3199			    key.type != BTRFS_METADATA_ITEM_KEY)
3200				goto next;
3201
3202			if (key.type == BTRFS_METADATA_ITEM_KEY)
3203				bytes = fs_info->nodesize;
3204			else
3205				bytes = key.offset;
3206
3207			if (key.objectid + bytes <= logical)
3208				goto next;
3209
3210			if (key.objectid >= logical + map->stripe_len) {
3211				/* out of this device extent */
3212				if (key.objectid >= logic_end)
3213					stop_loop = 1;
3214				break;
3215			}
3216
3217			/*
3218			 * If our block group was removed in the meanwhile, just
3219			 * stop scrubbing since there is no point in continuing.
3220			 * Continuing would prevent reusing its device extents
3221			 * for new block groups for a long time.
3222			 */
3223			spin_lock(&cache->lock);
3224			if (cache->removed) {
3225				spin_unlock(&cache->lock);
3226				ret = 0;
3227				goto out;
3228			}
3229			spin_unlock(&cache->lock);
3230
3231			extent = btrfs_item_ptr(l, slot,
3232						struct btrfs_extent_item);
3233			flags = btrfs_extent_flags(l, extent);
3234			generation = btrfs_extent_generation(l, extent);
3235
3236			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3237			    (key.objectid < logical ||
3238			     key.objectid + bytes >
3239			     logical + map->stripe_len)) {
3240				btrfs_err(fs_info,
3241					  "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3242					  key.objectid, logical);
3243				spin_lock(&sctx->stat_lock);
3244				sctx->stat.uncorrectable_errors++;
3245				spin_unlock(&sctx->stat_lock);
3246				goto next;
3247			}
3248
3249again:
3250			extent_logical = key.objectid;
3251			extent_len = bytes;
3252
3253			/*
3254			 * trim extent to this stripe
3255			 */
3256			if (extent_logical < logical) {
3257				extent_len -= logical - extent_logical;
3258				extent_logical = logical;
3259			}
3260			if (extent_logical + extent_len >
3261			    logical + map->stripe_len) {
3262				extent_len = logical + map->stripe_len -
3263					     extent_logical;
3264			}
3265
3266			extent_physical = extent_logical - logical + physical;
3267			extent_dev = scrub_dev;
3268			extent_mirror_num = mirror_num;
3269			if (sctx->is_dev_replace)
3270				scrub_remap_extent(fs_info, extent_logical,
3271						   extent_len, &extent_physical,
3272						   &extent_dev,
3273						   &extent_mirror_num);
3274
3275			if (flags & BTRFS_EXTENT_FLAG_DATA) {
3276				ret = btrfs_lookup_csums_range(csum_root,
3277						extent_logical,
3278						extent_logical + extent_len - 1,
3279						&sctx->csum_list, 1);
3280				if (ret)
3281					goto out;
3282			}
3283
3284			ret = scrub_extent(sctx, map, extent_logical, extent_len,
3285					   extent_physical, extent_dev, flags,
3286					   generation, extent_mirror_num,
3287					   extent_logical - logical + physical);
3288
3289			scrub_free_csums(sctx);
3290
3291			if (ret)
3292				goto out;
3293
3294			if (extent_logical + extent_len <
3295			    key.objectid + bytes) {
3296				if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3297					/*
3298					 * loop until we find the next data stripe
3299					 * or we have finished all stripes.
3300					 */
3301loop:
3302					physical += map->stripe_len;
3303					ret = get_raid56_logic_offset(physical,
3304							num, map, &logical,
3305							&stripe_logical);
3306					logical += base;
3307
3308					if (ret && physical < physical_end) {
3309						stripe_logical += base;
3310						stripe_end = stripe_logical +
3311								increment;
3312						ret = scrub_raid56_parity(sctx,
3313							map, scrub_dev, ppath,
3314							stripe_logical,
3315							stripe_end);
3316						if (ret)
3317							goto out;
3318						goto loop;
3319					}
3320				} else {
3321					physical += map->stripe_len;
3322					logical += increment;
3323				}
3324				if (logical < key.objectid + bytes) {
3325					cond_resched();
3326					goto again;
3327				}
3328
3329				if (physical >= physical_end) {
3330					stop_loop = 1;
3331					break;
3332				}
3333			}
3334next:
3335			path->slots[0]++;
3336		}
3337		btrfs_release_path(path);
3338skip:
3339		logical += increment;
3340		physical += map->stripe_len;
3341		spin_lock(&sctx->stat_lock);
3342		if (stop_loop)
3343			sctx->stat.last_physical = map->stripes[num].physical +
3344						   length;
3345		else
3346			sctx->stat.last_physical = physical;
3347		spin_unlock(&sctx->stat_lock);
3348		if (stop_loop)
3349			break;
3350	}
3351out:
3352	/* push queued extents */
3353	scrub_submit(sctx);
3354	mutex_lock(&sctx->wr_lock);
3355	scrub_wr_submit(sctx);
3356	mutex_unlock(&sctx->wr_lock);
3357
3358	blk_finish_plug(&plug);
3359	btrfs_free_path(path);
3360	btrfs_free_path(ppath);
3361	return ret < 0 ? ret : 0;
3362}
3363
3364static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3365					  struct btrfs_device *scrub_dev,
3366					  u64 chunk_offset, u64 length,
3367					  u64 dev_offset,
3368					  struct btrfs_block_group *cache)
3369{
3370	struct btrfs_fs_info *fs_info = sctx->fs_info;
3371	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3372	struct map_lookup *map;
3373	struct extent_map *em;
3374	int i;
3375	int ret = 0;
3376
3377	read_lock(&map_tree->lock);
3378	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3379	read_unlock(&map_tree->lock);
3380
3381	if (!em) {
3382		/*
3383		 * Might have been an unused block group deleted by the cleaner
3384		 * kthread or relocation.
3385		 */
3386		spin_lock(&cache->lock);
3387		if (!cache->removed)
3388			ret = -EINVAL;
3389		spin_unlock(&cache->lock);
3390
3391		return ret;
3392	}
3393
3394	map = em->map_lookup;
3395	if (em->start != chunk_offset)
3396		goto out;
3397
3398	if (em->len < length)
3399		goto out;
3400
3401	for (i = 0; i < map->num_stripes; ++i) {
3402		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3403		    map->stripes[i].physical == dev_offset) {
3404			ret = scrub_stripe(sctx, map, scrub_dev, i,
3405					   chunk_offset, length, cache);
3406			if (ret)
3407				goto out;
3408		}
3409	}
3410out:
3411	free_extent_map(em);
3412
3413	return ret;
3414}
3415
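/*
 * Walk all DEV_EXTENT items of @scrub_dev in the range [start, end) and
 * scrub the block group behind each of them. Every block group is frozen
 * and, when possible, marked read-only while it is being scrubbed.
 */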
3416static noinline_for_stack
3417int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3418			   struct btrfs_device *scrub_dev, u64 start, u64 end)
3419{
3420	struct btrfs_dev_extent *dev_extent = NULL;
3421	struct btrfs_path *path;
3422	struct btrfs_fs_info *fs_info = sctx->fs_info;
3423	struct btrfs_root *root = fs_info->dev_root;
3424	u64 length;
3425	u64 chunk_offset;
3426	int ret = 0;
3427	int ro_set;
3428	int slot;
3429	struct extent_buffer *l;
3430	struct btrfs_key key;
3431	struct btrfs_key found_key;
3432	struct btrfs_block_group *cache;
3433	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3434
3435	path = btrfs_alloc_path();
3436	if (!path)
3437		return -ENOMEM;
3438
3439	path->reada = READA_FORWARD;
3440	path->search_commit_root = 1;
3441	path->skip_locking = 1;
3442
3443	key.objectid = scrub_dev->devid;
3444	key.offset = 0ull;
3445	key.type = BTRFS_DEV_EXTENT_KEY;
3446
3447	while (1) {
3448		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3449		if (ret < 0)
3450			break;
3451		if (ret > 0) {
3452			if (path->slots[0] >=
3453			    btrfs_header_nritems(path->nodes[0])) {
3454				ret = btrfs_next_leaf(root, path);
3455				if (ret < 0)
3456					break;
3457				if (ret > 0) {
3458					ret = 0;
3459					break;
3460				}
3461			} else {
3462				ret = 0;
3463			}
3464		}
3465
3466		l = path->nodes[0];
3467		slot = path->slots[0];
3468
3469		btrfs_item_key_to_cpu(l, &found_key, slot);
3470
3471		if (found_key.objectid != scrub_dev->devid)
3472			break;
3473
3474		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3475			break;
3476
3477		if (found_key.offset >= end)
3478			break;
3479
3480		if (found_key.offset < key.offset)
3481			break;
3482
3483		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3484		length = btrfs_dev_extent_length(l, dev_extent);
3485
3486		if (found_key.offset + length <= start)
3487			goto skip;
3488
3489		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3490
3491		/*
3492		 * get a reference on the corresponding block group to prevent
3493		 * the chunk from going away while we scrub it
3494		 */
3495		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3496
3497		/* some chunks are removed but not committed to disk yet,
3498		 * continue scrubbing */
3499		if (!cache)
3500			goto skip;
3501
3502		/*
3503		 * Make sure that while we are scrubbing the corresponding block
3504		 * group doesn't get its logical address and its device extents
3505		 * reused for another block group, which can possibly be of a
3506		 * different type and different profile. We do this to prevent
3507		 * false error detections and crashes due to bogus attempts to
3508		 * repair extents.
3509		 */
3510		spin_lock(&cache->lock);
3511		if (cache->removed) {
3512			spin_unlock(&cache->lock);
3513			btrfs_put_block_group(cache);
3514			goto skip;
3515		}
3516		btrfs_freeze_block_group(cache);
3517		spin_unlock(&cache->lock);
3518
3519		/*
3520		 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3521		 * to avoid deadlock caused by:
3522		 * btrfs_inc_block_group_ro()
3523		 * -> btrfs_wait_for_commit()
3524		 * -> btrfs_commit_transaction()
3525		 * -> btrfs_scrub_pause()
3526		 */
3527		scrub_pause_on(fs_info);
3528
3529		/*
3530		 * Don't do chunk preallocation for scrub.
3531		 *
3532		 * This is especially important for SYSTEM bgs, or we can hit
3533		 * -EFBIG from btrfs_finish_chunk_alloc() like:
3534		 * 1. The only SYSTEM bg is marked RO.
3535		 *    Since SYSTEM bg is small, that's pretty common.
3536		 * 2. New SYSTEM bg will be allocated
3537		 *    because the regular code path will allocate a new chunk.
3538		 * 3. New SYSTEM bg is empty and will get cleaned up
3539		 *    Before cleanup really happens, it's marked RO again.
3540		 * 4. Empty SYSTEM bg gets scrubbed
3541		 *    We go back to 2.
3542		 *
3543		 * This can easily boost the number of SYSTEM chunks if the
3544		 * cleaner thread can't be triggered fast enough, and use up all
3545		 * the space of btrfs_super_block::sys_chunk_array.
3546		 *
3547		 * While for dev replace, we need to try our best to mark block
3548		 * group RO, to prevent race between:
3549		 * - Write duplication
3550		 *   Contains latest data
3551		 * - Scrub copy
3552		 *   Contains data from commit tree
3553		 *
3554		 * If target block group is not marked RO, nocow writes can
3555		 * If the target block group is not marked RO, nocow writes can
3556		 * be overwritten by the scrub copy, causing data corruption.
3557		 * group is not RO.
3558		 */
3559		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3560		if (ret == 0) {
3561			ro_set = 1;
3562		} else if (ret == -ENOSPC && !sctx->is_dev_replace &&
3563			   !(cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) {
3564			/*
3565			 * btrfs_inc_block_group_ro returns -ENOSPC when it
3566			 * fails to create a new chunk for metadata.
3567			 * It is not a problem for scrub, because
3568			 * metadata is always cowed, and our scrub paused
3569			 * commit_transactions.
3570			 *
3571			 * For RAID56 chunks, we have to mark them read-only
3572			 * for scrub, as later we would use our own cache
3573			 * out of the RAID56 realm.
3574			 * Thus we want the RAID56 bg to be marked RO to
3575			 * prevent RMW from screwing up our cache.
3576			 */
3577			ro_set = 0;
3578		} else if (ret == -ETXTBSY) {
3579			btrfs_warn(fs_info,
3580		   "skipping scrub of block group %llu due to active swapfile",
3581				   cache->start);
3582			scrub_pause_off(fs_info);
3583			ret = 0;
3584			goto skip_unfreeze;
3585		} else {
3586			btrfs_warn(fs_info,
3587				   "failed setting block group ro: %d", ret);
3588			btrfs_unfreeze_block_group(cache);
3589			btrfs_put_block_group(cache);
3590			scrub_pause_off(fs_info);
3591			break;
3592		}
3593
3594		/*
3595		 * Now the target block group is marked RO, wait for nocow
3596		 * writes to finish before dev-replace.
3597		 * COW is fine, as COW never overwrites extents in commit tree.
3598		 */
3599		if (sctx->is_dev_replace) {
3600			btrfs_wait_nocow_writers(cache);
3601			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3602					cache->length);
3603		}
3604
3605		scrub_pause_off(fs_info);
3606		down_write(&dev_replace->rwsem);
3607		dev_replace->cursor_right = found_key.offset + length;
3608		dev_replace->cursor_left = found_key.offset;
3609		dev_replace->item_needs_writeback = 1;
3610		up_write(&dev_replace->rwsem);
3611
3612		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3613				  found_key.offset, cache);
3614
3615		/*
3616		 * flush, submit all pending read and write bios, afterwards
3617		 * wait for them.
3618		 * Note that in the dev replace case, a read request causes
3619		 * write requests that are submitted in the read completion
3620		 * worker. Therefore in the current situation, it is required
3621		 * that all write requests are flushed, so that all read and
3622		 * write requests are really completed when bios_in_flight
3623		 * changes to 0.
3624		 */
3625		sctx->flush_all_writes = true;
3626		scrub_submit(sctx);
3627		mutex_lock(&sctx->wr_lock);
3628		scrub_wr_submit(sctx);
3629		mutex_unlock(&sctx->wr_lock);
3630
3631		wait_event(sctx->list_wait,
3632			   atomic_read(&sctx->bios_in_flight) == 0);
3633
3634		scrub_pause_on(fs_info);
3635
3636		/*
3637		 * must be called before we decrease @scrub_paused.
3638		 * make sure we don't block transaction commit while
3639		 * we are waiting for pending workers to finish.
3640		 */
3641		wait_event(sctx->list_wait,
3642			   atomic_read(&sctx->workers_pending) == 0);
3643		sctx->flush_all_writes = false;
3644
3645		scrub_pause_off(fs_info);
3646
3647		down_write(&dev_replace->rwsem);
3648		dev_replace->cursor_left = dev_replace->cursor_right;
3649		dev_replace->item_needs_writeback = 1;
3650		up_write(&dev_replace->rwsem);
3651
3652		if (ro_set)
3653			btrfs_dec_block_group_ro(cache);
3654
3655		/*
3656		 * We might have prevented the cleaner kthread from deleting
3657		 * this block group if it was already unused because we raced
3658		 * and set it to RO mode first. So add it back to the unused
3659		 * list, otherwise it might not ever be deleted unless a manual
3660		 * balance is triggered or it becomes used and unused again.
3661		 */
3662		spin_lock(&cache->lock);
3663		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3664		    cache->used == 0) {
3665			spin_unlock(&cache->lock);
3666			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3667				btrfs_discard_queue_work(&fs_info->discard_ctl,
3668							 cache);
3669			else
3670				btrfs_mark_bg_unused(cache);
3671		} else {
3672			spin_unlock(&cache->lock);
3673		}
3674skip_unfreeze:
3675		btrfs_unfreeze_block_group(cache);
3676		btrfs_put_block_group(cache);
3677		if (ret)
3678			break;
3679		if (sctx->is_dev_replace &&
3680		    atomic64_read(&dev_replace->num_write_errors) > 0) {
3681			ret = -EIO;
3682			break;
3683		}
3684		if (sctx->stat.malloc_errors > 0) {
3685			ret = -ENOMEM;
3686			break;
3687		}
3688skip:
3689		key.offset = found_key.offset + length;
3690		btrfs_release_path(path);
3691	}
3692
3693	btrfs_free_path(path);
3694
3695	return ret;
3696}
3697
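/*
 * Scrub all super block mirrors of @scrub_dev that lie within the device's
 * committed size and wait for the resulting IO to finish.
 */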
3698static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3699					   struct btrfs_device *scrub_dev)
3700{
3701	int	i;
3702	u64	bytenr;
3703	u64	gen;
3704	int	ret;
3705	struct btrfs_fs_info *fs_info = sctx->fs_info;
3706
3707	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3708		return -EROFS;
3709
3710	/* Seed devices of a new filesystem have their own generation. */
3711	if (scrub_dev->fs_devices != fs_info->fs_devices)
3712		gen = scrub_dev->generation;
3713	else
3714		gen = fs_info->last_trans_committed;
3715
3716	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3717		bytenr = btrfs_sb_offset(i);
3718		if (bytenr + BTRFS_SUPER_INFO_SIZE >
3719		    scrub_dev->commit_total_bytes)
3720			break;
3721
3722		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3723				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3724				  NULL, 1, bytenr);
3725		if (ret)
3726			return ret;
3727	}
3728	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3729
3730	return 0;
3731}
3732
3733static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3734{
3735	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3736					&fs_info->scrub_lock)) {
3737		struct btrfs_workqueue *scrub_workers = NULL;
3738		struct btrfs_workqueue *scrub_wr_comp = NULL;
3739		struct btrfs_workqueue *scrub_parity = NULL;
3740
3741		scrub_workers = fs_info->scrub_workers;
3742		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3743		scrub_parity = fs_info->scrub_parity_workers;
3744
3745		fs_info->scrub_workers = NULL;
3746		fs_info->scrub_wr_completion_workers = NULL;
3747		fs_info->scrub_parity_workers = NULL;
3748		mutex_unlock(&fs_info->scrub_lock);
3749
3750		btrfs_destroy_workqueue(scrub_workers);
3751		btrfs_destroy_workqueue(scrub_wr_comp);
3752		btrfs_destroy_workqueue(scrub_parity);
3753	}
3754}
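/*
 * A note on the teardown above (not part of the original comments):
 * refcount_dec_and_mutex_lock() takes scrub_lock only when the count drops
 * to zero, so the last user detaches the workqueue pointers under the lock
 * but destroys the workqueues after dropping it, since
 * btrfs_destroy_workqueue() may block while draining pending work.
 */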
3755
3756/*
3757 * Get a reference count on fs_info->scrub_workers; start the workers if necessary.
3758 */
3759static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3760						int is_dev_replace)
3761{
3762	struct btrfs_workqueue *scrub_workers = NULL;
3763	struct btrfs_workqueue *scrub_wr_comp = NULL;
3764	struct btrfs_workqueue *scrub_parity = NULL;
3765	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3766	int max_active = fs_info->thread_pool_size;
3767	int ret = -ENOMEM;
3768
3769	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3770		return 0;
3771
3772	scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3773					      is_dev_replace ? 1 : max_active, 4);
3774	if (!scrub_workers)
3775		goto fail_scrub_workers;
3776
3777	scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3778					      max_active, 2);
3779	if (!scrub_wr_comp)
3780		goto fail_scrub_wr_completion_workers;
3781
3782	scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3783					     max_active, 2);
3784	if (!scrub_parity)
3785		goto fail_scrub_parity_workers;
3786
3787	mutex_lock(&fs_info->scrub_lock);
3788	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3789		ASSERT(fs_info->scrub_workers == NULL &&
3790		       fs_info->scrub_wr_completion_workers == NULL &&
3791		       fs_info->scrub_parity_workers == NULL);
3792		fs_info->scrub_workers = scrub_workers;
3793		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
3794		fs_info->scrub_parity_workers = scrub_parity;
3795		refcount_set(&fs_info->scrub_workers_refcnt, 1);
3796		mutex_unlock(&fs_info->scrub_lock);
3797		return 0;
3798	}
3799	/* Another thread raced in and created the workers for us. */
3800	refcount_inc(&fs_info->scrub_workers_refcnt);
3801	mutex_unlock(&fs_info->scrub_lock);
3802
3803	ret = 0;
3804	btrfs_destroy_workqueue(scrub_parity);
3805fail_scrub_parity_workers:
3806	btrfs_destroy_workqueue(scrub_wr_comp);
3807fail_scrub_wr_completion_workers:
3808	btrfs_destroy_workqueue(scrub_workers);
3809fail_scrub_workers:
3810	return ret;
3811}
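/*
 * Illustrative sketch, not part of this file: the publish-or-discard pattern
 * used by scrub_workers_get() above, in its general form.  Allocate the
 * expensive object without holding the lock, then either install it with a
 * refcount of 1 or, if another thread raced in first, take a reference on
 * the published object and free our own copy.  All my_* names below are
 * hypothetical placeholders.
 */
struct my_state {
	struct mutex lock;
	refcount_t refs;
	struct my_thing *thing;
};

static int my_get_thing(struct my_state *s)
{
	struct my_thing *t;

	/* Fast path: someone already published a fully set up object. */
	if (refcount_inc_not_zero(&s->refs))
		return 0;

	t = my_alloc_thing();		/* may sleep, so done without the lock */
	if (!t)
		return -ENOMEM;

	mutex_lock(&s->lock);
	if (refcount_read(&s->refs) == 0) {
		s->thing = t;		/* we won the race: publish it */
		refcount_set(&s->refs, 1);
		mutex_unlock(&s->lock);
		return 0;
	}
	refcount_inc(&s->refs);		/* somebody else won the race */
	mutex_unlock(&s->lock);
	my_free_thing(t);		/* discard our now-redundant copy */
	return 0;
}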
3812
3813int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3814		    u64 end, struct btrfs_scrub_progress *progress,
3815		    int readonly, int is_dev_replace)
3816{
3817	struct scrub_ctx *sctx;
3818	int ret;
3819	struct btrfs_device *dev;
3820	unsigned int nofs_flag;
3821	bool need_commit = false;
3822
3823	if (btrfs_fs_closing(fs_info))
3824		return -EAGAIN;
3825
3826	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3827		/*
3828		 * In this case scrub is unable to verify the checksums, given
3829		 * the way scrub is implemented. Do not handle this situation
3830		 * at all because it should never happen.
3831		 */
3832		btrfs_err(fs_info,
3833			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3834		       fs_info->nodesize,
3835		       BTRFS_STRIPE_LEN);
3836		return -EINVAL;
3837	}
3838
3839	if (fs_info->sectorsize != PAGE_SIZE) {
3840		/* not supported for data w/o checksums */
3841		btrfs_err_rl(fs_info,
3842			   "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
3843		       fs_info->sectorsize, PAGE_SIZE);
3844		return -EINVAL;
3845	}
3846
3847	if (fs_info->nodesize >
3848	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3849	    fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3850		/*
3851		 * This would exhaust the array bounds of the pagev member
3852		 * in struct scrub_block.
3853		 */
3854		btrfs_err(fs_info,
3855			  "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3856		       fs_info->nodesize,
3857		       SCRUB_MAX_PAGES_PER_BLOCK,
3858		       fs_info->sectorsize,
3859		       SCRUB_MAX_PAGES_PER_BLOCK);
3860		return -EINVAL;
3861	}
3862
3863	/* Allocate outside of device_list_mutex */
3864	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3865	if (IS_ERR(sctx))
3866		return PTR_ERR(sctx);
3867
3868	ret = scrub_workers_get(fs_info, is_dev_replace);
3869	if (ret)
3870		goto out_free_ctx;
3871
3872	mutex_lock(&fs_info->fs_devices->device_list_mutex);
3873	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
3874	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3875		     !is_dev_replace)) {
3876		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3877		ret = -ENODEV;
3878		goto out;
3879	}
3880
3881	if (!is_dev_replace && !readonly &&
3882	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3883		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3884		btrfs_err_in_rcu(fs_info,
3885			"scrub on devid %llu: filesystem on %s is not writable",
3886				 devid, rcu_str_deref(dev->name));
3887		ret = -EROFS;
3888		goto out;
3889	}
3890
3891	mutex_lock(&fs_info->scrub_lock);
3892	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3893	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3894		mutex_unlock(&fs_info->scrub_lock);
3895		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3896		ret = -EIO;
3897		goto out;
3898	}
3899
3900	down_read(&fs_info->dev_replace.rwsem);
3901	if (dev->scrub_ctx ||
3902	    (!is_dev_replace &&
3903	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3904		up_read(&fs_info->dev_replace.rwsem);
3905		mutex_unlock(&fs_info->scrub_lock);
3906		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3907		ret = -EINPROGRESS;
3908		goto out;
3909	}
3910	up_read(&fs_info->dev_replace.rwsem);
3911
3912	sctx->readonly = readonly;
3913	dev->scrub_ctx = sctx;
3914	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3915
3916	/*
3917	 * By checking @scrub_pause_req here, we can avoid a race between
3918	 * committing a transaction and scrubbing.
3919	 */
3920	__scrub_blocked_if_needed(fs_info);
3921	atomic_inc(&fs_info->scrubs_running);
3922	mutex_unlock(&fs_info->scrub_lock);
3923
3924	/*
3925	 * In order to avoid deadlock with reclaim when there is a transaction
3926	 * trying to pause scrub, make sure we use GFP_NOFS for all the
3927	 * allocations done at scrub_pages() and scrub_pages_for_parity()
3928	 * invoked by our callees. The pausing request is done when the
3929	 * transaction commit starts, and it blocks the transaction until scrub
3930	 * is paused (done at specific points in scrub_stripe() or right above,
3931	 * before incrementing fs_info->scrubs_running).
3932	 */
3933	nofs_flag = memalloc_nofs_save();
3934	if (!is_dev_replace) {
3935		u64 old_super_errors;
3936
3937		spin_lock(&sctx->stat_lock);
3938		old_super_errors = sctx->stat.super_errors;
3939		spin_unlock(&sctx->stat_lock);
3940
3941		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3942		/*
3943		 * By holding the device list mutex, we serialize against the
3944		 * super block writes kicked off by a log tree sync.
3945		 */
3946		mutex_lock(&fs_info->fs_devices->device_list_mutex);
3947		ret = scrub_supers(sctx, dev);
3948		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3949
3950		spin_lock(&sctx->stat_lock);
3951		/*
3952		 * Super block errors were found, but we cannot commit a transaction
3953		 * in the current context, since btrfs_commit_transaction() needs
3954		 * to pause the currently running scrub (which we hold ourselves).
3955		 */
3956		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
3957			need_commit = true;
3958		spin_unlock(&sctx->stat_lock);
3959	}
3960
3961	if (!ret)
3962		ret = scrub_enumerate_chunks(sctx, dev, start, end);
3963	memalloc_nofs_restore(nofs_flag);
3964
3965	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3966	atomic_dec(&fs_info->scrubs_running);
3967	wake_up(&fs_info->scrub_pause_wait);
3968
3969	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3970
3971	if (progress)
3972		memcpy(progress, &sctx->stat, sizeof(*progress));
3973
3974	if (!is_dev_replace)
3975		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
3976			ret ? "not finished" : "finished", devid, ret);
3977
3978	mutex_lock(&fs_info->scrub_lock);
3979	dev->scrub_ctx = NULL;
3980	mutex_unlock(&fs_info->scrub_lock);
3981
3982	scrub_workers_put(fs_info);
3983	scrub_put_ctx(sctx);
3984
3985	/*
3986	 * We found some super block errors earlier; now that scrub has
3987	 * finished, try to force a transaction commit.
3988	 */
3989	if (need_commit) {
3990		struct btrfs_trans_handle *trans;
3991
3992		trans = btrfs_start_transaction(fs_info->tree_root, 0);
3993		if (IS_ERR(trans)) {
3994			ret = PTR_ERR(trans);
3995			btrfs_err(fs_info,
3996	"scrub: failed to start transaction to fix super block errors: %d", ret);
3997			return ret;
3998		}
3999		ret = btrfs_commit_transaction(trans);
4000		if (ret < 0)
4001			btrfs_err(fs_info,
4002	"scrub: failed to commit transaction to fix super block errors: %d", ret);
4003	}
4004	return ret;
4005out:
4006	scrub_workers_put(fs_info);
4007out_free_ctx:
4008	scrub_free_ctx(sctx);
4009
4010	return ret;
4011}
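/*
 * Illustrative sketch, not part of this file: the scoped NOFS pattern used
 * by btrfs_scrub_dev() above.  Between memalloc_nofs_save() and
 * memalloc_nofs_restore(), every allocation made by this task implicitly
 * behaves as GFP_NOFS, so callees can keep passing GFP_KERNEL without
 * risking recursion into filesystem reclaim.  do_work_that_allocates() is a
 * hypothetical placeholder.
 */
static int nofs_scope_example(void)
{
	unsigned int nofs_flag;
	int ret;

	nofs_flag = memalloc_nofs_save();	/* enter the NOFS scope */
	ret = do_work_that_allocates();		/* allocations here won't recurse into the FS */
	memalloc_nofs_restore(nofs_flag);	/* leave the NOFS scope */

	return ret;
}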
4012
4013void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4014{
4015	mutex_lock(&fs_info->scrub_lock);
4016	atomic_inc(&fs_info->scrub_pause_req);
4017	while (atomic_read(&fs_info->scrubs_paused) !=
4018	       atomic_read(&fs_info->scrubs_running)) {
4019		mutex_unlock(&fs_info->scrub_lock);
4020		wait_event(fs_info->scrub_pause_wait,
4021			   atomic_read(&fs_info->scrubs_paused) ==
4022			   atomic_read(&fs_info->scrubs_running));
4023		mutex_lock(&fs_info->scrub_lock);
4024	}
4025	mutex_unlock(&fs_info->scrub_lock);
4026}
4027
4028void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4029{
4030	atomic_dec(&fs_info->scrub_pause_req);
4031	wake_up(&fs_info->scrub_pause_wait);
4032}
4033
4034int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4035{
4036	mutex_lock(&fs_info->scrub_lock);
4037	if (!atomic_read(&fs_info->scrubs_running)) {
4038		mutex_unlock(&fs_info->scrub_lock);
4039		return -ENOTCONN;
4040	}
4041
4042	atomic_inc(&fs_info->scrub_cancel_req);
4043	while (atomic_read(&fs_info->scrubs_running)) {
4044		mutex_unlock(&fs_info->scrub_lock);
4045		wait_event(fs_info->scrub_pause_wait,
4046			   atomic_read(&fs_info->scrubs_running) == 0);
4047		mutex_lock(&fs_info->scrub_lock);
4048	}
4049	atomic_dec(&fs_info->scrub_cancel_req);
4050	mutex_unlock(&fs_info->scrub_lock);
4051
4052	return 0;
4053}
4054
4055int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4056{
4057	struct btrfs_fs_info *fs_info = dev->fs_info;
4058	struct scrub_ctx *sctx;
4059
4060	mutex_lock(&fs_info->scrub_lock);
4061	sctx = dev->scrub_ctx;
4062	if (!sctx) {
4063		mutex_unlock(&fs_info->scrub_lock);
4064		return -ENOTCONN;
4065	}
4066	atomic_inc(&sctx->cancel_req);
4067	while (dev->scrub_ctx) {
4068		mutex_unlock(&fs_info->scrub_lock);
4069		wait_event(fs_info->scrub_pause_wait,
4070			   dev->scrub_ctx == NULL);
4071		mutex_lock(&fs_info->scrub_lock);
4072	}
4073	mutex_unlock(&fs_info->scrub_lock);
4074
4075	return 0;
4076}
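/*
 * Illustrative sketch, not part of this file: the unlock/wait/relock loop
 * shared by btrfs_scrub_pause(), btrfs_scrub_cancel() and
 * btrfs_scrub_cancel_dev() above.  wait_event() sleeps, so the mutex is
 * dropped before waiting and re-taken afterwards, and the condition is
 * rechecked under the lock.  The parameter names are hypothetical.
 */
static void wait_until_zero(struct mutex *lock, wait_queue_head_t *waitq,
			    atomic_t *counter)
{
	mutex_lock(lock);
	while (atomic_read(counter) != 0) {
		mutex_unlock(lock);		/* never sleep holding the mutex */
		wait_event(*waitq, atomic_read(counter) == 0);
		mutex_lock(lock);		/* retake the lock and recheck */
	}
	mutex_unlock(lock);
}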
4077
4078int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4079			 struct btrfs_scrub_progress *progress)
4080{
4081	struct btrfs_device *dev;
4082	struct scrub_ctx *sctx = NULL;
4083
4084	mutex_lock(&fs_info->fs_devices->device_list_mutex);
4085	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
4086	if (dev)
4087		sctx = dev->scrub_ctx;
4088	if (sctx)
4089		memcpy(progress, &sctx->stat, sizeof(*progress));
4090	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4091
4092	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4093}
4094
4095static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4096			       u64 extent_logical, u64 extent_len,
4097			       u64 *extent_physical,
4098			       struct btrfs_device **extent_dev,
4099			       int *extent_mirror_num)
4100{
4101	u64 mapped_length;
4102	struct btrfs_bio *bbio = NULL;
4103	int ret;
4104
4105	mapped_length = extent_len;
4106	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4107			      &mapped_length, &bbio, 0);
4108	if (ret || !bbio || mapped_length < extent_len ||
4109	    !bbio->stripes[0].dev->bdev) {
4110		btrfs_put_bbio(bbio);
4111		return;
4112	}
4113
4114	*extent_physical = bbio->stripes[0].physical;
4115	*extent_mirror_num = bbio->mirror_num;
4116	*extent_dev = bbio->stripes[0].dev;
4117	btrfs_put_bbio(bbio);
4118}
4119