// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS				11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec like structure to present a sector inside a page.
 *
 * Unlike bvec we don't need bv_len, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};
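
/*
 * Worked example (illustrative, not used by the code): with 4K pages and a
 * 4K sectorsize there is exactly one sector per page and pgoff is always 0.
 * With 64K pages and 4K sectors, each page holds 16 sectors and pgoff takes
 * the values 0, 4096, ..., 61440, which easily fits the 24-bit field.
 */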

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * Caching an rbio means copying everything from the bio_sectors
 * array into the stripe_pages array.  We use the sector uptodate
 * bits in stripe_sectors[] to indicate which sectors have valid
 * data.
 *
 * Once the caching is done, we set the cache ready bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}
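
/*
 * Illustrative example (assuming a 64K BTRFS_STRIPE_LEN): full stripe
 * starts are typically at least 64K aligned, so the low 16 bits of
 * full_stripe_logical are usually zero, e.g.:
 *
 *	num = 0x1240000;
 *	bucket = hash_64(num >> 16, 11);	// hash_64(0x124, 11)
 *
 * Feeding the unshifted value to hash_64() would leave mostly zero bits
 * in the input and cluster the results into very few buckets.
 */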

static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}
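
/*
 * Worked example (illustrative only): with 4K pages and 4K sectors the
 * mapping is the identity, sector i -> page i at pgoff 0.  With 64K pages
 * and 4K sectors, sector 17 has offset 17 * 4K = 0x11000, giving
 * page_index == 1 and pgoff == 0x1000.
 */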

static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				  struct btrfs_raid_bio,
				  stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
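
/*
 * Minimal usage sketch (hypothetical buffers, mirroring the RAID5 branch
 * of generate_pq_vertical() below).  run_xor() treats pages[src_cnt] as
 * the destination, so parity is seeded with the first data block and the
 * remaining blocks are XORed in:
 *
 *	void *ptrs[3] = { data0, data1, parity };
 *
 *	memcpy(parity, data0, len);
 *	run_xor(ptrs + 1, 1, len);	// parity ^= data1
 */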

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub has to read the full stripe from the drive,
	 * then check and repair the parity and write back the results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}
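
/*
 * Layout note with a worked example (illustrative only): stripe_sectors[]
 * is a flat array of real_stripes * stripe_nsectors entries, laid out
 * stripe by stripe.  With a 64K stripe and 4K sectors (stripe_nsectors ==
 * 16), sector 3 of stripe 2 lives at index 2 * 16 + 3 == 35.
 */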

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}
		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bios added
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * Allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}
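
/*
 * Sizing example (illustrative only): a 4-device RAID6 full stripe with
 * 4K pages and a 4K sectorsize gives real_stripes == 4 (2 data + P + Q),
 * stripe_npages == 16 and stripe_nsectors == 16, so the rbio tracks
 * 64 pages and 64 sectors covering the 256K full stripe.
 */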

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}
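
/*
 * Worked example (illustrative only): on a 4-stripe RAID6 rbio with error
 * bits set for stripes 1 and 3 of this vertical stripe, the function
 * returns 2 with *faila == 1 and *failb == 3.  RAID6 tolerates
 * bioc->max_errors == 2, so this vertical stripe is still recoverable;
 * a third error would exceed the tolerance.
 */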

/*
 * Add a single sector @sector into our list of bios for IO.
 *
 * Return 0 if everything went well.
 * Return <0 for error.
 */
static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list,
			      struct sector_ptr *sector,
			      unsigned int stripe_nr,
			      unsigned int sector_nr,
			      enum req_op op)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_io_stripe *stripe;
	u64 disk_start;

	/*
	 * Note: here stripe_nr has taken device replace into consideration,
	 * thus it can be larger than rbio->real_stripes.
	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
	 */
	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
	ASSERT(sector->page);

	stripe = &rbio->bioc->stripes[stripe_nr];
	disk_start = stripe->physical + sector_nr * sectorsize;

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev) {
		int found_errors;

		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
			rbio->error_bitmap);

		/* Check if we have reached tolerance early. */
		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 NULL, NULL);
		if (found_errors > rbio->bioc->max_errors)
			return -EIO;
		return 0;
	}

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, sector->page, sectorsize,
					   sector->pgoff);
			if (ret == sectorsize)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = bio_alloc(stripe->dev->bdev,
			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
			op, GFP_NOFS);
	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
	bio->bi_private = rbio;

	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
	bio_list_add(bio_list, bio);
	return 0;
}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec bvec;
	struct bvec_iter iter;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;

	bio_for_each_segment(bvec, bio, iter) {
		u32 bvec_offset;

		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
		     bvec_offset += sectorsize, offset += sectorsize) {
			int index = offset / sectorsize;
			struct sector_ptr *sector = &rbio->bio_sectors[index];

			sector->page = bvec.bv_page;
			sector->pgoff = bvec.bv_offset + bvec_offset;
			ASSERT(sector->pgoff < PAGE_SIZE);
		}
	}
}
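
/*
 * Worked example (illustrative only): a bio starting 8K into the full
 * stripe with a 4K sectorsize has offset == 0x2000 for its first sector,
 * so that sector is recorded in bio_sectors[2], the next one in
 * bio_sectors[3], and so on across bvec and page boundaries.
 */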

/*
 * Helper function to walk our bio list and populate the bio_sectors array
 * with the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we set up the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from sector_in_rbio().
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;

	spin_lock(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list)
		index_one_bio(rbio, bio);

	spin_unlock(&rbio->bio_list_lock);
}

static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
			       struct raid56_bio_trace_info *trace_info)
{
	const struct btrfs_io_context *bioc = rbio->bioc;
	int i;

	ASSERT(bioc);

	/* We rely on bio->bi_bdev to find the stripe number. */
	if (!bio->bi_bdev)
		goto not_found;

	for (i = 0; i < bioc->num_stripes; i++) {
		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
			continue;
		trace_info->stripe_nr = i;
		trace_info->devid = bioc->stripes[i].dev->devid;
		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
				     bioc->stripes[i].physical;
		return;
	}

not_found:
	trace_info->devid = -1;
	trace_info->offset = -1;
	trace_info->stripe_nr = -1;
}

static inline void bio_list_put(struct bio_list *bio_list)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bio_list)))
		bio_put(bio);
}

/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
	void **pointers = rbio->finish_pointers;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct sector_ptr *sector;
	int stripe;
	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;

	/* First collect one sector from each data stripe */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
		pointers[stripe] = kmap_local_page(sector->page) +
				   sector->pgoff;
	}

	/* Then add the parity stripe */
	sector = rbio_pstripe_sector(rbio, sectornr);
	sector->uptodate = 1;
	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;

	if (has_qstripe) {
		/*
		 * RAID6, add the qstripe and call the library function
		 * to fill in our p/q
		 */
		sector = rbio_qstripe_sector(rbio, sectornr);
		sector->uptodate = 1;
		pointers[stripe++] = kmap_local_page(sector->page) +
				     sector->pgoff;

		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
					pointers);
	} else {
		/* raid5 */
		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
	}
	for (stripe = stripe - 1; stripe >= 0; stripe--)
		kunmap_local(pointers[stripe]);
}
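
/*
 * Math note (illustrative only): for RAID5 the memcpy() + run_xor() pair
 * above computes the plain parity P = D0 ^ D1 ^ ... ^ Dn-1.  For RAID6,
 * raid6_call.gen_syndrome() also fills Q = g^0*D0 ^ g^1*D1 ^ ... over
 * GF(2^8), which is what later allows solving for any two missing blocks
 * of the vertical stripe.
 */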

static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
				   struct bio_list *bio_list)
{
	/* The total sector number inside the full stripe. */
	int total_sector_nr;
	int sectornr;
	int stripe;
	int ret;

	ASSERT(bio_list_size(bio_list) == 0);

	/* We should have at least one data sector. */
	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));

	/*
	 * Reset errors, as we may have errors inherited from a degraded
	 * write.
	 */
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/*
	 * Start assembly.  Make bios for everything from the higher layers (the
	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	if (likely(!rbio->bioc->replace_nr_stripes))
		return 0;

	/*
	 * Make a copy for the replace target device.
	 *
	 * Thus the source stripe number (in replace_stripe_src) should be valid.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;

		stripe = total_sector_nr / rbio->stripe_nsectors;
		sectornr = total_sector_nr % rbio->stripe_nsectors;

		/*
		 * For RAID56, there is only one device that can be replaced,
		 * and replace_stripe_src indicates the stripe number we
		 * need to copy from.
		 */
		if (stripe != rbio->bioc->replace_stripe_src) {
			/*
			 * We can skip the whole stripe completely, note
			 * total_sector_nr will be increased by one anyway.
			 */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}

		/* This vertical stripe has no data, skip it. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		if (stripe < rbio->nr_data) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
			if (!sector)
				continue;
		} else {
			sector = rbio_stripe_sector(rbio, stripe, sectornr);
		}

		ret = rbio_add_io_sector(rbio, bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto error;
	}

	return 0;
error:
	bio_list_put(bio_list);
	return -EIO;
}

static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
		     rbio->bioc->full_stripe_logical;
	int total_nr_sector = offset >> fs_info->sectorsize_bits;

	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);

	bitmap_set(rbio->error_bitmap, total_nr_sector,
		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);

	/*
	 * Special handling for raid56_alloc_missing_rbio() used by
	 * scrub/replace.  Unlike call path in raid56_parity_recover(), they
	 * pass an empty bio here.  Thus we have to find out the missing device
	 * and mark the stripe error instead.
	 */
	if (bio->bi_iter.bi_size == 0) {
		bool found_missing = false;
		int stripe_nr;

		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
				found_missing = true;
				bitmap_set(rbio->error_bitmap,
					   stripe_nr * rbio->stripe_nsectors,
					   rbio->stripe_nsectors);
			}
		}
		ASSERT(found_missing);
	}
}

/*
 * For subpage case, we can no longer set page Up-to-date directly for
 * stripe_pages[], thus we need to locate the sector.
 */
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
					     struct page *page,
					     unsigned int pgoff)
{
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		if (sector->page == page && sector->pgoff == pgoff)
			return sector;
	}
	return NULL;
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct sector_ptr *sector;
		int pgoff;

		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
		     pgoff += sectorsize) {
			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
			ASSERT(sector);
			if (sector)
				sector->uptodate = 1;
		}
	}
}

static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);
	int i;

	for (i = 0; i < rbio->nr_sectors; i++) {
		struct sector_ptr *sector;

		sector = &rbio->stripe_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
		sector = &rbio->bio_sectors[i];
		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
			break;
	}
	ASSERT(i < rbio->nr_sectors);
	return i;
}

static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	u32 bio_size = 0;
	struct bio_vec *bvec;
	int i;

	bio_for_each_bvec_all(bvec, bio, i)
		bio_size += bvec->bv_len;

	/*
	 * Since we can have multiple bios touching the error_bitmap, we cannot
	 * call bitmap_set() without protection.
	 *
	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
	 */
	for (i = total_sector_nr; i < total_sector_nr +
	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
		set_bit(i, rbio->error_bitmap);
}

/* Verify the data sectors at read time. */
static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
				    struct bio *bio)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	int total_sector_nr = get_bio_sector_nr(rbio, bio);
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	/* No data csum for the whole stripe, no need to verify. */
	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return;

	/* P/Q stripes, they have no data csum to verify against. */
	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
		return;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		int bv_offset;

		for (bv_offset = bvec->bv_offset;
		     bv_offset < bvec->bv_offset + bvec->bv_len;
		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
			u8 csum_buf[BTRFS_CSUM_SIZE];
			u8 *expected_csum = rbio->csum_buf +
					    total_sector_nr * fs_info->csum_size;
			int ret;

			/* No csum for this sector, skip to the next sector. */
			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
				continue;

			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
				bv_offset, csum_buf, expected_csum);
			if (ret < 0)
				set_bit(total_sector_nr, rbio->error_bitmap);
		}
	}
}

static void raid_wait_read_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status) {
		rbio_update_error_bitmap(rbio, bio);
	} else {
		set_bio_pages_uptodate(rbio, bio);
		verify_bio_data_sectors(rbio, bio);
	}

	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
			     struct bio_list *bio_list)
{
	struct bio *bio;

	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_read_end_io;

		if (trace_raid56_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
}

static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios.
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct work_struct work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;
			}
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
	const u32 orig_len = orig_bio->bi_iter.bi_size;
	const u32 sectorsize = fs_info->sectorsize;
	u64 cur_logical;

	ASSERT(orig_logical >= full_stripe_start &&
	       orig_logical + orig_len <= full_stripe_start +
	       rbio->nr_data * BTRFS_STRIPE_LEN);

	bio_list_add(&rbio->bio_list, orig_bio);
	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;

	/* Update the dbitmap. */
	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
	     cur_logical += sectorsize) {
		int bit = ((u32)(cur_logical - full_stripe_start) >>
			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;

		set_bit(bit, &rbio->dbitmap);
	}
}
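
/*
 * Worked example (illustrative only): with a 4K sectorsize, 16 sectors
 * per stripe and full_stripe_start == 0x1240000, a 12K bio at logical
 * 0x1250000 starts 16 sectors into the full stripe, so bits 16 % 16 == 0,
 * then 1 and 2 of dbitmap get set: the second data stripe folds back onto
 * the same vertical-stripe bits as the first.
 */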

/*
 * our main entry point for writes from the rest of the FS.
 */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		bio_endio(bio);
		return;
	}
	rbio->operation = BTRFS_RBIO_WRITE;
	rbio_add_bio(rbio, bio);

	/*
	 * Don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (!rbio_is_full(rbio)) {
		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
		if (cb) {
			plug = container_of(cb, struct btrfs_plug_cb, cb);
			if (!plug->info) {
				plug->info = fs_info;
				INIT_LIST_HEAD(&plug->rbio_list);
			}
			list_add_tail(&rbio->plug_list, &plug->rbio_list);
			return;
		}
	}

	/*
	 * Either we don't have any existing plug, or we're doing a full stripe,
	 * queue the rmw work now.
	 */
	start_async_work(rbio, rmw_rbio_work);
}

static int verify_one_sector(struct btrfs_raid_bio *rbio,
			     int stripe_nr, int sector_nr)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	u8 csum_buf[BTRFS_CSUM_SIZE];
	u8 *csum_expected;
	int ret;

	if (!rbio->csum_bitmap || !rbio->csum_buf)
		return 0;

	/* No way to verify P/Q as they are not covered by data csum. */
	if (stripe_nr >= rbio->nr_data)
		return 0;
	/*
	 * If we're rebuilding a read, we have to use pages from the
	 * bio list if possible.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
	} else {
		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
	}

	ASSERT(sector->page);

	csum_expected = rbio->csum_buf +
			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
			fs_info->csum_size;
	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
				      csum_buf, csum_expected);
	return ret;
}

/*
 * Recover a vertical stripe specified by @sector_nr.
 * @*pointers are the pre-allocated pointers by the caller, so we don't
 * need to allocate/free the pointers again and again.
 */
static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
			    void **pointers, void **unmap_array)
{
	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
	struct sector_ptr *sector;
	const u32 sectorsize = fs_info->sectorsize;
	int found_errors;
	int faila;
	int failb;
	int stripe_nr;
	int ret = 0;

	/*
	 * Now we just use bitmap to mark the horizontal stripes in
	 * which we have data when doing parity scrub.
	 */
	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
	    !test_bit(sector_nr, &rbio->dbitmap))
		return 0;

	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
						 &failb);
	/*
	 * No errors in the vertical stripe, skip it.  This can happen for
	 * recovery where only part of a stripe failed the csum check.
	 */
	if (!found_errors)
		return 0;

	if (found_errors > rbio->bioc->max_errors)
		return -EIO;

	/*
	 * Setup our array of pointers with sectors from each stripe
	 *
	 * NOTE: store a duplicate array of pointers to preserve the
	 * pointer order.
	 */
	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		/*
		 * If we're rebuilding a read, we have to use pages from the
		 * bio list if possible.
		 */
		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
		} else {
			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
		}
		ASSERT(sector->page);
		pointers[stripe_nr] = kmap_local_page(sector->page) +
				   sector->pgoff;
		unmap_array[stripe_nr] = pointers[stripe_nr];
	}

	/* All raid6 handling here */
	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
		/* Single failure, rebuild from parity raid5 style */
		if (failb < 0) {
			if (faila == rbio->nr_data)
				/*
				 * Just the P stripe has failed, without
				 * a bad data or Q stripe.
				 * We have nothing to do, just skip the
				 * recovery for this stripe.
				 */
				goto cleanup;
			/*
			 * a single failure in raid6 is rebuilt
			 * in the pstripe code below
			 */
			goto pstripe;
		}

		/*
		 * If the q stripe is failed, do a pstripe reconstruction from
		 * the xors.
		 * If both the q stripe and the P stripe are failed, we're
		 * here due to a crc mismatch and we can't give them the
		 * data they want.
		 */
		if (failb == rbio->real_stripes - 1) {
			if (faila == rbio->real_stripes - 2)
				/*
				 * Only P and Q are corrupted.
				 * We only care about data stripes recovery,
				 * can skip this vertical stripe.
				 */
				goto cleanup;
			/*
			 * Otherwise we have one bad data stripe and
			 * a good P stripe.  raid5!
			 */
			goto pstripe;
		}

		if (failb == rbio->real_stripes - 2) {
			raid6_datap_recov(rbio->real_stripes, sectorsize,
					  faila, pointers);
		} else {
			raid6_2data_recov(rbio->real_stripes, sectorsize,
					  faila, failb, pointers);
		}
	} else {
		void *p;

		/* Rebuild from P stripe here (raid5 or raid6). */
		ASSERT(failb == -1);
pstripe:
		/* Copy parity block into failed block to start with */
		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);

		/* Rearrange the pointer array */
		p = pointers[faila];
		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
		     stripe_nr++)
			pointers[stripe_nr] = pointers[stripe_nr + 1];
		pointers[rbio->nr_data - 1] = p;

		/* Xor in the rest */
		run_xor(pointers, rbio->nr_data - 1, sectorsize);
	}

	/*
	 * No matter if this is a RMW or recovery, we should have all
	 * failed sectors repaired in the vertical stripe, thus they are now
	 * uptodate.
	 * Especially if we determine to cache the rbio, we need to
	 * have at least all data sectors uptodate.
	 *
	 * If possible, also check if the repaired sector matches its data
	 * checksum.
	 */
	if (faila >= 0) {
		ret = verify_one_sector(rbio, faila, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, faila, sector_nr);
		sector->uptodate = 1;
	}
	if (failb >= 0) {
		ret = verify_one_sector(rbio, failb, sector_nr);
		if (ret < 0)
			goto cleanup;

		sector = rbio_stripe_sector(rbio, failb, sector_nr);
		sector->uptodate = 1;
	}

cleanup:
	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
		kunmap_local(unmap_array[stripe_nr]);
	return ret;
}
1873
1874static int recover_sectors(struct btrfs_raid_bio *rbio)
1875{
1876	void **pointers = NULL;
1877	void **unmap_array = NULL;
1878	int sectornr;
1879	int ret = 0;
1880
1881	/*
1882	 * @pointers array stores the pointer for each sector.
1883	 *
1884	 * @unmap_array stores copy of pointers that does not get reordered
1885	 * during reconstruction so that kunmap_local works.
1886	 */
1887	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1888	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1889	if (!pointers || !unmap_array) {
1890		ret = -ENOMEM;
1891		goto out;
1892	}
1893
1894	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1895		spin_lock(&rbio->bio_list_lock);
1896		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1897		spin_unlock(&rbio->bio_list_lock);
1898	}
1899
1900	index_rbio_pages(rbio);
1901
1902	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1903		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
1904		if (ret < 0)
1905			break;
1906	}
1907
1908out:
1909	kfree(pointers);
1910	kfree(unmap_array);
1911	return ret;
1912}
1913
1914static void recover_rbio(struct btrfs_raid_bio *rbio)
1915{
1916	struct bio_list bio_list = BIO_EMPTY_LIST;
1917	int total_sector_nr;
1918	int ret = 0;
1919
1920	/*
1921	 * Whether we're doing recovery for a read failure or a degraded
1922	 * write, the caller should have set the error bitmap correctly.
1923	 */
1924	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
1925
1926	/* For recovery, we need to read all sectors including P/Q. */
1927	ret = alloc_rbio_pages(rbio);
1928	if (ret < 0)
1929		goto out;
1930
1931	index_rbio_pages(rbio);
1932
1933	/*
1934	 * Read everything that hasn't failed. However, this time we will
1935	 * not trust any cached sector: it may contain stale data that the
1936	 * higher layer is not reading, so the staleness would have gone
1937	 * unnoticed.
1938	 *
1939	 * So here we always re-read everything in the recovery path.
1940	 */
1941	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1942	     total_sector_nr++) {
1943		int stripe = total_sector_nr / rbio->stripe_nsectors;
1944		int sectornr = total_sector_nr % rbio->stripe_nsectors;
1945		struct sector_ptr *sector;
1946
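		/*
		 * total_sector_nr walks the sectors stripe by stripe; e.g.
		 * with stripe_nsectors == 16, sector 35 maps to stripe 2,
		 * sectornr 3.
		 */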
1947		/*
1948		 * Skip the range which has an error.  It can be a range which is
1949		 * marked error (for csum mismatch), or it can be a missing
1950		 * device.
1951		 */
1952		if (!rbio->bioc->stripes[stripe].dev->bdev ||
1953		    test_bit(total_sector_nr, rbio->error_bitmap)) {
1954			/*
1955			 * Also set the error bit for missing device, which
1956			 * may not yet have its error bit set.
1957			 */
1958			set_bit(total_sector_nr, rbio->error_bitmap);
1959			continue;
1960		}
1961
1962		sector = rbio_stripe_sector(rbio, stripe, sectornr);
1963		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1964					 sectornr, REQ_OP_READ);
1965		if (ret < 0) {
1966			bio_list_put(&bio_list);
1967			goto out;
1968		}
1969	}
1970
1971	submit_read_wait_bio_list(rbio, &bio_list);
1972	ret = recover_sectors(rbio);
1973out:
1974	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
1975}
1976
1977static void recover_rbio_work(struct work_struct *work)
1978{
1979	struct btrfs_raid_bio *rbio;
1980
1981	rbio = container_of(work, struct btrfs_raid_bio, work);
1982	if (!lock_stripe_add(rbio))
1983		recover_rbio(rbio);
1984}
1985
1986static void recover_rbio_work_locked(struct work_struct *work)
1987{
1988	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
1989}
1990
1991static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
1992{
1993	bool found = false;
1994	int sector_nr;
1995
1996	/*
1997	 * This is for RAID6 extra recovery tries, thus the mirror number
1998	 * should be larger than 2.
1999	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2000	 * RAID5 methods.
2001	 */
2002	ASSERT(mirror_num > 2);
2003	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2004		int found_errors;
2005		int faila;
2006		int failb;
2007
2008		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2009							 &faila, &failb);
2010		/* This vertical stripe doesn't have errors. */
2011		if (!found_errors)
2012			continue;
2013
2014		/*
2015		 * If we found errors, there should be only one error marked
2016		 * by previous set_rbio_range_error().
2017		 */
2018		ASSERT(found_errors == 1);
2019		found = true;
2020
2021		/* Now select another stripe to mark as error. */
2022		failb = rbio->real_stripes - (mirror_num - 1);
2023		if (failb <= faila)
2024			failb--;
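		/*
		 * E.g. with real_stripes == 4 (2 data + P + Q),
		 * mirror_num == 3 fails stripe 2 (the P stripe).  The
		 * decrement above accounts for @faila already occupying
		 * one of the candidate slots, so every retry picks a
		 * distinct extra stripe.
		 */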
2025
2026		/* Set the extra bit in error bitmap. */
2027		if (failb >= 0)
2028			set_bit(failb * rbio->stripe_nsectors + sector_nr,
2029				rbio->error_bitmap);
2030	}
2031
2032	/* We should have found at least one vertical stripe with errors. */
2033	ASSERT(found);
2034}
2035
2036/*
2037 * The main entry point for reads from the higher layers.  This
2038 * is really only called when the normal read path had a failure,
2039 * so we assume the bio they send down corresponds to a failed part
2040 * of the drive.
2041 */
2042void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2043			   int mirror_num)
2044{
2045	struct btrfs_fs_info *fs_info = bioc->fs_info;
2046	struct btrfs_raid_bio *rbio;
2047
2048	rbio = alloc_rbio(fs_info, bioc);
2049	if (IS_ERR(rbio)) {
2050		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2051		bio_endio(bio);
2052		return;
2053	}
2054
2055	rbio->operation = BTRFS_RBIO_READ_REBUILD;
2056	rbio_add_bio(rbio, bio);
2057
2058	set_rbio_range_error(rbio, bio);
2059
2060	/*
2061	 * Loop retry:
2062	 * for 'mirror_num == 2', reconstruct from all other stripes.
2063	 * for 'mirror_num > 2', select a stripe to fail on every retry.
2064	 */
2065	if (mirror_num > 2)
2066		set_rbio_raid6_extra_error(rbio, mirror_num);
2067
2068	start_async_work(rbio, recover_rbio_work);
2069}
2070
2071static void fill_data_csums(struct btrfs_raid_bio *rbio)
2072{
2073	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2074	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2075						       rbio->bioc->full_stripe_logical);
2076	const u64 start = rbio->bioc->full_stripe_logical;
2077	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2078			fs_info->sectorsize_bits;
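	/*
	 * E.g. with two data stripes of 64K and a 4K sector size,
	 * stripe_nsectors is 16 and @len is 2 * 16 * 4K == 128K, i.e.
	 * exactly the data portion of the full stripe.
	 */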
2079	int ret;
2080
2081	/* The rbio should not have its csum buffer initialized. */
2082	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2083
2084	/*
2085	 * Skip the csum search if:
2086	 *
2087	 * - The rbio doesn't belong to data block groups
2088	 *   Then we are doing IO for tree blocks, no need to search csums.
2089	 *
2090	 * - The rbio belongs to mixed block groups
2091	 *   This is to avoid a deadlock: we're already holding the full
2092	 *   stripe lock, so if we trigger a metadata read that needs to do
2093	 *   raid56 recovery, we will deadlock.
2094	 */
2095	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2096	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2097		return;
2098
2099	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2100				 fs_info->csum_size, GFP_NOFS);
2101	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2102					  GFP_NOFS);
2103	if (!rbio->csum_buf || !rbio->csum_bitmap) {
2104		ret = -ENOMEM;
2105		goto error;
2106	}
2107
2108	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2109					rbio->csum_buf, rbio->csum_bitmap);
2110	if (ret < 0)
2111		goto error;
2112	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2113		goto no_csum;
2114	return;
2115
2116error:
2117	/*
2118	 * We failed to allocate memory or grab the csums, but it's not fatal
2119	 * as we can still continue.  It's better to warn users that RMW is no
2120	 * longer safe for this particular sub-stripe write.
2121	 */
2122	btrfs_warn_rl(fs_info,
2123"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2124			rbio->bioc->full_stripe_logical, ret);
2125no_csum:
2126	kfree(rbio->csum_buf);
2127	bitmap_free(rbio->csum_bitmap);
2128	rbio->csum_buf = NULL;
2129	rbio->csum_bitmap = NULL;
2130}
2131
2132static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2133{
2134	struct bio_list bio_list = BIO_EMPTY_LIST;
2135	int total_sector_nr;
2136	int ret = 0;
2137
2138	/*
2139	 * Fill the data csums we need for data verification.  We need to fill
2140	 * the csum_bitmap/csum_buf first, as our endio function will try to
2141	 * verify the data sectors.
2142	 */
2143	fill_data_csums(rbio);
2144
2145	/*
2146	 * Build a list of bios to read all sectors (including data and P/Q).
2147	 *
2148	 * This behavior is to compensate for the later csum verification and recovery.
2149	 */
2150	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2151	     total_sector_nr++) {
2152		struct sector_ptr *sector;
2153		int stripe = total_sector_nr / rbio->stripe_nsectors;
2154		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2155
2156		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2157		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2158			       stripe, sectornr, REQ_OP_READ);
2159		if (ret) {
2160			bio_list_put(&bio_list);
2161			return ret;
2162		}
2163	}
2164
2165	/*
2166	 * We may or may not have any corrupted sectors (including missing devs
2167	 * and csum mismatches); just let recover_sectors() handle them all.
2168	 */
2169	submit_read_wait_bio_list(rbio, &bio_list);
2170	return recover_sectors(rbio);
2171}
2172
2173static void raid_wait_write_end_io(struct bio *bio)
2174{
2175	struct btrfs_raid_bio *rbio = bio->bi_private;
2176	blk_status_t err = bio->bi_status;
2177
2178	if (err)
2179		rbio_update_error_bitmap(rbio, bio);
2180	bio_put(bio);
2181	if (atomic_dec_and_test(&rbio->stripes_pending))
2182		wake_up(&rbio->io_wait);
2183}
2184
2185static void submit_write_bios(struct btrfs_raid_bio *rbio,
2186			      struct bio_list *bio_list)
2187{
2188	struct bio *bio;
2189
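	/*
	 * Take the full count before submitting anything: if a bio
	 * completed while others were still being submitted, an
	 * incrementally maintained count could hit zero early and wake
	 * the waiter too soon.
	 */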
2190	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2191	while ((bio = bio_list_pop(bio_list))) {
2192		bio->bi_end_io = raid_wait_write_end_io;
2193
2194		if (trace_raid56_write_enabled()) {
2195			struct raid56_bio_trace_info trace_info = { 0 };
2196
2197			bio_get_trace_info(rbio, bio, &trace_info);
2198			trace_raid56_write(rbio, bio, &trace_info);
2199		}
2200		submit_bio(bio);
2201	}
2202}
2203
2204/*
2205 * Determine if we need to read any sector from the disk.
2206 * Should only be used in the RMW path, to skip a cached rbio.
2207 */
2208static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2209{
2210	int i;
2211
2212	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2213		struct sector_ptr *sector = &rbio->stripe_sectors[i];
2214
2215		/*
2216		 * We have a sector which doesn't have a page nor is uptodate,
2217		 * thus this rbio can not be a cached one, as a cached one must
2218		 * have all its data sectors present and uptodate.
2219		 */
2220		if (!sector->page || !sector->uptodate)
2221			return true;
2222	}
2223	return false;
2224}
2225
2226static void rmw_rbio(struct btrfs_raid_bio *rbio)
2227{
2228	struct bio_list bio_list;
2229	int sectornr;
2230	int ret = 0;
2231
2232	/*
2233	 * Allocate the pages for parity first, as P/Q pages will always be
2234	 * needed for both full-stripe and sub-stripe writes.
2235	 */
2236	ret = alloc_rbio_parity_pages(rbio);
2237	if (ret < 0)
2238		goto out;
2239
2240	/*
2241	 * For a full stripe write, or when we have every data sector already
2242	 * cached, we can go to the write path immediately.
2243	 */
2244	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2245		/*
2246		 * Now we're doing a sub-stripe write, so we also need all data
2247		 * stripes to do the full RMW.
2248		 */
2249		ret = alloc_rbio_data_pages(rbio);
2250		if (ret < 0)
2251			goto out;
2252
2253		index_rbio_pages(rbio);
2254
2255		ret = rmw_read_wait_recover(rbio);
2256		if (ret < 0)
2257			goto out;
2258	}
2259
2260	/*
2261	 * At this stage we're not allowed to add any new bios to the
2262	 * bio list any more, anyone else that wants to change this stripe
2263	 * needs to do their own rmw.
2264	 */
2265	spin_lock(&rbio->bio_list_lock);
2266	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2267	spin_unlock(&rbio->bio_list_lock);
2268
2269	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2270
2271	index_rbio_pages(rbio);
2272
2273	/*
2274	 * We don't cache full rbios because we're assuming
2275	 * the higher layers are unlikely to use this area of
2276	 * the disk again soon.  If they do use it again,
2277	 * hopefully they will send another full bio.
2278	 */
2279	if (!rbio_is_full(rbio))
2280		cache_rbio_pages(rbio);
2281	else
2282		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2283
2284	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2285		generate_pq_vertical(rbio, sectornr);
2286
2287	bio_list_init(&bio_list);
2288	ret = rmw_assemble_write_bios(rbio, &bio_list);
2289	if (ret < 0)
2290		goto out;
2291
2292	/* We should have at least one bio assembled. */
2293	ASSERT(bio_list_size(&bio_list));
2294	submit_write_bios(rbio, &bio_list);
2295	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2296
2297	/* We may have more errors than our tolerance during the writes. */
2298	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2299		int found_errors;
2300
2301		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2302		if (found_errors > rbio->bioc->max_errors) {
2303			ret = -EIO;
2304			break;
2305		}
2306	}
2307out:
2308	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2309}
2310
2311static void rmw_rbio_work(struct work_struct *work)
2312{
2313	struct btrfs_raid_bio *rbio;
2314
2315	rbio = container_of(work, struct btrfs_raid_bio, work);
2316	if (lock_stripe_add(rbio) == 0)
2317		rmw_rbio(rbio);
2318}
2319
2320static void rmw_rbio_work_locked(struct work_struct *work)
2321{
2322	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2323}
2324
2325/*
2326 * The following code is used to scrub/replace the parity stripe
2327 *
2328 * Caller must have already increased bio_counter for getting @bioc.
2329 *
2330 * Note: We need to make sure all the pages added into the scrub/replace
2331 * raid bio are correct and do not change during the scrub/replace. That
2332 * is, those pages hold only metadata or file data with checksums.
2333 */
2334
2335struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2336				struct btrfs_io_context *bioc,
2337				struct btrfs_device *scrub_dev,
2338				unsigned long *dbitmap, int stripe_nsectors)
2339{
2340	struct btrfs_fs_info *fs_info = bioc->fs_info;
2341	struct btrfs_raid_bio *rbio;
2342	int i;
2343
2344	rbio = alloc_rbio(fs_info, bioc);
2345	if (IS_ERR(rbio))
2346		return NULL;
2347	bio_list_add(&rbio->bio_list, bio);
2348	/*
2349	 * This is a special bio which is used to hold the completion handler
2350	 * and make the scrub rbio similar to the other types.
2351	 */
2352	ASSERT(!bio->bi_iter.bi_size);
2353	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2354
2355	/*
2356	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2357	 * to the end position, so this search can start from the first parity
2358	 * stripe.
2359	 */
2360	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2361		if (bioc->stripes[i].dev == scrub_dev) {
2362			rbio->scrubp = i;
2363			break;
2364		}
2365	}
2366	ASSERT(i < rbio->real_stripes);
2367
2368	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2369	return rbio;
2370}
2371
2372/*
2373 * We only scrub the parity for which we have correct data on the same
2374 * horizontal, so we needn't allocate all pages for all the stripes.
2375 */
2376static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2377{
2378	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2379	int total_sector_nr;
2380
2381	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2382	     total_sector_nr++) {
2383		struct page *page;
2384		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2385		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
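		/*
		 * E.g. with 4K sectors and 4K pages, @index equals
		 * total_sector_nr; with 64K pages, sixteen consecutive
		 * sectors share one stripe page.
		 */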
2386
2387		if (!test_bit(sectornr, &rbio->dbitmap))
2388			continue;
2389		if (rbio->stripe_pages[index])
2390			continue;
2391		page = alloc_page(GFP_NOFS);
2392		if (!page)
2393			return -ENOMEM;
2394		rbio->stripe_pages[index] = page;
2395	}
2396	index_stripe_sectors(rbio);
2397	return 0;
2398}
2399
2400static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2401{
2402	struct btrfs_io_context *bioc = rbio->bioc;
2403	const u32 sectorsize = bioc->fs_info->sectorsize;
2404	void **pointers = rbio->finish_pointers;
2405	unsigned long *pbitmap = &rbio->finish_pbitmap;
2406	int nr_data = rbio->nr_data;
2407	int stripe;
2408	int sectornr;
2409	bool has_qstripe;
2410	struct sector_ptr p_sector = { 0 };
2411	struct sector_ptr q_sector = { 0 };
2412	struct bio_list bio_list;
2413	int is_replace = 0;
2414	int ret;
2415
2416	bio_list_init(&bio_list);
2417
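	/*
	 * RAID5 carries a single parity stripe (P), RAID6 carries two
	 * (P + Q); any other spread between real_stripes and nr_data
	 * is a logic error.
	 */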
2418	if (rbio->real_stripes - rbio->nr_data == 1)
2419		has_qstripe = false;
2420	else if (rbio->real_stripes - rbio->nr_data == 2)
2421		has_qstripe = true;
2422	else
2423		BUG();
2424
2425	/*
2426	 * If replace is running and our P/Q stripe is being replaced, then we
2427	 * need to duplicate the final write to the replace target.
2428	 */
2429	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2430		is_replace = 1;
2431		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2432	}
2433
2434	/*
2435	 * The higher layers (scrubber) are unlikely to
2436	 * use this area of the disk again soon, so don't
2437	 * cache it.
2438	 */
2439	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2440
2441	p_sector.page = alloc_page(GFP_NOFS);
2442	if (!p_sector.page)
2443		return -ENOMEM;
2444	p_sector.pgoff = 0;
2445	p_sector.uptodate = 1;
2446
2447	if (has_qstripe) {
2448		/* RAID6, allocate and map temp space for the Q stripe */
2449		q_sector.page = alloc_page(GFP_NOFS);
2450		if (!q_sector.page) {
2451			__free_page(p_sector.page);
2452			p_sector.page = NULL;
2453			return -ENOMEM;
2454		}
2455		q_sector.pgoff = 0;
2456		q_sector.uptodate = 1;
2457		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2458	}
2459
2460	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2461
2462	/* Map the parity stripe just once */
2463	pointers[nr_data] = kmap_local_page(p_sector.page);
2464
2465	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2466		struct sector_ptr *sector;
2467		void *parity;
2468
2469		/* first collect one page from each data stripe */
2470		for (stripe = 0; stripe < nr_data; stripe++) {
2471			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2472			pointers[stripe] = kmap_local_page(sector->page) +
2473					   sector->pgoff;
2474		}
2475
2476		if (has_qstripe) {
2477			/* RAID6, call the library function to fill in our P/Q */
2478			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2479						pointers);
2480		} else {
2481			/* raid5 */
2482			memcpy(pointers[nr_data], pointers[0], sectorsize);
2483			run_xor(pointers + 1, nr_data - 1, sectorsize);
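			/*
			 * The memcpy seeded pointers[nr_data] with D0 and
			 * run_xor() folded D1..D(nr_data - 1) into it, so
			 * it now holds P = D0 ^ D1 ^ ... ^ D(nr_data - 1).
			 */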
2484		}
2485
2486		/* Check scrubbing parity and repair it */
2487		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2488		parity = kmap_local_page(sector->page) + sector->pgoff;
2489		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2490			memcpy(parity, pointers[rbio->scrubp], sectorsize);
2491		else
2492			/* Parity is right, no need to write it back */
2493			bitmap_clear(&rbio->dbitmap, sectornr, 1);
2494		kunmap_local(parity);
2495
2496		for (stripe = nr_data - 1; stripe >= 0; stripe--)
2497			kunmap_local(pointers[stripe]);
2498	}
2499
2500	kunmap_local(pointers[nr_data]);
2501	__free_page(p_sector.page);
2502	p_sector.page = NULL;
2503	if (q_sector.page) {
2504		kunmap_local(pointers[rbio->real_stripes - 1]);
2505		__free_page(q_sector.page);
2506		q_sector.page = NULL;
2507	}
2508
2509	/*
2510	 * Time to start writing.  Make bios for everything from the
2511	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2512	 * everything else.
2513	 */
2514	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2515		struct sector_ptr *sector;
2516
2517		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2518		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2519					 sectornr, REQ_OP_WRITE);
2520		if (ret)
2521			goto cleanup;
2522	}
2523
2524	if (!is_replace)
2525		goto submit_write;
2526
2527	/*
2528	 * Replace is running and our parity stripe needs to be duplicated to
2529	 * the target device.  Check we have a valid source stripe number.
2530	 */
2531	ASSERT(rbio->bioc->replace_stripe_src >= 0);
2532	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2533		struct sector_ptr *sector;
2534
2535		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2536		ret = rbio_add_io_sector(rbio, &bio_list, sector,
2537					 rbio->real_stripes,
2538					 sectornr, REQ_OP_WRITE);
2539		if (ret)
2540			goto cleanup;
2541	}
2542
2543submit_write:
2544	submit_write_bios(rbio, &bio_list);
2545	return 0;
2546
2547cleanup:
2548	bio_list_put(&bio_list);
2549	return ret;
2550}
2551
2552static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2553{
2554	if (stripe >= 0 && stripe < rbio->nr_data)
2555		return 1;
2556	return 0;
2557}
2558
2559static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2560{
2561	void **pointers = NULL;
2562	void **unmap_array = NULL;
2563	int sector_nr;
2564	int ret = 0;
2565
2566	/*
2567	 * @pointers array stores the pointer for each sector.
2568	 *
2569	 * @unmap_array stores copy of pointers that does not get reordered
2570	 * during reconstruction so that kunmap_local works.
2571	 */
2572	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2573	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2574	if (!pointers || !unmap_array) {
2575		ret = -ENOMEM;
2576		goto out;
2577	}
2578
2579	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2580		int dfail = 0, failp = -1;
2581		int faila;
2582		int failb;
2583		int found_errors;
2584
2585		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2586							 &faila, &failb);
2587		if (found_errors > rbio->bioc->max_errors) {
2588			ret = -EIO;
2589			goto out;
2590		}
2591		if (found_errors == 0)
2592			continue;
2593
2594		/* We should have at least one error here. */
2595		ASSERT(faila >= 0 || failb >= 0);
2596
2597		if (is_data_stripe(rbio, faila))
2598			dfail++;
2599		else if (is_parity_stripe(faila))
2600			failp = faila;
2601
2602		if (is_data_stripe(rbio, failb))
2603			dfail++;
2604		else if (is_parity_stripe(failb))
2605			failp = failb;
2606		/*
2607		 * Because we cannot use a scrubbing parity to repair the
2608		 * data, the repair capability is reduced.  (In the case
2609		 * of RAID5, we cannot repair anything.)
2610		 */
2611		if (dfail > rbio->bioc->max_errors - 1) {
2612			ret = -EIO;
2613			goto out;
2614		}
2615		/*
2616		 * If all data is good and only the parity is bad, just repair
2617		 * the parity; no need to recover data stripes.
2618		 */
2619		if (dfail == 0)
2620			continue;
2621
2622		/*
2623		 * Reaching here means we got one corrupted data stripe and
2624		 * one corrupted parity on RAID6.  If the corrupted parity is
2625		 * the scrubbing parity, we can luckily use the other one to
2626		 * repair the data; otherwise the data stripe is unrepairable.
2627		 */
2628		if (failp != rbio->scrubp) {
2629			ret = -EIO;
2630			goto out;
2631		}
2632
2633		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2634		if (ret < 0)
2635			goto out;
2636	}
2637out:
2638	kfree(pointers);
2639	kfree(unmap_array);
2640	return ret;
2641}
2642
2643static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2644{
2645	struct bio_list bio_list = BIO_EMPTY_LIST;
2646	int total_sector_nr;
2647	int ret = 0;
2648
2649	/* Build a list of bios to read all the missing parts. */
2650	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2651	     total_sector_nr++) {
2652		int sectornr = total_sector_nr % rbio->stripe_nsectors;
2653		int stripe = total_sector_nr / rbio->stripe_nsectors;
2654		struct sector_ptr *sector;
2655
2656		/* No data in the vertical stripe, no need to read. */
2657		if (!test_bit(sectornr, &rbio->dbitmap))
2658			continue;
2659
2660		/*
2661		 * We want to find all the sectors missing from the rbio and
2662		 * read them from the disk. If sector_in_rbio() finds a sector
2663		 * in the bio list we don't need to read it off the stripe.
2664		 */
2665		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2666		if (sector)
2667			continue;
2668
2669		sector = rbio_stripe_sector(rbio, stripe, sectornr);
2670		/*
2671		 * The bio cache may have handed us an uptodate sector.  If so,
2672		 * use it.
2673		 */
2674		if (sector->uptodate)
2675			continue;
2676
2677		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2678					 sectornr, REQ_OP_READ);
2679		if (ret) {
2680			bio_list_put(&bio_list);
2681			return ret;
2682		}
2683	}
2684
2685	submit_read_wait_bio_list(rbio, &bio_list);
2686	return 0;
2687}
2688
2689static void scrub_rbio(struct btrfs_raid_bio *rbio)
2690{
2691	int sector_nr;
2692	int ret;
2693
2694	ret = alloc_rbio_essential_pages(rbio);
2695	if (ret)
2696		goto out;
2697
2698	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2699
2700	ret = scrub_assemble_read_bios(rbio);
2701	if (ret < 0)
2702		goto out;
2703
2704	/* We may have some failures, recover the failed sectors first. */
2705	ret = recover_scrub_rbio(rbio);
2706	if (ret < 0)
2707		goto out;
2708
2709	/*
2710	 * We have every sector properly prepared. Now we can finish the
2711	 * scrub and write back the good content.
2712	 */
2713	ret = finish_parity_scrub(rbio);
2714	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2715	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2716		int found_errors;
2717
2718		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2719		if (found_errors > rbio->bioc->max_errors) {
2720			ret = -EIO;
2721			break;
2722		}
2723	}
2724out:
2725	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2726}
2727
2728static void scrub_rbio_work_locked(struct work_struct *work)
2729{
2730	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2731}
2732
2733void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2734{
2735	if (!lock_stripe_add(rbio))
2736		start_async_work(rbio, scrub_rbio_work_locked);
2737}
2738
2739/*
2740 * This is for scrub call sites where we already have correct data contents.
2741 * This allows us to avoid reading data stripes again.
2742 *
2743 * Unfortunately here we have to do a page copy, rather than reusing the
2744 * pages, because the rbio has its own page management for its cache.
2745 */
2746void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
2747				    struct page **data_pages, u64 data_logical)
2748{
2749	const u64 offset_in_full_stripe = data_logical -
2750					  rbio->bioc->full_stripe_logical;
2751	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
2752	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2753	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
2754	int ret;
2755
2756	/*
2757	 * If we hit ENOMEM here temporarily, but the allocation later at
2758	 * raid56_parity_submit_scrub_rbio() time succeeds, we just do
2759	 * the extra read, which is not a big deal.
2760	 *
2761	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2762	 * the bio will get a proper error number set.
2763	 */
2764	ret = alloc_rbio_data_pages(rbio);
2765	if (ret < 0)
2766		return;
2767
2768	/* data_logical must be at a stripe boundary and inside the full stripe. */
2769	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2770	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
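	/*
	 * E.g. with 4K pages and the 64K stripe length, a call for the
	 * second data stripe (offset 64K) copies into stripe pages
	 * 16..31 and marks their sectors uptodate.
	 */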
2771
2772	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
2773		struct page *dst = rbio->stripe_pages[page_nr + page_index];
2774		struct page *src = data_pages[page_nr];
2775
2776		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
2777		for (int sector_nr = sectors_per_page * (page_index + page_nr);
2778		     sector_nr < sectors_per_page * (page_index + page_nr + 1);
2779		     sector_nr++)
2780			rbio->stripe_sectors[sector_nr].uptodate = true;
2781	}
2782}
2783