// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "file-item.h"
#include "btrfs_inode.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS				11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

/*
 * A bvec-like structure to represent a sector inside a page.
 *
 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
 */
struct sector_ptr {
	struct page *page;
	unsigned int pgoff:24;
	unsigned int uptodate:8;
};

static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
static void scrub_rbio_work_locked(struct work_struct *work);

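/*
 * Free the per-rbio arrays allocated in alloc_rbio().  The rbio structure
 * itself is freed by the caller.
 */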
static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
{
	bitmap_free(rbio->error_bitmap);
	kfree(rbio->stripe_pages);
	kfree(rbio->bio_sectors);
	kfree(rbio->stripe_sectors);
	kfree(rbio->finish_pointers);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bioc(rbio->bioc);
	free_raid_bio_pointers(rbio);
	kfree(rbio);
}

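/*
 * Queue the rbio for asynchronous processing on the per-filesystem
 * rmw_workers workqueue.
 */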
static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * Caching an rbio means to copy anything from the
 * bio_sectors array into the stripe_pages array.  We
 * use the sector uptodate bit in the stripe cache array
 * to indicate if it has valid data.
 *
 * Once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_sectors; i++) {
		/* Some range not covered by bio (partial write), skip it */
		if (!rbio->bio_sectors[i].page) {
			/*
			 * Even if the sector is not covered by bio, if it is
			 * a data sector it should still be uptodate as it is
			 * read from disk.
			 */
			if (i < rbio->nr_data * rbio->stripe_nsectors)
				ASSERT(rbio->stripe_sectors[i].uptodate);
			continue;
		}

		ASSERT(rbio->stripe_sectors[i].page);
		memcpy_page(rbio->stripe_sectors[i].page,
			    rbio->stripe_sectors[i].pgoff,
			    rbio->bio_sectors[i].page,
			    rbio->bio_sectors[i].pgoff,
			    rbio->bioc->fs_info->sectorsize);
		rbio->stripe_sectors[i].uptodate = 1;
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bioc->full_stripe_logical;

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

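/* Return true only if every sector backed by stripe page @page_nr is uptodate. */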
static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
				       unsigned int page_nr)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	ASSERT(page_nr < rbio->nr_pages);

	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page;
	     i++) {
		if (!rbio->stripe_sectors[i].uptodate)
			return false;
	}
	return true;
}

/*
 * Update the stripe_sectors[] array to use correct page and pgoff
 *
 * Should be called every time any page pointer in stripe_pages[] is modified.
 */
static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	u32 offset;
	int i;

	for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
		int page_index = offset >> PAGE_SHIFT;

		ASSERT(page_index < rbio->nr_pages);
		rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
		rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
	}
}

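/*
 * Move a single stripe page from @src to @dest, freeing any page @dest
 * already holds at that slot, and mark all sectors covered by the page
 * as uptodate in @dest.
 */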
static void steal_rbio_page(struct btrfs_raid_bio *src,
			    struct btrfs_raid_bio *dest, int page_nr)
{
	const u32 sectorsize = src->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int i;

	if (dest->stripe_pages[page_nr])
		__free_page(dest->stripe_pages[page_nr]);
	dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
	src->stripe_pages[page_nr] = NULL;

	/* Also update the sector->uptodate bits. */
	for (i = sectors_per_page * page_nr;
	     i < sectors_per_page * page_nr + sectors_per_page; i++)
		dest->stripe_sectors[i].uptodate = true;
}

static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
{
	const int sector_nr = (page_nr << PAGE_SHIFT) >>
			      rbio->bioc->fs_info->sectorsize_bits;

	/*
	 * We have ensured PAGE_SIZE is aligned with sectorsize, thus
	 * we won't have a page which is half data half parity.
	 *
	 * Thus if the first sector of the page belongs to data stripes, then
	 * the full page belongs to data stripes.
	 */
	return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
}

/*
 * Stealing an rbio means taking all the uptodate pages from the stripe array
 * in the source rbio and putting them into the destination rbio.
 *
 * This will also update the involved stripe_sectors[] which are referring to
 * the old pages.
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		struct page *p = src->stripe_pages[i];

		/*
		 * We don't need to steal P/Q pages as they will always be
		 * regenerated for RMW or full write anyway.
		 */
		if (!is_data_stripe_page(src, i))
			continue;

		/*
		 * If @src already has RBIO_CACHE_READY_BIT, it should have
		 * all data stripe pages present and uptodate.
		 */
		ASSERT(p);
		ASSERT(full_page_sectors_uptodate(src, i));
		steal_rbio_page(src, dest, i);
	}
	index_stripe_sectors(dest);
	index_stripe_sectors(src);
}

/*
 * Merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * Must be called with dest->bio_list_lock held.
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	/* Also inherit the bitmaps from @victim. */
	bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
		  dest->stripe_nsectors);
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		free_raid_bio(rbio);
}


/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	__remove_rbio_from_cache(rbio);
	spin_unlock(&table->cache_lock);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock(&table->cache_lock);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->bioc->fs_info->stripe_hash_table;

	spin_lock(&table->cache_lock);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				  struct btrfs_raid_bio,
				  stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock(&table->cache_lock);
}


/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}
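/*
 * For example (assuming MAX_XOR_BLOCKS is 4): with six source buffers the
 * loop above runs twice, first XOR-ing sources 0-3 into @dest and then
 * sources 4-5, so @dest accumulates the XOR of all the sources.
 */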

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock(&rbio->bio_list_lock);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock(&rbio->bio_list_lock);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * We need to read the full stripe from the drive, check and repair
	 * the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD)
		return 0;

	return 1;
}

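/*
 * Map a (stripe_nr, sector_nr) pair to the flat index used by the
 * stripe_sectors[] and bio_sectors[] arrays.  For example, with 4K sectors
 * and the fixed 64K stripe length (stripe_nsectors == 16), stripe 2
 * sector 3 maps to index 2 * 16 + 3 == 35.
 */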
static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	ASSERT(stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr < rbio->stripe_nsectors);

	return stripe_nr * rbio->stripe_nsectors + sector_nr;
}

/* Return a sector from rbio->stripe_sectors, not from the bio list */
static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
					     unsigned int stripe_nr,
					     unsigned int sector_nr)
{
	return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
							      sector_nr)];
}

/* Grab a sector inside P stripe */
static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
}

/* Grab a sector inside Q stripe, return NULL if not RAID6 */
static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
					      unsigned int sector_nr)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock(&h->lock);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock(&h->lock);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		free_raid_bio(freeit);
	return ret;
}

static void recover_rbio_work_locked(struct work_struct *work);

/*
 * called when rmw or parity rebuild has completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock(&h->lock);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock(&h->lock);

			if (next->operation == BTRFS_RBIO_READ_REBUILD) {
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_rbio_work_locked);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

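/* Complete every bio on the singly linked list starting at @cur with status @err. */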
static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;

	/*
	 * Clear the data bitmap, as the rbio may be cached for later usage.
	 * Do this before unlock_stripe() so there will be no new bio
	 * for this rbio.
	 */
	bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 *
 * @rbio:               The raid bio
 * @stripe_nr:          Stripe number, valid range [0, real_stripes)
 * @sector_nr:		Sector number inside the stripe,
 *			valid range [0, stripe_nsectors)
 * @bio_list_only:      Whether to use sectors inside the bio list only.
 *
 * The read/modify/write code wants to reuse the original bio page as much
 * as possible, and only use stripe_sectors as fallback.
 */
static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
					 int stripe_nr, int sector_nr,
					 bool bio_list_only)
{
	struct sector_ptr *sector;
	int index;

	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);

	index = stripe_nr * rbio->stripe_nsectors + sector_nr;
	ASSERT(index >= 0 && index < rbio->nr_sectors);

	spin_lock(&rbio->bio_list_lock);
	sector = &rbio->bio_sectors[index];
	if (sector->page || bio_list_only) {
		/* Don't return sector without a valid page pointer */
		if (!sector->page)
			sector = NULL;
		spin_unlock(&rbio->bio_list_lock);
		return sector;
	}
	spin_unlock(&rbio->bio_list_lock);

	return &rbio->stripe_sectors[index];
}

/*
 * Allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_io_context *bioc)
{
	const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
	const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
	const unsigned int num_pages = stripe_npages * real_stripes;
	const unsigned int stripe_nsectors =
		BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
	const unsigned int num_sectors = stripe_nsectors * real_stripes;
	struct btrfs_raid_bio *rbio;

	/* PAGE_SIZE must also be aligned to sectorsize for subpage support */
	ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
	/*
	 * Our current stripe len should be fixed to 64k thus stripe_nsectors
	 * (at most 16) should be no larger than BITS_PER_LONG.
	 */
	ASSERT(stripe_nsectors <= BITS_PER_LONG);

	rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);
	rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
				     GFP_NOFS);
	rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				    GFP_NOFS);
	rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
				       GFP_NOFS);
	rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
	rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);

	if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
	    !rbio->finish_pointers || !rbio->error_bitmap) {
		free_raid_bio_pointers(rbio);
		kfree(rbio);
		return ERR_PTR(-ENOMEM);
	}

	bio_list_init(&rbio->bio_list);
	init_waitqueue_head(&rbio->io_wait);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	btrfs_get_bioc(bioc);
	rbio->bioc = bioc;
	rbio->nr_pages = num_pages;
	rbio->nr_sectors = num_sectors;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->stripe_nsectors = stripe_nsectors;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->stripes_pending, 0);

	ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
	rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);

	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
	if (ret < 0)
		return ret;
	/* Mapping all sectors */
	index_stripe_sectors(rbio);
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	const int data_pages = rbio->nr_data * rbio->stripe_npages;
	int ret;

	ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
				     rbio->stripe_pages + data_pages);
	if (ret < 0)
		return ret;

	index_stripe_sectors(rbio);
	return 0;
}

/*
 * Return the total number of errors found in the vertical stripe of @sector_nr.
 *
 * @faila and @failb will also be updated to the first and second stripe
 * number of the errors.
 */
static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
				     int *faila, int *failb)
{
	int stripe_nr;
	int found_errors = 0;

	if (faila || failb) {
		/*
		 * Both @faila and @failb should be valid pointers if any of
		 * them is specified.
		 */
		ASSERT(faila && failb);
		*faila = -1;
		*failb = -1;
	}

	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

		if (test_bit(total_sector_nr, rbio->error_bitmap)) {
			found_errors++;
			if (faila) {
				/* Update faila and failb. */
				if (*faila < 0)
					*faila = stripe_nr;
				else if (*failb < 0)
					*failb = stripe_nr;
			}
		}
	}
	return found_errors;
}

102962306a36Sopenharmony_ci/*
103062306a36Sopenharmony_ci * Add a single sector @sector into our list of bios for IO.
103162306a36Sopenharmony_ci *
103262306a36Sopenharmony_ci * Return 0 if everything went well.
103362306a36Sopenharmony_ci * Return <0 for error.
103462306a36Sopenharmony_ci */
103562306a36Sopenharmony_cistatic int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
103662306a36Sopenharmony_ci			      struct bio_list *bio_list,
103762306a36Sopenharmony_ci			      struct sector_ptr *sector,
103862306a36Sopenharmony_ci			      unsigned int stripe_nr,
103962306a36Sopenharmony_ci			      unsigned int sector_nr,
104062306a36Sopenharmony_ci			      enum req_op op)
104162306a36Sopenharmony_ci{
104262306a36Sopenharmony_ci	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
104362306a36Sopenharmony_ci	struct bio *last = bio_list->tail;
104462306a36Sopenharmony_ci	int ret;
104562306a36Sopenharmony_ci	struct bio *bio;
104662306a36Sopenharmony_ci	struct btrfs_io_stripe *stripe;
104762306a36Sopenharmony_ci	u64 disk_start;
104862306a36Sopenharmony_ci
104962306a36Sopenharmony_ci	/*
105062306a36Sopenharmony_ci	 * Note: here stripe_nr has taken device replace into consideration,
105162306a36Sopenharmony_ci	 * thus it can be larger than rbio->real_stripe.
105262306a36Sopenharmony_ci	 * So here we check against bioc->num_stripes, not rbio->real_stripes.
105362306a36Sopenharmony_ci	 */
105462306a36Sopenharmony_ci	ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
105562306a36Sopenharmony_ci	ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
105662306a36Sopenharmony_ci	ASSERT(sector->page);
105762306a36Sopenharmony_ci
105862306a36Sopenharmony_ci	stripe = &rbio->bioc->stripes[stripe_nr];
105962306a36Sopenharmony_ci	disk_start = stripe->physical + sector_nr * sectorsize;
106062306a36Sopenharmony_ci
106162306a36Sopenharmony_ci	/* if the device is missing, just fail this stripe */
106262306a36Sopenharmony_ci	if (!stripe->dev->bdev) {
106362306a36Sopenharmony_ci		int found_errors;
106462306a36Sopenharmony_ci
106562306a36Sopenharmony_ci		set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
106662306a36Sopenharmony_ci			rbio->error_bitmap);
106762306a36Sopenharmony_ci
106862306a36Sopenharmony_ci		/* Check if we have reached tolerance early. */
106962306a36Sopenharmony_ci		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
107062306a36Sopenharmony_ci							 NULL, NULL);
107162306a36Sopenharmony_ci		if (found_errors > rbio->bioc->max_errors)
107262306a36Sopenharmony_ci			return -EIO;
107362306a36Sopenharmony_ci		return 0;
107462306a36Sopenharmony_ci	}
107562306a36Sopenharmony_ci
107662306a36Sopenharmony_ci	/* see if we can add this page onto our existing bio */
107762306a36Sopenharmony_ci	if (last) {
107862306a36Sopenharmony_ci		u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
107962306a36Sopenharmony_ci		last_end += last->bi_iter.bi_size;
108062306a36Sopenharmony_ci
108162306a36Sopenharmony_ci		/*
108262306a36Sopenharmony_ci		 * we can't merge these if they are from different
108362306a36Sopenharmony_ci		 * devices or if they are not contiguous
108462306a36Sopenharmony_ci		 */
108562306a36Sopenharmony_ci		if (last_end == disk_start && !last->bi_status &&
108662306a36Sopenharmony_ci		    last->bi_bdev == stripe->dev->bdev) {
108762306a36Sopenharmony_ci			ret = bio_add_page(last, sector->page, sectorsize,
108862306a36Sopenharmony_ci					   sector->pgoff);
108962306a36Sopenharmony_ci			if (ret == sectorsize)
109062306a36Sopenharmony_ci				return 0;
109162306a36Sopenharmony_ci		}
109262306a36Sopenharmony_ci	}
109362306a36Sopenharmony_ci
109462306a36Sopenharmony_ci	/* put a new bio on the list */
109562306a36Sopenharmony_ci	bio = bio_alloc(stripe->dev->bdev,
109662306a36Sopenharmony_ci			max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
109762306a36Sopenharmony_ci			op, GFP_NOFS);
109862306a36Sopenharmony_ci	bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
109962306a36Sopenharmony_ci	bio->bi_private = rbio;
110062306a36Sopenharmony_ci
110162306a36Sopenharmony_ci	__bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
110262306a36Sopenharmony_ci	bio_list_add(bio_list, bio);
110362306a36Sopenharmony_ci	return 0;
110462306a36Sopenharmony_ci}
110562306a36Sopenharmony_ci
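/*
 * Map every sector covered by @bio to its slot in rbio->bio_sectors[].
 *
 * The index is derived from the bio's logical offset relative to the start
 * of the full stripe.
 */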
110662306a36Sopenharmony_cistatic void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
110762306a36Sopenharmony_ci{
110862306a36Sopenharmony_ci	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
110962306a36Sopenharmony_ci	struct bio_vec bvec;
111062306a36Sopenharmony_ci	struct bvec_iter iter;
111162306a36Sopenharmony_ci	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
111262306a36Sopenharmony_ci		     rbio->bioc->full_stripe_logical;
111362306a36Sopenharmony_ci
111462306a36Sopenharmony_ci	bio_for_each_segment(bvec, bio, iter) {
111562306a36Sopenharmony_ci		u32 bvec_offset;
111662306a36Sopenharmony_ci
111762306a36Sopenharmony_ci		for (bvec_offset = 0; bvec_offset < bvec.bv_len;
111862306a36Sopenharmony_ci		     bvec_offset += sectorsize, offset += sectorsize) {
111962306a36Sopenharmony_ci			int index = offset / sectorsize;
112062306a36Sopenharmony_ci			struct sector_ptr *sector = &rbio->bio_sectors[index];
112162306a36Sopenharmony_ci
112262306a36Sopenharmony_ci			sector->page = bvec.bv_page;
112362306a36Sopenharmony_ci			sector->pgoff = bvec.bv_offset + bvec_offset;
112462306a36Sopenharmony_ci			ASSERT(sector->pgoff < PAGE_SIZE);
112562306a36Sopenharmony_ci		}
112662306a36Sopenharmony_ci	}
112762306a36Sopenharmony_ci}
112862306a36Sopenharmony_ci
112962306a36Sopenharmony_ci/*
113062306a36Sopenharmony_ci * Helper function to walk our bio list and populate the bio_sectors array
113162306a36Sopenharmony_ci * with the result.  This seems expensive, but it is faster than constantly
113262306a36Sopenharmony_ci * searching through the bio list as we set up the IO in finish_rmw or stripe
113362306a36Sopenharmony_ci * reconstruction.
113462306a36Sopenharmony_ci *
113562306a36Sopenharmony_ci * This must be called before you trust the answers from sector_in_rbio().
113662306a36Sopenharmony_ci */
113762306a36Sopenharmony_cistatic void index_rbio_pages(struct btrfs_raid_bio *rbio)
113862306a36Sopenharmony_ci{
113962306a36Sopenharmony_ci	struct bio *bio;
114062306a36Sopenharmony_ci
114162306a36Sopenharmony_ci	spin_lock(&rbio->bio_list_lock);
114262306a36Sopenharmony_ci	bio_list_for_each(bio, &rbio->bio_list)
114362306a36Sopenharmony_ci		index_one_bio(rbio, bio);
114462306a36Sopenharmony_ci
114562306a36Sopenharmony_ci	spin_unlock(&rbio->bio_list_lock);
114662306a36Sopenharmony_ci}
114762306a36Sopenharmony_ci
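/*
 * Fill @trace_info (stripe number, devid and physical offset) for @bio by
 * matching bio->bi_bdev against the stripes of the io context.  All fields
 * are set to -1 if no matching stripe is found.
 */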
114862306a36Sopenharmony_cistatic void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
114962306a36Sopenharmony_ci			       struct raid56_bio_trace_info *trace_info)
115062306a36Sopenharmony_ci{
115162306a36Sopenharmony_ci	const struct btrfs_io_context *bioc = rbio->bioc;
115262306a36Sopenharmony_ci	int i;
115362306a36Sopenharmony_ci
115462306a36Sopenharmony_ci	ASSERT(bioc);
115562306a36Sopenharmony_ci
115662306a36Sopenharmony_ci	/* We rely on bio->bi_bdev to find the stripe number. */
115762306a36Sopenharmony_ci	if (!bio->bi_bdev)
115862306a36Sopenharmony_ci		goto not_found;
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci	for (i = 0; i < bioc->num_stripes; i++) {
116162306a36Sopenharmony_ci		if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
116262306a36Sopenharmony_ci			continue;
116362306a36Sopenharmony_ci		trace_info->stripe_nr = i;
116462306a36Sopenharmony_ci		trace_info->devid = bioc->stripes[i].dev->devid;
116562306a36Sopenharmony_ci		trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
116662306a36Sopenharmony_ci				     bioc->stripes[i].physical;
116762306a36Sopenharmony_ci		return;
116862306a36Sopenharmony_ci	}
116962306a36Sopenharmony_ci
117062306a36Sopenharmony_cinot_found:
117162306a36Sopenharmony_ci	trace_info->devid = -1;
117262306a36Sopenharmony_ci	trace_info->offset = -1;
117362306a36Sopenharmony_ci	trace_info->stripe_nr = -1;
117462306a36Sopenharmony_ci}
117562306a36Sopenharmony_ci
117662306a36Sopenharmony_cistatic inline void bio_list_put(struct bio_list *bio_list)
117762306a36Sopenharmony_ci{
117862306a36Sopenharmony_ci	struct bio *bio;
117962306a36Sopenharmony_ci
118062306a36Sopenharmony_ci	while ((bio = bio_list_pop(bio_list)))
118162306a36Sopenharmony_ci		bio_put(bio);
118262306a36Sopenharmony_ci}
118362306a36Sopenharmony_ci
118462306a36Sopenharmony_ci/* Generate PQ for one vertical stripe. */
118562306a36Sopenharmony_cistatic void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
118662306a36Sopenharmony_ci{
118762306a36Sopenharmony_ci	void **pointers = rbio->finish_pointers;
118862306a36Sopenharmony_ci	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
118962306a36Sopenharmony_ci	struct sector_ptr *sector;
119062306a36Sopenharmony_ci	int stripe;
119162306a36Sopenharmony_ci	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
119262306a36Sopenharmony_ci
119362306a36Sopenharmony_ci	/* First collect one sector from each data stripe */
119462306a36Sopenharmony_ci	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
119562306a36Sopenharmony_ci		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
119662306a36Sopenharmony_ci		pointers[stripe] = kmap_local_page(sector->page) +
119762306a36Sopenharmony_ci				   sector->pgoff;
119862306a36Sopenharmony_ci	}
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci	/* Then add the parity stripe */
120162306a36Sopenharmony_ci	sector = rbio_pstripe_sector(rbio, sectornr);
120262306a36Sopenharmony_ci	sector->uptodate = 1;
120362306a36Sopenharmony_ci	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
120462306a36Sopenharmony_ci
120562306a36Sopenharmony_ci	if (has_qstripe) {
120662306a36Sopenharmony_ci		/*
120762306a36Sopenharmony_ci		 * RAID6, add the qstripe and call the library function
120862306a36Sopenharmony_ci		 * to fill in our p/q
120962306a36Sopenharmony_ci		 */
121062306a36Sopenharmony_ci		sector = rbio_qstripe_sector(rbio, sectornr);
121162306a36Sopenharmony_ci		sector->uptodate = 1;
121262306a36Sopenharmony_ci		pointers[stripe++] = kmap_local_page(sector->page) +
121362306a36Sopenharmony_ci				     sector->pgoff;
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_ci		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
121662306a36Sopenharmony_ci					pointers);
121762306a36Sopenharmony_ci	} else {
121862306a36Sopenharmony_ci		/* raid5 */
121962306a36Sopenharmony_ci		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
122062306a36Sopenharmony_ci		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
122162306a36Sopenharmony_ci	}
122262306a36Sopenharmony_ci	for (stripe = stripe - 1; stripe >= 0; stripe--)
122362306a36Sopenharmony_ci		kunmap_local(pointers[stripe]);
122462306a36Sopenharmony_ci}
122562306a36Sopenharmony_ci
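/*
 * Assemble the write bios for an RMW.
 *
 * For every vertical stripe that has data (marked in rbio->dbitmap) we queue
 * writes for the data sectors from the higher layers plus the P/Q sectors.
 * If a device replace is running, an extra copy of the source stripe is
 * queued for the replace target.
 */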
122662306a36Sopenharmony_cistatic int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
122762306a36Sopenharmony_ci				   struct bio_list *bio_list)
122862306a36Sopenharmony_ci{
122962306a36Sopenharmony_ci	/* The total sector number inside the full stripe. */
123062306a36Sopenharmony_ci	int total_sector_nr;
123162306a36Sopenharmony_ci	int sectornr;
123262306a36Sopenharmony_ci	int stripe;
123362306a36Sopenharmony_ci	int ret;
123462306a36Sopenharmony_ci
123562306a36Sopenharmony_ci	ASSERT(bio_list_size(bio_list) == 0);
123662306a36Sopenharmony_ci
123762306a36Sopenharmony_ci	/* We should have at least one data sector. */
123862306a36Sopenharmony_ci	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
123962306a36Sopenharmony_ci
124062306a36Sopenharmony_ci	/*
124162306a36Sopenharmony_ci	 * Reset errors, as we may have errors inherited from degraded
124262306a36Sopenharmony_ci	 * write.
124362306a36Sopenharmony_ci	 */
124462306a36Sopenharmony_ci	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
124562306a36Sopenharmony_ci
124662306a36Sopenharmony_ci	/*
124762306a36Sopenharmony_ci	 * Start assembly.  Make bios for everything from the higher layers (the
124862306a36Sopenharmony_ci	 * bio_list in our rbio) and our P/Q.  Ignore everything else.
124962306a36Sopenharmony_ci	 */
125062306a36Sopenharmony_ci	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
125162306a36Sopenharmony_ci	     total_sector_nr++) {
125262306a36Sopenharmony_ci		struct sector_ptr *sector;
125362306a36Sopenharmony_ci
125462306a36Sopenharmony_ci		stripe = total_sector_nr / rbio->stripe_nsectors;
125562306a36Sopenharmony_ci		sectornr = total_sector_nr % rbio->stripe_nsectors;
125662306a36Sopenharmony_ci
125762306a36Sopenharmony_ci		/* This vertical stripe has no data, skip it. */
125862306a36Sopenharmony_ci		if (!test_bit(sectornr, &rbio->dbitmap))
125962306a36Sopenharmony_ci			continue;
126062306a36Sopenharmony_ci
126162306a36Sopenharmony_ci		if (stripe < rbio->nr_data) {
126262306a36Sopenharmony_ci			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
126362306a36Sopenharmony_ci			if (!sector)
126462306a36Sopenharmony_ci				continue;
126562306a36Sopenharmony_ci		} else {
126662306a36Sopenharmony_ci			sector = rbio_stripe_sector(rbio, stripe, sectornr);
126762306a36Sopenharmony_ci		}
126862306a36Sopenharmony_ci
126962306a36Sopenharmony_ci		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
127062306a36Sopenharmony_ci					 sectornr, REQ_OP_WRITE);
127162306a36Sopenharmony_ci		if (ret)
127262306a36Sopenharmony_ci			goto error;
127362306a36Sopenharmony_ci	}
127462306a36Sopenharmony_ci
127562306a36Sopenharmony_ci	if (likely(!rbio->bioc->replace_nr_stripes))
127662306a36Sopenharmony_ci		return 0;
127762306a36Sopenharmony_ci
127862306a36Sopenharmony_ci	/*
127962306a36Sopenharmony_ci	 * Make a copy for the replace target device.
128062306a36Sopenharmony_ci	 *
128162306a36Sopenharmony_ci	 * Thus the source stripe number (in replace_stripe_src) should be valid.
128262306a36Sopenharmony_ci	 */
128362306a36Sopenharmony_ci	ASSERT(rbio->bioc->replace_stripe_src >= 0);
128462306a36Sopenharmony_ci
128562306a36Sopenharmony_ci	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
128662306a36Sopenharmony_ci	     total_sector_nr++) {
128762306a36Sopenharmony_ci		struct sector_ptr *sector;
128862306a36Sopenharmony_ci
128962306a36Sopenharmony_ci		stripe = total_sector_nr / rbio->stripe_nsectors;
129062306a36Sopenharmony_ci		sectornr = total_sector_nr % rbio->stripe_nsectors;
129162306a36Sopenharmony_ci
129262306a36Sopenharmony_ci		/*
129362306a36Sopenharmony_ci		 * For RAID56, there is only one device that can be replaced,
129462306a36Sopenharmony_ci		 * and replace_stripe_src indicates the stripe number we
129562306a36Sopenharmony_ci		 * need to copy from.
129662306a36Sopenharmony_ci		 */
129762306a36Sopenharmony_ci		if (stripe != rbio->bioc->replace_stripe_src) {
129862306a36Sopenharmony_ci			/*
129962306a36Sopenharmony_ci			 * We can skip the whole stripe completely, note
130062306a36Sopenharmony_ci			 * total_sector_nr will be increased by one anyway.
130162306a36Sopenharmony_ci			 */
130262306a36Sopenharmony_ci			ASSERT(sectornr == 0);
130362306a36Sopenharmony_ci			total_sector_nr += rbio->stripe_nsectors - 1;
130462306a36Sopenharmony_ci			continue;
130562306a36Sopenharmony_ci		}
130662306a36Sopenharmony_ci
130762306a36Sopenharmony_ci		/* This vertical stripe has no data, skip it. */
130862306a36Sopenharmony_ci		if (!test_bit(sectornr, &rbio->dbitmap))
130962306a36Sopenharmony_ci			continue;
131062306a36Sopenharmony_ci
131162306a36Sopenharmony_ci		if (stripe < rbio->nr_data) {
131262306a36Sopenharmony_ci			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
131362306a36Sopenharmony_ci			if (!sector)
131462306a36Sopenharmony_ci				continue;
131562306a36Sopenharmony_ci		} else {
131662306a36Sopenharmony_ci			sector = rbio_stripe_sector(rbio, stripe, sectornr);
131762306a36Sopenharmony_ci		}
131862306a36Sopenharmony_ci
131962306a36Sopenharmony_ci		ret = rbio_add_io_sector(rbio, bio_list, sector,
132062306a36Sopenharmony_ci					 rbio->real_stripes,
132162306a36Sopenharmony_ci					 sectornr, REQ_OP_WRITE);
132262306a36Sopenharmony_ci		if (ret)
132362306a36Sopenharmony_ci			goto error;
132462306a36Sopenharmony_ci	}
132562306a36Sopenharmony_ci
132662306a36Sopenharmony_ci	return 0;
132762306a36Sopenharmony_cierror:
132862306a36Sopenharmony_ci	bio_list_put(bio_list);
132962306a36Sopenharmony_ci	return -EIO;
133062306a36Sopenharmony_ci}
133162306a36Sopenharmony_ci
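/*
 * Mark the range covered by @bio as errored in rbio->error_bitmap.
 *
 * For the empty bio passed in by raid56_alloc_missing_rbio(), mark every
 * sector of the missing device(s) instead.
 */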
133262306a36Sopenharmony_cistatic void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
133362306a36Sopenharmony_ci{
133462306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
133562306a36Sopenharmony_ci	u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
133662306a36Sopenharmony_ci		     rbio->bioc->full_stripe_logical;
133762306a36Sopenharmony_ci	int total_nr_sector = offset >> fs_info->sectorsize_bits;
133862306a36Sopenharmony_ci
133962306a36Sopenharmony_ci	ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
134062306a36Sopenharmony_ci
134162306a36Sopenharmony_ci	bitmap_set(rbio->error_bitmap, total_nr_sector,
134262306a36Sopenharmony_ci		   bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
134362306a36Sopenharmony_ci
134462306a36Sopenharmony_ci	/*
134562306a36Sopenharmony_ci	 * Special handling for raid56_alloc_missing_rbio() used by
134662306a36Sopenharmony_ci	 * scrub/replace.  Unlike the call path in raid56_parity_recover(), they
134762306a36Sopenharmony_ci	 * pass an empty bio here.  Thus we have to find out the missing device
134862306a36Sopenharmony_ci	 * and mark its stripe as errored instead.
134962306a36Sopenharmony_ci	 */
135062306a36Sopenharmony_ci	if (bio->bi_iter.bi_size == 0) {
135162306a36Sopenharmony_ci		bool found_missing = false;
135262306a36Sopenharmony_ci		int stripe_nr;
135362306a36Sopenharmony_ci
135462306a36Sopenharmony_ci		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
135562306a36Sopenharmony_ci			if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
135662306a36Sopenharmony_ci				found_missing = true;
135762306a36Sopenharmony_ci				bitmap_set(rbio->error_bitmap,
135862306a36Sopenharmony_ci					   stripe_nr * rbio->stripe_nsectors,
135962306a36Sopenharmony_ci					   rbio->stripe_nsectors);
136062306a36Sopenharmony_ci			}
136162306a36Sopenharmony_ci		}
136262306a36Sopenharmony_ci		ASSERT(found_missing);
136362306a36Sopenharmony_ci	}
136462306a36Sopenharmony_ci}
136562306a36Sopenharmony_ci
136662306a36Sopenharmony_ci/*
136762306a36Sopenharmony_ci * For the subpage case, we can no longer set a page uptodate directly for
136862306a36Sopenharmony_ci * stripe_pages[], thus we need to locate the exact sector.
136962306a36Sopenharmony_ci */
137062306a36Sopenharmony_cistatic struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
137162306a36Sopenharmony_ci					     struct page *page,
137262306a36Sopenharmony_ci					     unsigned int pgoff)
137362306a36Sopenharmony_ci{
137462306a36Sopenharmony_ci	int i;
137562306a36Sopenharmony_ci
137662306a36Sopenharmony_ci	for (i = 0; i < rbio->nr_sectors; i++) {
137762306a36Sopenharmony_ci		struct sector_ptr *sector = &rbio->stripe_sectors[i];
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci		if (sector->page == page && sector->pgoff == pgoff)
138062306a36Sopenharmony_ci			return sector;
138162306a36Sopenharmony_ci	}
138262306a36Sopenharmony_ci	return NULL;
138362306a36Sopenharmony_ci}
138462306a36Sopenharmony_ci
138562306a36Sopenharmony_ci/*
138662306a36Sopenharmony_ci * This sets each sector in the bio uptodate.  It should only be used on
138762306a36Sopenharmony_ci * private rbio pages, nothing that comes in from the higher layers.
138862306a36Sopenharmony_ci */
138962306a36Sopenharmony_cistatic void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
139062306a36Sopenharmony_ci{
139162306a36Sopenharmony_ci	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
139262306a36Sopenharmony_ci	struct bio_vec *bvec;
139362306a36Sopenharmony_ci	struct bvec_iter_all iter_all;
139462306a36Sopenharmony_ci
139562306a36Sopenharmony_ci	ASSERT(!bio_flagged(bio, BIO_CLONED));
139662306a36Sopenharmony_ci
139762306a36Sopenharmony_ci	bio_for_each_segment_all(bvec, bio, iter_all) {
139862306a36Sopenharmony_ci		struct sector_ptr *sector;
139962306a36Sopenharmony_ci		int pgoff;
140062306a36Sopenharmony_ci
140162306a36Sopenharmony_ci		for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
140262306a36Sopenharmony_ci		     pgoff += sectorsize) {
140362306a36Sopenharmony_ci			sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
140462306a36Sopenharmony_ci			ASSERT(sector);
140562306a36Sopenharmony_ci			if (sector)
140662306a36Sopenharmony_ci				sector->uptodate = 1;
140762306a36Sopenharmony_ci		}
140862306a36Sopenharmony_ci	}
140962306a36Sopenharmony_ci}
141062306a36Sopenharmony_ci
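/*
 * Return the sector number inside the full stripe at which @bio starts,
 * found by matching its first bvec against stripe_sectors[] and
 * bio_sectors[].
 */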
141162306a36Sopenharmony_cistatic int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
141262306a36Sopenharmony_ci{
141362306a36Sopenharmony_ci	struct bio_vec *bv = bio_first_bvec_all(bio);
141462306a36Sopenharmony_ci	int i;
141562306a36Sopenharmony_ci
141662306a36Sopenharmony_ci	for (i = 0; i < rbio->nr_sectors; i++) {
141762306a36Sopenharmony_ci		struct sector_ptr *sector;
141862306a36Sopenharmony_ci
141962306a36Sopenharmony_ci		sector = &rbio->stripe_sectors[i];
142062306a36Sopenharmony_ci		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
142162306a36Sopenharmony_ci			break;
142262306a36Sopenharmony_ci		sector = &rbio->bio_sectors[i];
142362306a36Sopenharmony_ci		if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
142462306a36Sopenharmony_ci			break;
142562306a36Sopenharmony_ci	}
142662306a36Sopenharmony_ci	ASSERT(i < rbio->nr_sectors);
142762306a36Sopenharmony_ci	return i;
142862306a36Sopenharmony_ci}
142962306a36Sopenharmony_ci
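/* Mark every sector covered by @bio as errored in rbio->error_bitmap. */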
143062306a36Sopenharmony_cistatic void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
143162306a36Sopenharmony_ci{
143262306a36Sopenharmony_ci	int total_sector_nr = get_bio_sector_nr(rbio, bio);
143362306a36Sopenharmony_ci	u32 bio_size = 0;
143462306a36Sopenharmony_ci	struct bio_vec *bvec;
143562306a36Sopenharmony_ci	int i;
143662306a36Sopenharmony_ci
143762306a36Sopenharmony_ci	bio_for_each_bvec_all(bvec, bio, i)
143862306a36Sopenharmony_ci		bio_size += bvec->bv_len;
143962306a36Sopenharmony_ci
144062306a36Sopenharmony_ci	/*
144162306a36Sopenharmony_ci	 * Since we can have multiple bios touching the error_bitmap, we cannot
144262306a36Sopenharmony_ci	 * call bitmap_set() without protection.
144362306a36Sopenharmony_ci	 *
144462306a36Sopenharmony_ci	 * Instead use set_bit() for each bit, as set_bit() itself is atomic.
144562306a36Sopenharmony_ci	 */
144662306a36Sopenharmony_ci	for (i = total_sector_nr; i < total_sector_nr +
144762306a36Sopenharmony_ci	     (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
144862306a36Sopenharmony_ci		set_bit(i, rbio->error_bitmap);
144962306a36Sopenharmony_ci}
145062306a36Sopenharmony_ci
145162306a36Sopenharmony_ci/* Verify the data sectors at read time. */
145262306a36Sopenharmony_cistatic void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
145362306a36Sopenharmony_ci				    struct bio *bio)
145462306a36Sopenharmony_ci{
145562306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
145662306a36Sopenharmony_ci	int total_sector_nr = get_bio_sector_nr(rbio, bio);
145762306a36Sopenharmony_ci	struct bio_vec *bvec;
145862306a36Sopenharmony_ci	struct bvec_iter_all iter_all;
145962306a36Sopenharmony_ci
146062306a36Sopenharmony_ci	/* No data csum for the whole stripe, no need to verify. */
146162306a36Sopenharmony_ci	if (!rbio->csum_bitmap || !rbio->csum_buf)
146262306a36Sopenharmony_ci		return;
146362306a36Sopenharmony_ci
146462306a36Sopenharmony_ci	/* P/Q stripes, they have no data csum to verify against. */
146562306a36Sopenharmony_ci	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
146662306a36Sopenharmony_ci		return;
146762306a36Sopenharmony_ci
146862306a36Sopenharmony_ci	bio_for_each_segment_all(bvec, bio, iter_all) {
146962306a36Sopenharmony_ci		int bv_offset;
147062306a36Sopenharmony_ci
147162306a36Sopenharmony_ci		for (bv_offset = bvec->bv_offset;
147262306a36Sopenharmony_ci		     bv_offset < bvec->bv_offset + bvec->bv_len;
147362306a36Sopenharmony_ci		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
147462306a36Sopenharmony_ci			u8 csum_buf[BTRFS_CSUM_SIZE];
147562306a36Sopenharmony_ci			u8 *expected_csum = rbio->csum_buf +
147662306a36Sopenharmony_ci					    total_sector_nr * fs_info->csum_size;
147762306a36Sopenharmony_ci			int ret;
147862306a36Sopenharmony_ci
147962306a36Sopenharmony_ci			/* No csum for this sector, skip to the next sector. */
148062306a36Sopenharmony_ci			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
148162306a36Sopenharmony_ci				continue;
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
148462306a36Sopenharmony_ci				bv_offset, csum_buf, expected_csum);
148562306a36Sopenharmony_ci			if (ret < 0)
148662306a36Sopenharmony_ci				set_bit(total_sector_nr, rbio->error_bitmap);
148762306a36Sopenharmony_ci		}
148862306a36Sopenharmony_ci	}
148962306a36Sopenharmony_ci}
149062306a36Sopenharmony_ci
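/*
 * End io for the read-and-wait path: record any error into the error bitmap,
 * otherwise mark the sectors uptodate and verify them against the data
 * csums, then wake up the waiter once the last bio has completed.
 */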
149162306a36Sopenharmony_cistatic void raid_wait_read_end_io(struct bio *bio)
149262306a36Sopenharmony_ci{
149362306a36Sopenharmony_ci	struct btrfs_raid_bio *rbio = bio->bi_private;
149462306a36Sopenharmony_ci
149562306a36Sopenharmony_ci	if (bio->bi_status) {
149662306a36Sopenharmony_ci		rbio_update_error_bitmap(rbio, bio);
149762306a36Sopenharmony_ci	} else {
149862306a36Sopenharmony_ci		set_bio_pages_uptodate(rbio, bio);
149962306a36Sopenharmony_ci		verify_bio_data_sectors(rbio, bio);
150062306a36Sopenharmony_ci	}
150162306a36Sopenharmony_ci
150262306a36Sopenharmony_ci	bio_put(bio);
150362306a36Sopenharmony_ci	if (atomic_dec_and_test(&rbio->stripes_pending))
150462306a36Sopenharmony_ci		wake_up(&rbio->io_wait);
150562306a36Sopenharmony_ci}
150662306a36Sopenharmony_ci
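/* Submit all bios on @bio_list and wait until they have all completed. */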
150762306a36Sopenharmony_cistatic void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
150862306a36Sopenharmony_ci			     struct bio_list *bio_list)
150962306a36Sopenharmony_ci{
151062306a36Sopenharmony_ci	struct bio *bio;
151162306a36Sopenharmony_ci
151262306a36Sopenharmony_ci	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
151362306a36Sopenharmony_ci	while ((bio = bio_list_pop(bio_list))) {
151462306a36Sopenharmony_ci		bio->bi_end_io = raid_wait_read_end_io;
151562306a36Sopenharmony_ci
151662306a36Sopenharmony_ci		if (trace_raid56_read_enabled()) {
151762306a36Sopenharmony_ci			struct raid56_bio_trace_info trace_info = { 0 };
151862306a36Sopenharmony_ci
151962306a36Sopenharmony_ci			bio_get_trace_info(rbio, bio, &trace_info);
152062306a36Sopenharmony_ci			trace_raid56_read(rbio, bio, &trace_info);
152162306a36Sopenharmony_ci		}
152262306a36Sopenharmony_ci		submit_bio(bio);
152362306a36Sopenharmony_ci	}
152462306a36Sopenharmony_ci
152562306a36Sopenharmony_ci	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
152662306a36Sopenharmony_ci}
152762306a36Sopenharmony_ci
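/*
 * Allocate pages for the data stripes only (P/Q pages are not needed yet)
 * and re-index the stripe sectors.
 */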
152862306a36Sopenharmony_cistatic int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
152962306a36Sopenharmony_ci{
153062306a36Sopenharmony_ci	const int data_pages = rbio->nr_data * rbio->stripe_npages;
153162306a36Sopenharmony_ci	int ret;
153262306a36Sopenharmony_ci
153362306a36Sopenharmony_ci	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
153462306a36Sopenharmony_ci	if (ret < 0)
153562306a36Sopenharmony_ci		return ret;
153662306a36Sopenharmony_ci
153762306a36Sopenharmony_ci	index_stripe_sectors(rbio);
153862306a36Sopenharmony_ci	return 0;
153962306a36Sopenharmony_ci}
154062306a36Sopenharmony_ci
154162306a36Sopenharmony_ci/*
154262306a36Sopenharmony_ci * We use plugging callbacks to collect full stripes.
154362306a36Sopenharmony_ci * Any time we get a partial stripe write while plugged,
154462306a36Sopenharmony_ci * we collect it into a list.  When the unplug comes down,
154562306a36Sopenharmony_ci * we sort the list by logical block number and merge
154662306a36Sopenharmony_ci * everything we can into the same rbios.
154762306a36Sopenharmony_ci */
154862306a36Sopenharmony_cistruct btrfs_plug_cb {
154962306a36Sopenharmony_ci	struct blk_plug_cb cb;
155062306a36Sopenharmony_ci	struct btrfs_fs_info *info;
155162306a36Sopenharmony_ci	struct list_head rbio_list;
155262306a36Sopenharmony_ci	struct work_struct work;
155362306a36Sopenharmony_ci};
155462306a36Sopenharmony_ci
155562306a36Sopenharmony_ci/*
155662306a36Sopenharmony_ci * Rbios on the plug list are sorted for easier merging.
155762306a36Sopenharmony_ci */
155862306a36Sopenharmony_cistatic int plug_cmp(void *priv, const struct list_head *a,
155962306a36Sopenharmony_ci		    const struct list_head *b)
156062306a36Sopenharmony_ci{
156162306a36Sopenharmony_ci	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
156262306a36Sopenharmony_ci						       plug_list);
156362306a36Sopenharmony_ci	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
156462306a36Sopenharmony_ci						       plug_list);
156562306a36Sopenharmony_ci	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
156662306a36Sopenharmony_ci	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
156762306a36Sopenharmony_ci
156862306a36Sopenharmony_ci	if (a_sector < b_sector)
156962306a36Sopenharmony_ci		return -1;
157062306a36Sopenharmony_ci	if (a_sector > b_sector)
157162306a36Sopenharmony_ci		return 1;
157262306a36Sopenharmony_ci	return 0;
157362306a36Sopenharmony_ci}
157462306a36Sopenharmony_ci
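/*
 * Unplug callback: sort the collected rbios by their starting logical block
 * number, merge what we can, and queue the resulting rbios for RMW.
 */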
157562306a36Sopenharmony_cistatic void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
157662306a36Sopenharmony_ci{
157762306a36Sopenharmony_ci	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
157862306a36Sopenharmony_ci	struct btrfs_raid_bio *cur;
157962306a36Sopenharmony_ci	struct btrfs_raid_bio *last = NULL;
158062306a36Sopenharmony_ci
158162306a36Sopenharmony_ci	list_sort(NULL, &plug->rbio_list, plug_cmp);
158262306a36Sopenharmony_ci
158362306a36Sopenharmony_ci	while (!list_empty(&plug->rbio_list)) {
158462306a36Sopenharmony_ci		cur = list_entry(plug->rbio_list.next,
158562306a36Sopenharmony_ci				 struct btrfs_raid_bio, plug_list);
158662306a36Sopenharmony_ci		list_del_init(&cur->plug_list);
158762306a36Sopenharmony_ci
158862306a36Sopenharmony_ci		if (rbio_is_full(cur)) {
158962306a36Sopenharmony_ci			/* We have a full stripe, queue it down. */
159062306a36Sopenharmony_ci			start_async_work(cur, rmw_rbio_work);
159162306a36Sopenharmony_ci			continue;
159262306a36Sopenharmony_ci		}
159362306a36Sopenharmony_ci		if (last) {
159462306a36Sopenharmony_ci			if (rbio_can_merge(last, cur)) {
159562306a36Sopenharmony_ci				merge_rbio(last, cur);
159662306a36Sopenharmony_ci				free_raid_bio(cur);
159762306a36Sopenharmony_ci				continue;
159862306a36Sopenharmony_ci			}
159962306a36Sopenharmony_ci			start_async_work(last, rmw_rbio_work);
160062306a36Sopenharmony_ci		}
160162306a36Sopenharmony_ci		last = cur;
160262306a36Sopenharmony_ci	}
160362306a36Sopenharmony_ci	if (last)
160462306a36Sopenharmony_ci		start_async_work(last, rmw_rbio_work);
160562306a36Sopenharmony_ci	kfree(plug);
160662306a36Sopenharmony_ci}
160762306a36Sopenharmony_ci
160862306a36Sopenharmony_ci/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
160962306a36Sopenharmony_cistatic void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
161062306a36Sopenharmony_ci{
161162306a36Sopenharmony_ci	const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
161262306a36Sopenharmony_ci	const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
161362306a36Sopenharmony_ci	const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
161462306a36Sopenharmony_ci	const u32 orig_len = orig_bio->bi_iter.bi_size;
161562306a36Sopenharmony_ci	const u32 sectorsize = fs_info->sectorsize;
161662306a36Sopenharmony_ci	u64 cur_logical;
161762306a36Sopenharmony_ci
161862306a36Sopenharmony_ci	ASSERT(orig_logical >= full_stripe_start &&
161962306a36Sopenharmony_ci	       orig_logical + orig_len <= full_stripe_start +
162062306a36Sopenharmony_ci	       rbio->nr_data * BTRFS_STRIPE_LEN);
162162306a36Sopenharmony_ci
162262306a36Sopenharmony_ci	bio_list_add(&rbio->bio_list, orig_bio);
162362306a36Sopenharmony_ci	rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
162462306a36Sopenharmony_ci
162562306a36Sopenharmony_ci	/* Update the dbitmap. */
162662306a36Sopenharmony_ci	for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
162762306a36Sopenharmony_ci	     cur_logical += sectorsize) {
162862306a36Sopenharmony_ci		int bit = ((u32)(cur_logical - full_stripe_start) >>
162962306a36Sopenharmony_ci			   fs_info->sectorsize_bits) % rbio->stripe_nsectors;
163062306a36Sopenharmony_ci
163162306a36Sopenharmony_ci		set_bit(bit, &rbio->dbitmap);
163262306a36Sopenharmony_ci	}
163362306a36Sopenharmony_ci}
163462306a36Sopenharmony_ci
163562306a36Sopenharmony_ci/*
163662306a36Sopenharmony_ci * Our main entry point for writes from the rest of the FS.
163762306a36Sopenharmony_ci */
163862306a36Sopenharmony_civoid raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
163962306a36Sopenharmony_ci{
164062306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = bioc->fs_info;
164162306a36Sopenharmony_ci	struct btrfs_raid_bio *rbio;
164262306a36Sopenharmony_ci	struct btrfs_plug_cb *plug = NULL;
164362306a36Sopenharmony_ci	struct blk_plug_cb *cb;
164462306a36Sopenharmony_ci
164562306a36Sopenharmony_ci	rbio = alloc_rbio(fs_info, bioc);
164662306a36Sopenharmony_ci	if (IS_ERR(rbio)) {
164762306a36Sopenharmony_ci		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
164862306a36Sopenharmony_ci		bio_endio(bio);
164962306a36Sopenharmony_ci		return;
165062306a36Sopenharmony_ci	}
165162306a36Sopenharmony_ci	rbio->operation = BTRFS_RBIO_WRITE;
165262306a36Sopenharmony_ci	rbio_add_bio(rbio, bio);
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_ci	/*
165562306a36Sopenharmony_ci	 * Don't plug on full rbios, just get them out the door
165662306a36Sopenharmony_ci	 * as quickly as we can
165762306a36Sopenharmony_ci	 */
165862306a36Sopenharmony_ci	if (!rbio_is_full(rbio)) {
165962306a36Sopenharmony_ci		cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
166062306a36Sopenharmony_ci		if (cb) {
166162306a36Sopenharmony_ci			plug = container_of(cb, struct btrfs_plug_cb, cb);
166262306a36Sopenharmony_ci			if (!plug->info) {
166362306a36Sopenharmony_ci				plug->info = fs_info;
166462306a36Sopenharmony_ci				INIT_LIST_HEAD(&plug->rbio_list);
166562306a36Sopenharmony_ci			}
166662306a36Sopenharmony_ci			list_add_tail(&rbio->plug_list, &plug->rbio_list);
166762306a36Sopenharmony_ci			return;
166862306a36Sopenharmony_ci		}
166962306a36Sopenharmony_ci	}
167062306a36Sopenharmony_ci
167162306a36Sopenharmony_ci	/*
167262306a36Sopenharmony_ci	 * Either we don't have any existing plug, or we're doing a full stripe,
167362306a36Sopenharmony_ci	 * queue the rmw work now.
167462306a36Sopenharmony_ci	 */
167562306a36Sopenharmony_ci	start_async_work(rbio, rmw_rbio_work);
167662306a36Sopenharmony_ci}
167762306a36Sopenharmony_ci
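/*
 * Verify the sector at (@stripe_nr, @sector_nr) against its data csum.
 *
 * Return 0 if there is no csum to check or the csum matches, <0 otherwise.
 */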
167862306a36Sopenharmony_cistatic int verify_one_sector(struct btrfs_raid_bio *rbio,
167962306a36Sopenharmony_ci			     int stripe_nr, int sector_nr)
168062306a36Sopenharmony_ci{
168162306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
168262306a36Sopenharmony_ci	struct sector_ptr *sector;
168362306a36Sopenharmony_ci	u8 csum_buf[BTRFS_CSUM_SIZE];
168462306a36Sopenharmony_ci	u8 *csum_expected;
168562306a36Sopenharmony_ci	int ret;
168662306a36Sopenharmony_ci
168762306a36Sopenharmony_ci	if (!rbio->csum_bitmap || !rbio->csum_buf)
168862306a36Sopenharmony_ci		return 0;
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ci	/* No way to verify P/Q as they are not covered by data csum. */
169162306a36Sopenharmony_ci	if (stripe_nr >= rbio->nr_data)
169262306a36Sopenharmony_ci		return 0;
169362306a36Sopenharmony_ci	/*
169462306a36Sopenharmony_ci	 * If we're rebuilding a read, we have to use pages from the
169562306a36Sopenharmony_ci	 * bio list if possible.
169662306a36Sopenharmony_ci	 */
169762306a36Sopenharmony_ci	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
169862306a36Sopenharmony_ci		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
169962306a36Sopenharmony_ci	} else {
170062306a36Sopenharmony_ci		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
170162306a36Sopenharmony_ci	}
170262306a36Sopenharmony_ci
170362306a36Sopenharmony_ci	ASSERT(sector->page);
170462306a36Sopenharmony_ci
170562306a36Sopenharmony_ci	csum_expected = rbio->csum_buf +
170662306a36Sopenharmony_ci			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
170762306a36Sopenharmony_ci			fs_info->csum_size;
170862306a36Sopenharmony_ci	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
170962306a36Sopenharmony_ci				      csum_buf, csum_expected);
171062306a36Sopenharmony_ci	return ret;
171162306a36Sopenharmony_ci}
171262306a36Sopenharmony_ci
171362306a36Sopenharmony_ci/*
171462306a36Sopenharmony_ci * Recover a vertical stripe specified by @sector_nr.
171562306a36Sopenharmony_ci * @pointers and @unmap_array are pre-allocated by the caller, so we don't
171662306a36Sopenharmony_ci * need to allocate/free them again and again.
171762306a36Sopenharmony_ci */
171862306a36Sopenharmony_cistatic int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
171962306a36Sopenharmony_ci			    void **pointers, void **unmap_array)
172062306a36Sopenharmony_ci{
172162306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
172262306a36Sopenharmony_ci	struct sector_ptr *sector;
172362306a36Sopenharmony_ci	const u32 sectorsize = fs_info->sectorsize;
172462306a36Sopenharmony_ci	int found_errors;
172562306a36Sopenharmony_ci	int faila;
172662306a36Sopenharmony_ci	int failb;
172762306a36Sopenharmony_ci	int stripe_nr;
172862306a36Sopenharmony_ci	int ret = 0;
172962306a36Sopenharmony_ci
173062306a36Sopenharmony_ci	/*
173162306a36Sopenharmony_ci	 * Now we just use bitmap to mark the horizontal stripes in
173262306a36Sopenharmony_ci	 * which we have data when doing parity scrub.
173362306a36Sopenharmony_ci	 */
173462306a36Sopenharmony_ci	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
173562306a36Sopenharmony_ci	    !test_bit(sector_nr, &rbio->dbitmap))
173662306a36Sopenharmony_ci		return 0;
173762306a36Sopenharmony_ci
173862306a36Sopenharmony_ci	found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
173962306a36Sopenharmony_ci						 &failb);
174062306a36Sopenharmony_ci	/*
174162306a36Sopenharmony_ci	 * No errors in the vertical stripe, skip it.  Can happen for recovery
174262306a36Sopenharmony_ci	 * where only part of a stripe failed the csum check.
174362306a36Sopenharmony_ci	 */
174462306a36Sopenharmony_ci	if (!found_errors)
174562306a36Sopenharmony_ci		return 0;
174662306a36Sopenharmony_ci
174762306a36Sopenharmony_ci	if (found_errors > rbio->bioc->max_errors)
174862306a36Sopenharmony_ci		return -EIO;
174962306a36Sopenharmony_ci
175062306a36Sopenharmony_ci	/*
175162306a36Sopenharmony_ci	 * Set up our array of pointers with sectors from each stripe.
175262306a36Sopenharmony_ci	 *
175362306a36Sopenharmony_ci	 * NOTE: store a duplicate array of pointers to preserve the
175462306a36Sopenharmony_ci	 * pointer order.
175562306a36Sopenharmony_ci	 */
175662306a36Sopenharmony_ci	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
175762306a36Sopenharmony_ci		/*
175862306a36Sopenharmony_ci		 * If we're rebuilding a read, we have to use pages from the
175962306a36Sopenharmony_ci		 * bio list if possible.
176062306a36Sopenharmony_ci		 */
176162306a36Sopenharmony_ci		if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
176262306a36Sopenharmony_ci			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
176362306a36Sopenharmony_ci		} else {
176462306a36Sopenharmony_ci			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
176562306a36Sopenharmony_ci		}
176662306a36Sopenharmony_ci		ASSERT(sector->page);
176762306a36Sopenharmony_ci		pointers[stripe_nr] = kmap_local_page(sector->page) +
176862306a36Sopenharmony_ci				   sector->pgoff;
176962306a36Sopenharmony_ci		unmap_array[stripe_nr] = pointers[stripe_nr];
177062306a36Sopenharmony_ci	}
177162306a36Sopenharmony_ci
177262306a36Sopenharmony_ci	/* All raid6 handling here */
177362306a36Sopenharmony_ci	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
177462306a36Sopenharmony_ci		/* Single failure, rebuild from parity raid5 style */
177562306a36Sopenharmony_ci		if (failb < 0) {
177662306a36Sopenharmony_ci			if (faila == rbio->nr_data)
177762306a36Sopenharmony_ci				/*
177862306a36Sopenharmony_ci				 * Just the P stripe has failed, without
177962306a36Sopenharmony_ci				 * a bad data or Q stripe.
178062306a36Sopenharmony_ci				 * We have nothing to do, just skip the
178162306a36Sopenharmony_ci				 * recovery for this stripe.
178262306a36Sopenharmony_ci				 */
178362306a36Sopenharmony_ci				goto cleanup;
178462306a36Sopenharmony_ci			/*
178562306a36Sopenharmony_ci			 * a single failure in raid6 is rebuilt
178662306a36Sopenharmony_ci			 * in the pstripe code below
178762306a36Sopenharmony_ci			 */
178862306a36Sopenharmony_ci			goto pstripe;
178962306a36Sopenharmony_ci		}
179062306a36Sopenharmony_ci
179162306a36Sopenharmony_ci		/*
179262306a36Sopenharmony_ci		 * If the Q stripe has failed, do a P-stripe reconstruction from
179362306a36Sopenharmony_ci		 * the xors.
179462306a36Sopenharmony_ci		 * If both the Q stripe and the P stripe have failed, we're
179562306a36Sopenharmony_ci		 * here due to a crc mismatch and we can't give them the
179662306a36Sopenharmony_ci		 * data they want.
179762306a36Sopenharmony_ci		 */
179862306a36Sopenharmony_ci		if (failb == rbio->real_stripes - 1) {
179962306a36Sopenharmony_ci			if (faila == rbio->real_stripes - 2)
180062306a36Sopenharmony_ci				/*
180162306a36Sopenharmony_ci				 * Only P and Q are corrupted.
180262306a36Sopenharmony_ci				 * We only care about data stripes recovery,
180362306a36Sopenharmony_ci				 * can skip this vertical stripe.
180462306a36Sopenharmony_ci				 */
180562306a36Sopenharmony_ci				goto cleanup;
180662306a36Sopenharmony_ci			/*
180762306a36Sopenharmony_ci			 * Otherwise we have one bad data stripe and
180862306a36Sopenharmony_ci			 * a good P stripe.  raid5!
180962306a36Sopenharmony_ci			 */
181062306a36Sopenharmony_ci			goto pstripe;
181162306a36Sopenharmony_ci		}
181262306a36Sopenharmony_ci
181362306a36Sopenharmony_ci		if (failb == rbio->real_stripes - 2) {
181462306a36Sopenharmony_ci			raid6_datap_recov(rbio->real_stripes, sectorsize,
181562306a36Sopenharmony_ci					  faila, pointers);
181662306a36Sopenharmony_ci		} else {
181762306a36Sopenharmony_ci			raid6_2data_recov(rbio->real_stripes, sectorsize,
181862306a36Sopenharmony_ci					  faila, failb, pointers);
181962306a36Sopenharmony_ci		}
182062306a36Sopenharmony_ci	} else {
182162306a36Sopenharmony_ci		void *p;
182262306a36Sopenharmony_ci
182362306a36Sopenharmony_ci		/* Rebuild from P stripe here (raid5 or raid6). */
182462306a36Sopenharmony_ci		ASSERT(failb == -1);
182562306a36Sopenharmony_cipstripe:
182662306a36Sopenharmony_ci		/* Copy parity block into failed block to start with */
182762306a36Sopenharmony_ci		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
182862306a36Sopenharmony_ci
182962306a36Sopenharmony_ci		/* Rearrange the pointer array */
183062306a36Sopenharmony_ci		p = pointers[faila];
183162306a36Sopenharmony_ci		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
183262306a36Sopenharmony_ci		     stripe_nr++)
183362306a36Sopenharmony_ci			pointers[stripe_nr] = pointers[stripe_nr + 1];
183462306a36Sopenharmony_ci		pointers[rbio->nr_data - 1] = p;
183562306a36Sopenharmony_ci
183662306a36Sopenharmony_ci		/* Xor in the rest */
183762306a36Sopenharmony_ci		run_xor(pointers, rbio->nr_data - 1, sectorsize);
183862306a36Sopenharmony_ci
183962306a36Sopenharmony_ci	}
184062306a36Sopenharmony_ci
184162306a36Sopenharmony_ci	/*
184262306a36Sopenharmony_ci	 * Whether this is an RMW or a recovery, we should have all
184362306a36Sopenharmony_ci	 * failed sectors repaired in the vertical stripe, thus they are now
184462306a36Sopenharmony_ci	 * uptodate.
184562306a36Sopenharmony_ci	 * Especially if we decide to cache the rbio, we need to
184662306a36Sopenharmony_ci	 * have at least all data sectors uptodate.
184762306a36Sopenharmony_ci	 *
184862306a36Sopenharmony_ci	 * If possible, also check if the repaired sector matches its data
184962306a36Sopenharmony_ci	 * checksum.
185062306a36Sopenharmony_ci	 */
185162306a36Sopenharmony_ci	if (faila >= 0) {
185262306a36Sopenharmony_ci		ret = verify_one_sector(rbio, faila, sector_nr);
185362306a36Sopenharmony_ci		if (ret < 0)
185462306a36Sopenharmony_ci			goto cleanup;
185562306a36Sopenharmony_ci
185662306a36Sopenharmony_ci		sector = rbio_stripe_sector(rbio, faila, sector_nr);
185762306a36Sopenharmony_ci		sector->uptodate = 1;
185862306a36Sopenharmony_ci	}
185962306a36Sopenharmony_ci	if (failb >= 0) {
186062306a36Sopenharmony_ci		ret = verify_one_sector(rbio, failb, sector_nr);
186162306a36Sopenharmony_ci		if (ret < 0)
186262306a36Sopenharmony_ci			goto cleanup;
186362306a36Sopenharmony_ci
186462306a36Sopenharmony_ci		sector = rbio_stripe_sector(rbio, failb, sector_nr);
186562306a36Sopenharmony_ci		sector->uptodate = 1;
186662306a36Sopenharmony_ci	}
186762306a36Sopenharmony_ci
186862306a36Sopenharmony_cicleanup:
186962306a36Sopenharmony_ci	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
187062306a36Sopenharmony_ci		kunmap_local(unmap_array[stripe_nr]);
187162306a36Sopenharmony_ci	return ret;
187262306a36Sopenharmony_ci}
187362306a36Sopenharmony_ci
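/*
 * Recover all vertical stripes of the rbio by calling recover_vertical() on
 * each of them, sharing the pointer arrays allocated here.
 */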
187462306a36Sopenharmony_cistatic int recover_sectors(struct btrfs_raid_bio *rbio)
187562306a36Sopenharmony_ci{
187662306a36Sopenharmony_ci	void **pointers = NULL;
187762306a36Sopenharmony_ci	void **unmap_array = NULL;
187862306a36Sopenharmony_ci	int sectornr;
187962306a36Sopenharmony_ci	int ret = 0;
188062306a36Sopenharmony_ci
188162306a36Sopenharmony_ci	/*
188262306a36Sopenharmony_ci	 * The @pointers array stores the pointer for each sector.
188362306a36Sopenharmony_ci	 *
188462306a36Sopenharmony_ci	 * @unmap_array stores a copy of the pointers that does not get reordered
188562306a36Sopenharmony_ci	 * during reconstruction so that kunmap_local works.
188662306a36Sopenharmony_ci	 */
188762306a36Sopenharmony_ci	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
188862306a36Sopenharmony_ci	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
188962306a36Sopenharmony_ci	if (!pointers || !unmap_array) {
189062306a36Sopenharmony_ci		ret = -ENOMEM;
189162306a36Sopenharmony_ci		goto out;
189262306a36Sopenharmony_ci	}
189362306a36Sopenharmony_ci
189462306a36Sopenharmony_ci	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
189562306a36Sopenharmony_ci		spin_lock(&rbio->bio_list_lock);
189662306a36Sopenharmony_ci		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
189762306a36Sopenharmony_ci		spin_unlock(&rbio->bio_list_lock);
189862306a36Sopenharmony_ci	}
189962306a36Sopenharmony_ci
190062306a36Sopenharmony_ci	index_rbio_pages(rbio);
190162306a36Sopenharmony_ci
190262306a36Sopenharmony_ci	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
190362306a36Sopenharmony_ci		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
190462306a36Sopenharmony_ci		if (ret < 0)
190562306a36Sopenharmony_ci			break;
190662306a36Sopenharmony_ci	}
190762306a36Sopenharmony_ci
190862306a36Sopenharmony_ciout:
190962306a36Sopenharmony_ci	kfree(pointers);
191062306a36Sopenharmony_ci	kfree(unmap_array);
191162306a36Sopenharmony_ci	return ret;
191262306a36Sopenharmony_ci}
191362306a36Sopenharmony_ci
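/*
 * The main recovery routine: read back all sectors that have not failed
 * (ignoring any cached content), rebuild the failed sectors and finish the
 * original bios with the result.
 */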
191462306a36Sopenharmony_cistatic void recover_rbio(struct btrfs_raid_bio *rbio)
191562306a36Sopenharmony_ci{
191662306a36Sopenharmony_ci	struct bio_list bio_list = BIO_EMPTY_LIST;
191762306a36Sopenharmony_ci	int total_sector_nr;
191862306a36Sopenharmony_ci	int ret = 0;
191962306a36Sopenharmony_ci
192062306a36Sopenharmony_ci	/*
192162306a36Sopenharmony_ci	 * Either we're doing recovery for a read failure or a degraded write;
192262306a36Sopenharmony_ci	 * the caller should have set the error bitmap correctly.
192362306a36Sopenharmony_ci	 */
192462306a36Sopenharmony_ci	ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
192562306a36Sopenharmony_ci
192662306a36Sopenharmony_ci	/* For recovery, we need to read all sectors including P/Q. */
192762306a36Sopenharmony_ci	ret = alloc_rbio_pages(rbio);
192862306a36Sopenharmony_ci	if (ret < 0)
192962306a36Sopenharmony_ci		goto out;
193062306a36Sopenharmony_ci
193162306a36Sopenharmony_ci	index_rbio_pages(rbio);
193262306a36Sopenharmony_ci
193362306a36Sopenharmony_ci	/*
193462306a36Sopenharmony_ci	 * Read everything that hasn't failed.  However, this time we will
193562306a36Sopenharmony_ci	 * not trust any cached sector, as we may read out stale data that
193662306a36Sopenharmony_ci	 * the higher layer is not going to use.
193762306a36Sopenharmony_ci	 *
193862306a36Sopenharmony_ci	 * So here we always re-read everything in the recovery path.
194062306a36Sopenharmony_ci	 */
194162306a36Sopenharmony_ci	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
194262306a36Sopenharmony_ci	     total_sector_nr++) {
194362306a36Sopenharmony_ci		int stripe = total_sector_nr / rbio->stripe_nsectors;
194462306a36Sopenharmony_ci		int sectornr = total_sector_nr % rbio->stripe_nsectors;
194562306a36Sopenharmony_ci		struct sector_ptr *sector;
194662306a36Sopenharmony_ci
194762306a36Sopenharmony_ci		/*
194862306a36Sopenharmony_ci		 * Skip the range which has error.  It can be a range which is
194962306a36Sopenharmony_ci		 * marked error (for csum mismatch), or it can be a missing
195062306a36Sopenharmony_ci		 * device.
195162306a36Sopenharmony_ci		 */
195262306a36Sopenharmony_ci		if (!rbio->bioc->stripes[stripe].dev->bdev ||
195362306a36Sopenharmony_ci		    test_bit(total_sector_nr, rbio->error_bitmap)) {
195462306a36Sopenharmony_ci			/*
195562306a36Sopenharmony_ci			 * Also set the error bit for missing device, which
195662306a36Sopenharmony_ci			 * may not yet have its error bit set.
195762306a36Sopenharmony_ci			 */
195862306a36Sopenharmony_ci			set_bit(total_sector_nr, rbio->error_bitmap);
195962306a36Sopenharmony_ci			continue;
196062306a36Sopenharmony_ci		}
196162306a36Sopenharmony_ci
196262306a36Sopenharmony_ci		sector = rbio_stripe_sector(rbio, stripe, sectornr);
196362306a36Sopenharmony_ci		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
196462306a36Sopenharmony_ci					 sectornr, REQ_OP_READ);
196562306a36Sopenharmony_ci		if (ret < 0) {
196662306a36Sopenharmony_ci			bio_list_put(&bio_list);
196762306a36Sopenharmony_ci			goto out;
196862306a36Sopenharmony_ci		}
196962306a36Sopenharmony_ci	}
197062306a36Sopenharmony_ci
197162306a36Sopenharmony_ci	submit_read_wait_bio_list(rbio, &bio_list);
197262306a36Sopenharmony_ci	ret = recover_sectors(rbio);
197362306a36Sopenharmony_ciout:
197462306a36Sopenharmony_ci	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
197562306a36Sopenharmony_ci}
197662306a36Sopenharmony_ci
197762306a36Sopenharmony_cistatic void recover_rbio_work(struct work_struct *work)
197862306a36Sopenharmony_ci{
197962306a36Sopenharmony_ci	struct btrfs_raid_bio *rbio;
198062306a36Sopenharmony_ci
198162306a36Sopenharmony_ci	rbio = container_of(work, struct btrfs_raid_bio, work);
198262306a36Sopenharmony_ci	if (!lock_stripe_add(rbio))
198362306a36Sopenharmony_ci		recover_rbio(rbio);
198462306a36Sopenharmony_ci}
198562306a36Sopenharmony_ci
198662306a36Sopenharmony_cistatic void recover_rbio_work_locked(struct work_struct *work)
198762306a36Sopenharmony_ci{
198862306a36Sopenharmony_ci	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
198962306a36Sopenharmony_ci}
199062306a36Sopenharmony_ci
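/*
 * For RAID6 retries with mirror_num > 2, mark one extra stripe as failed in
 * every errored vertical stripe, so each retry rebuilds from a different
 * combination of stripes.
 */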
199162306a36Sopenharmony_cistatic void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
199262306a36Sopenharmony_ci{
199362306a36Sopenharmony_ci	bool found = false;
199462306a36Sopenharmony_ci	int sector_nr;
199562306a36Sopenharmony_ci
199662306a36Sopenharmony_ci	/*
199762306a36Sopenharmony_ci	 * This is for RAID6 extra recovery tries, thus the mirror number should
199862306a36Sopenharmony_ci	 * be larger than 2.
199962306a36Sopenharmony_ci	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
200062306a36Sopenharmony_ci	 * RAID5 methods.
200162306a36Sopenharmony_ci	 */
200262306a36Sopenharmony_ci	ASSERT(mirror_num > 2);
200362306a36Sopenharmony_ci	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
200462306a36Sopenharmony_ci		int found_errors;
200562306a36Sopenharmony_ci		int faila;
200662306a36Sopenharmony_ci		int failb;
200762306a36Sopenharmony_ci
200862306a36Sopenharmony_ci		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
200962306a36Sopenharmony_ci							 &faila, &failb);
201062306a36Sopenharmony_ci		/* This vertical stripe doesn't have errors. */
201162306a36Sopenharmony_ci		if (!found_errors)
201262306a36Sopenharmony_ci			continue;
201362306a36Sopenharmony_ci
201462306a36Sopenharmony_ci		/*
201562306a36Sopenharmony_ci		 * If we found errors, there should be only one error marked
201662306a36Sopenharmony_ci		 * by previous set_rbio_range_error().
201762306a36Sopenharmony_ci		 */
201862306a36Sopenharmony_ci		ASSERT(found_errors == 1);
201962306a36Sopenharmony_ci		found = true;
202062306a36Sopenharmony_ci
202162306a36Sopenharmony_ci		/* Now select another stripe to mark as error. */
202262306a36Sopenharmony_ci		failb = rbio->real_stripes - (mirror_num - 1);
202362306a36Sopenharmony_ci		if (failb <= faila)
202462306a36Sopenharmony_ci			failb--;
202562306a36Sopenharmony_ci
202662306a36Sopenharmony_ci		/* Set the extra bit in error bitmap. */
202762306a36Sopenharmony_ci		if (failb >= 0)
202862306a36Sopenharmony_ci			set_bit(failb * rbio->stripe_nsectors + sector_nr,
202962306a36Sopenharmony_ci				rbio->error_bitmap);
203062306a36Sopenharmony_ci	}
203162306a36Sopenharmony_ci
203262306a36Sopenharmony_ci	/* We should have found at least one vertical stripe with error. */
203362306a36Sopenharmony_ci	ASSERT(found);
203462306a36Sopenharmony_ci}
203562306a36Sopenharmony_ci
203662306a36Sopenharmony_ci/*
203762306a36Sopenharmony_ci * The main entry point for reads from the higher layers.  This
203862306a36Sopenharmony_ci * is really only called when the normal read path had a failure,
203962306a36Sopenharmony_ci * so we assume the bio they send down corresponds to a failed part
204062306a36Sopenharmony_ci * of the drive.
204162306a36Sopenharmony_ci */
204262306a36Sopenharmony_civoid raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
204362306a36Sopenharmony_ci			   int mirror_num)
204462306a36Sopenharmony_ci{
204562306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = bioc->fs_info;
204662306a36Sopenharmony_ci	struct btrfs_raid_bio *rbio;
204762306a36Sopenharmony_ci
204862306a36Sopenharmony_ci	rbio = alloc_rbio(fs_info, bioc);
204962306a36Sopenharmony_ci	if (IS_ERR(rbio)) {
205062306a36Sopenharmony_ci		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
205162306a36Sopenharmony_ci		bio_endio(bio);
205262306a36Sopenharmony_ci		return;
205362306a36Sopenharmony_ci	}
205462306a36Sopenharmony_ci
205562306a36Sopenharmony_ci	rbio->operation = BTRFS_RBIO_READ_REBUILD;
205662306a36Sopenharmony_ci	rbio_add_bio(rbio, bio);
205762306a36Sopenharmony_ci
205862306a36Sopenharmony_ci	set_rbio_range_error(rbio, bio);
205962306a36Sopenharmony_ci
206062306a36Sopenharmony_ci	/*
206162306a36Sopenharmony_ci	 * Loop retry:
206262306a36Sopenharmony_ci	 * for 'mirror_num == 2', reconstruct from all other stripes.
206362306a36Sopenharmony_ci	 * for 'mirror_num > 2', select a stripe to fail on every retry.
206462306a36Sopenharmony_ci	 */
206562306a36Sopenharmony_ci	if (mirror_num > 2)
206662306a36Sopenharmony_ci		set_rbio_raid6_extra_error(rbio, mirror_num);
206762306a36Sopenharmony_ci
206862306a36Sopenharmony_ci	start_async_work(rbio, recover_rbio_work);
206962306a36Sopenharmony_ci}
207062306a36Sopenharmony_ci
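/*
 * Look up the data csums covering the full stripe into rbio->csum_buf and
 * rbio->csum_bitmap, so the read end io can verify the data sectors.
 * This is skipped for non-data and mixed block groups.
 */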
207162306a36Sopenharmony_cistatic void fill_data_csums(struct btrfs_raid_bio *rbio)
207262306a36Sopenharmony_ci{
207362306a36Sopenharmony_ci	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
207462306a36Sopenharmony_ci	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
207562306a36Sopenharmony_ci						       rbio->bioc->full_stripe_logical);
207662306a36Sopenharmony_ci	const u64 start = rbio->bioc->full_stripe_logical;
207762306a36Sopenharmony_ci	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
207862306a36Sopenharmony_ci			fs_info->sectorsize_bits;
207962306a36Sopenharmony_ci	int ret;
208062306a36Sopenharmony_ci
208162306a36Sopenharmony_ci	/* The rbio should not have its csum buffer initialized. */
208262306a36Sopenharmony_ci	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
208362306a36Sopenharmony_ci
208462306a36Sopenharmony_ci	/*
208562306a36Sopenharmony_ci	 * Skip the csum search if:
208662306a36Sopenharmony_ci	 *
208762306a36Sopenharmony_ci	 * - The rbio doesn't belong to data block groups
208862306a36Sopenharmony_ci	 *   Then we are doing IO for tree blocks, no need to search csums.
208962306a36Sopenharmony_ci	 *
209062306a36Sopenharmony_ci	 * - The rbio belongs to mixed block groups
209162306a36Sopenharmony_ci	 *   This is to avoid a deadlock: we're already holding the full
209262306a36Sopenharmony_ci	 *   stripe lock, and if we trigger a metadata read that needs to do
209362306a36Sopenharmony_ci	 *   raid56 recovery, we will deadlock.
209462306a36Sopenharmony_ci	 */
209562306a36Sopenharmony_ci	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
209662306a36Sopenharmony_ci	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
209762306a36Sopenharmony_ci		return;
209862306a36Sopenharmony_ci
209962306a36Sopenharmony_ci	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
210062306a36Sopenharmony_ci				 fs_info->csum_size, GFP_NOFS);
210162306a36Sopenharmony_ci	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
210262306a36Sopenharmony_ci					  GFP_NOFS);
210362306a36Sopenharmony_ci	if (!rbio->csum_buf || !rbio->csum_bitmap) {
210462306a36Sopenharmony_ci		ret = -ENOMEM;
210562306a36Sopenharmony_ci		goto error;
210662306a36Sopenharmony_ci	}
210762306a36Sopenharmony_ci
210862306a36Sopenharmony_ci	ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
210962306a36Sopenharmony_ci					rbio->csum_buf, rbio->csum_bitmap);
211062306a36Sopenharmony_ci	if (ret < 0)
211162306a36Sopenharmony_ci		goto error;
211262306a36Sopenharmony_ci	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
211362306a36Sopenharmony_ci		goto no_csum;
211462306a36Sopenharmony_ci	return;
211562306a36Sopenharmony_ci
error:
	/*
	 * We failed to allocate memory or to look up the csums, but it's not
	 * fatal, we can still continue.  However it's better to warn users
	 * that RMW is no longer safe for this particular sub-stripe write.
	 */
	btrfs_warn_rl(fs_info,
"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
			rbio->bioc->full_stripe_logical, ret);
no_csum:
	kfree(rbio->csum_buf);
	bitmap_free(rbio->csum_bitmap);
	rbio->csum_buf = NULL;
	rbio->csum_bitmap = NULL;
}

static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Fill the data csums we need for data verification.  We need to fill
	 * the csum_bitmap/csum_buf first, as our endio function will try to
	 * verify the data sectors.
	 */
	fill_data_csums(rbio);

	/*
	 * Build a list of bios to read all sectors (including data and P/Q).
	 *
	 * This is to compensate for the later csum verification and recovery.
	 */
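	/*
	 * total_sector_nr walks the sectors stripe by stripe, so e.g. with
	 * 64K stripes and a 4K sector size (16 sectors per stripe),
	 * total_sector_nr 17 maps to stripe 1, sectornr 1.
	 */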
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct sector_ptr *sector;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
			       stripe, sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	/*
	 * We may or may not have any corrupted sectors (including missing dev
	 * and csum mismatch), just let recover_sectors() handle them all.
	 */
	submit_read_wait_bio_list(rbio, &bio_list);
	return recover_sectors(rbio);
}

static void raid_wait_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;

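	/*
	 * On write failure, mark the sectors covered by this bio in the error
	 * bitmap, so the final per-vertical-stripe check can decide whether
	 * we are still within the tolerated number of failures.
	 */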
	if (err)
		rbio_update_error_bitmap(rbio, bio);
	bio_put(bio);
	if (atomic_dec_and_test(&rbio->stripes_pending))
		wake_up(&rbio->io_wait);
}

static void submit_write_bios(struct btrfs_raid_bio *rbio,
			      struct bio_list *bio_list)
{
	struct bio *bio;

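	/*
	 * One pending count per bio; the write end_io drops it and wakes the
	 * waiter when it reaches zero.
	 */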
	atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
	while ((bio = bio_list_pop(bio_list))) {
		bio->bi_end_io = raid_wait_write_end_io;

		if (trace_raid56_write_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_write(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
}

/*
 * Determine if we need to read any sector from the disk.
 * Should only be used in the RMW path, to skip a cached rbio.
 */
static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
{
	int i;

	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
		struct sector_ptr *sector = &rbio->stripe_sectors[i];

		/*
		 * We have a sector which has neither a page nor the uptodate
		 * flag set, thus this rbio cannot be a cached one, as a
		 * cached one must have all its data sectors present and
		 * uptodate.
		 */
		if (!sector->page || !sector->uptodate)
			return true;
	}
	return false;
}

static void rmw_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list;
	int sectornr;
	int ret = 0;

	/*
	 * Allocate the pages for parity first, as P/Q pages will always be
	 * needed for both full-stripe and sub-stripe writes.
	 */
	ret = alloc_rbio_parity_pages(rbio);
	if (ret < 0)
		goto out;

	/*
	 * For a full stripe write, or when every data sector is already
	 * cached, we can go to the write path immediately.
	 */
	if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
		/*
		 * We're doing a sub-stripe write, so we also need all the
		 * data stripes to do the full RMW.
		 */
		ret = alloc_rbio_data_pages(rbio);
		if (ret < 0)
			goto out;

		index_rbio_pages(rbio);

		ret = rmw_read_wait_recover(rbio);
		if (ret < 0)
			goto out;
	}

	/*
	 * At this stage we're not allowed to add any new bios to the
	 * bio list any more; anyone else that wants to change this stripe
	 * needs to do their own rmw.
	 */
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);

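	/*
	 * Start from a clean error bitmap, so only errors from the writes
	 * below are counted against max_errors.
	 */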
	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	index_rbio_pages(rbio);

	/*
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

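	/*
	 * Recompute P (and Q for RAID6) for every vertical stripe from the
	 * data sectors at the same offset.
	 */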
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		generate_pq_vertical(rbio, sectornr);

	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
	if (ret < 0)
		goto out;

	/* We should have at least one bio assembled. */
	ASSERT(bio_list_size(&bio_list));
	submit_write_bios(rbio, &bio_list);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

	/* We may have more errors than our tolerance during the write. */
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (lock_stripe_add(rbio) == 0)
		rmw_rbio(rbio);
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We need to make sure all the pages that are added to the scrub/replace
 * raid bio are correct and not changed during the scrub/replace.  That is,
 * those pages only hold metadata or file data with checksums.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

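	/* @dbitmap marks which sectors inside the stripe need to be scrubbed. */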
	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}

/*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
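		/*
		 * index is the page in stripe_pages backing this sector; e.g.
		 * with 4K sectors and 4K pages it equals total_sector_nr.
		 */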

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}

static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * If replace is running and our P/Q stripe is being replaced, then we
	 * need to duplicate the final write to the replace target.
	 */
	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * The higher layers (scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		return -ENOMEM;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			return -ENOMEM;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

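	/*
	 * For every sector marked in dbitmap: recompute the parity from the
	 * data sectors, compare it with what is on disk, and only queue a
	 * writeback for the sectors where they differ.
	 */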
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* First collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check the scrubbed parity and repair it if needed */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* The parity is correct, no need to write it back */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}

	/*
	 * Time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	/*
	 * Replace is running and our parity stripe needs to be duplicated to
	 * the target device.  Check we have a valid source stripe number.
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);
	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	submit_write_bios(rbio, &bio_list);
	return 0;

cleanup:
	bio_list_put(&bio_list);
	return ret;
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sector_nr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores a copy of the pointers that does not get
	 * reordered during reconstruction, so that kunmap_local() works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

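	/*
	 * Check each vertical stripe and rebuild any data sectors we can,
	 * bailing out if a stripe has more failures than the RAID profile
	 * can tolerate.
	 */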
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			goto out;
		}
		if (found_errors == 0)
			continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;
		/*
		 * Because we cannot use the parity being scrubbed to repair
		 * the data, our repair capability is reduced.  (In the case
		 * of RAID5, we cannot repair anything.)
		 */
		if (dfail > rbio->bioc->max_errors - 1) {
			ret = -EIO;
			goto out;
		}
		/*
		 * If all the data is good and only the parity is wrong, just
		 * repair the parity, no need to recover data stripes.
		 */
		if (dfail == 0)
			continue;

		/*
		 * At this point we have one corrupted data stripe and one
		 * corrupted parity on RAID6.  If the corrupted parity is the
		 * one being scrubbed, we can luckily use the other parity to
		 * repair the data; otherwise we cannot repair the data stripe.
		 */
		if (failp != rbio->scrubp) {
			ret = -EIO;
			goto out;
		}

		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
			goto out;
	}
out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/* Build a list of bios to read all the missing parts. */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk. If sector_in_rbio() finds a sector
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector.  If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret) {
			bio_list_put(&bio_list);
			return ret;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	return 0;
}

static void scrub_rbio(struct btrfs_raid_bio *rbio)
{
	int sector_nr;
	int ret;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto out;

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	ret = scrub_assemble_read_bios(rbio);
	if (ret < 0)
		goto out;

	/* We may have some failures, recover the failed sectors first. */
	ret = recover_scrub_rbio(rbio);
	if (ret < 0)
		goto out;

	/*
	 * We have every sector properly prepared. We can finish the scrub
	 * and write back the good content.
	 */
	ret = finish_parity_scrub(rbio);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
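	/*
	 * The writes are done; check that every vertical stripe is still
	 * within the error tolerance.
	 */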
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void scrub_rbio_work_locked(struct work_struct *work)
{
	scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_rbio_work_locked);
}

/*
 * This is for scrub call sites where we already have correct data contents.
 * This allows us to avoid reading data stripes again.
 *
 * Unfortunately here we have to do a page copy rather than reuse the pages,
 * because the rbio has its own page management for its cache.
 */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical)
{
	const u64 offset_in_full_stripe = data_logical -
					  rbio->bioc->full_stripe_logical;
	const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	const u32 sectors_per_page = PAGE_SIZE / sectorsize;
	int ret;

	/*
	 * If we hit ENOMEM temporarily, but later at
	 * raid56_parity_submit_scrub_rbio() time it succeeds, we just do
	 * the extra read, not a big deal.
	 *
	 * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
	 * the bio will get a proper error number set.
	 */
	ret = alloc_rbio_data_pages(rbio);
	if (ret < 0)
		return;

	/* data_logical must be at stripe boundary and inside the full stripe. */
	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));

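	/*
	 * Copy one BTRFS_STRIPE_LEN worth of pages and mark the covered
	 * sectors uptodate, so the later RMW/scrub path can skip reading
	 * them from disk.
	 */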
	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
		struct page *dst = rbio->stripe_pages[page_nr + page_index];
		struct page *src = data_pages[page_nr];

		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		for (int sector_nr = sectors_per_page * page_index;
		     sector_nr < sectors_per_page * (page_index + 1);
		     sector_nr++)
			rbio->stripe_sectors[sector_nr].uptodate = true;
	}
}