162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * raid1.c : Multiple Devices driver for Linux
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
662306a36Sopenharmony_ci *
762306a36Sopenharmony_ci * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
862306a36Sopenharmony_ci *
962306a36Sopenharmony_ci * RAID-1 management functions.
1062306a36Sopenharmony_ci *
1162306a36Sopenharmony_ci * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
1262306a36Sopenharmony_ci *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
1462306a36Sopenharmony_ci * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
1562306a36Sopenharmony_ci *
1662306a36Sopenharmony_ci * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
1762306a36Sopenharmony_ci * bitmapped intelligence in resync:
1862306a36Sopenharmony_ci *
1962306a36Sopenharmony_ci *      - bitmap marked during normal i/o
2062306a36Sopenharmony_ci *      - bitmap used to skip nondirty blocks during sync
2162306a36Sopenharmony_ci *
2262306a36Sopenharmony_ci * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
2362306a36Sopenharmony_ci * - persistent bitmap code
2462306a36Sopenharmony_ci */
2562306a36Sopenharmony_ci
2662306a36Sopenharmony_ci#include <linux/slab.h>
2762306a36Sopenharmony_ci#include <linux/delay.h>
2862306a36Sopenharmony_ci#include <linux/blkdev.h>
2962306a36Sopenharmony_ci#include <linux/module.h>
3062306a36Sopenharmony_ci#include <linux/seq_file.h>
3162306a36Sopenharmony_ci#include <linux/ratelimit.h>
3262306a36Sopenharmony_ci#include <linux/interval_tree_generic.h>
3362306a36Sopenharmony_ci
3462306a36Sopenharmony_ci#include <trace/events/block.h>
3562306a36Sopenharmony_ci
3662306a36Sopenharmony_ci#include "md.h"
3762306a36Sopenharmony_ci#include "raid1.h"
3862306a36Sopenharmony_ci#include "md-bitmap.h"
3962306a36Sopenharmony_ci
/*
 * mddev feature-flag bits (write journal and PPL variants) that the
 * raid1 personality cannot handle.
 */
#define UNSUPPORTED_MDDEV_FLAGS		\
	((1L << MD_HAS_JOURNAL) |	\
	 (1L << MD_JOURNAL_CLEAN) |	\
	 (1L << MD_HAS_PPL) |		\
	 (1L << MD_HAS_MULTIPLE_PPLS))
4562306a36Sopenharmony_ci
4662306a36Sopenharmony_cistatic void allow_barrier(struct r1conf *conf, sector_t sector_nr);
4762306a36Sopenharmony_cistatic void lower_barrier(struct r1conf *conf, sector_t sector_nr);
4862306a36Sopenharmony_ci
/*
 * Emit a message into the block trace stream of @md's request queue,
 * prefixed with "raid1 "; a no-op when the queue is not set up.
 */
#define raid1_log(md, fmt, args...)				\
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_ci#include "raid1-10.c"
5362306a36Sopenharmony_ci
/*
 * Generate the static interval-tree helpers raid1_rb_* used for write
 * serialization, keyed on struct serial_info's [start, last] sector range.
 */
#define START(node) ((node)->start)
#define LAST(node) ((node)->last)
INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
		     START, LAST, static inline, raid1_rb);
5862306a36Sopenharmony_ci
/*
 * Try to record the write range [lo, hi) of @r1_bio in @rdev's
 * per-bucket (@idx) serialization interval tree.
 *
 * Returns 0 after inserting @si when no overlapping range is already
 * recorded, or -EBUSY (leaving @si untouched) when a conflicting
 * write is still in flight and the caller must wait.
 */
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
				struct serial_info *si, int idx)
{
	unsigned long flags;
	int ret = 0;
	sector_t lo = r1_bio->sector;
	sector_t hi = lo + r1_bio->sectors;
	struct serial_in_rdev *serial = &rdev->serial[idx];

	spin_lock_irqsave(&serial->serial_lock, flags);
	/* collision happened */
	if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
		ret = -EBUSY;
	else {
		si->start = lo;
		si->last = hi;
		raid1_rb_insert(si, &serial->serial_rb);
	}
	spin_unlock_irqrestore(&serial->serial_lock, flags);

	return ret;
}
8162306a36Sopenharmony_ci
/*
 * Block until the write range of @r1_bio can be recorded in @rdev's
 * serialization tree, i.e. until no overlapping write is in flight.
 * The serial_info used for the record is taken from mddev's
 * serial_info_pool and is released later by remove_serial().
 */
static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
	struct mddev *mddev = rdev->mddev;
	struct serial_info *si;
	int idx = sector_to_idx(r1_bio->sector);
	struct serial_in_rdev *serial = &rdev->serial[idx];

	/* Serialization requested without the pool being set up is a bug. */
	if (WARN_ON(!mddev->serial_info_pool))
		return;
	si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
	wait_event(serial->serial_io_wait,
		   check_and_add_serial(rdev, r1_bio, si, idx) == 0);
}
9562306a36Sopenharmony_ci
/*
 * Drop the serialization record that exactly matches [lo, hi) for
 * @rdev, return it to the mempool, and wake any writer waiting in
 * wait_for_serialization().  Warns if no matching record exists,
 * since every serialized write must have been recorded.
 */
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
	struct serial_info *si;
	unsigned long flags;
	int found = 0;
	struct mddev *mddev = rdev->mddev;
	int idx = sector_to_idx(lo);
	struct serial_in_rdev *serial = &rdev->serial[idx];

	spin_lock_irqsave(&serial->serial_lock, flags);
	for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
	     si; si = raid1_rb_iter_next(si, lo, hi)) {
		/* only remove the exact range, not any overlapping one */
		if (si->start == lo && si->last == hi) {
			raid1_rb_remove(si, &serial->serial_rb);
			mempool_free(si, mddev->serial_info_pool);
			found = 1;
			break;
		}
	}
	if (!found)
		WARN(1, "The write IO is not recorded for serialization\n");
	spin_unlock_irqrestore(&serial->serial_lock, flags);
	wake_up(&serial->serial_io_wait);
}
12062306a36Sopenharmony_ci
/*
 * For a resync bio the r1bio pointer can be retrieved from the per-bio
 * 'struct resync_pages' stashed in bi_private (see r1buf_pool_alloc()).
 */
static inline struct r1bio *get_resync_r1bio(struct bio *bio)
{
	return get_resync_pages(bio)->raid_bio;
}
12962306a36Sopenharmony_ci
/*
 * mempool allocator for struct r1bio.  The allocation is sized so the
 * trailing bios[] flexible array holds pi->raid_disks pointers; the
 * memory is zeroed.
 */
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	int size = offsetof(struct r1bio, bios[pi->raid_disks]);

	/* allocate a r1bio with room for raid_disks entries in the bios array */
	return kzalloc(size, gfp_flags);
}
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci#define RESYNC_DEPTH 32
14062306a36Sopenharmony_ci#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
14162306a36Sopenharmony_ci#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
14262306a36Sopenharmony_ci#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
14362306a36Sopenharmony_ci#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
14462306a36Sopenharmony_ci#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
14562306a36Sopenharmony_ci
/*
 * mempool allocator for resync/recovery buffers: allocates the r1bio
 * itself, one bio per disk, and the data pages that those bios share
 * (or each own, for a user-requested check/repair).  Returns NULL and
 * unwinds all partial allocations on failure.
 */
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct pool_info *pi = data;
	struct r1bio *r1_bio;
	struct bio *bio;
	int need_pages;
	int j;
	struct resync_pages *rps;

	r1_bio = r1bio_pool_alloc(gfp_flags, pi);
	if (!r1_bio)
		return NULL;

	/* one resync_pages descriptor per disk */
	rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages),
			    gfp_flags);
	if (!rps)
		goto out_free_r1bio;

	/*
	 * Allocate bios : 1 for reading, n-1 for writing
	 */
	for (j = pi->raid_disks ; j-- ; ) {
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r1_bio->bios[j] = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them to
	 * the first bio.
	 * If this is a user-requested check/repair, allocate
	 * RESYNC_PAGES for each bio.
	 */
	if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
		need_pages = pi->raid_disks;
	else
		need_pages = 1;
	for (j = 0; j < pi->raid_disks; j++) {
		struct resync_pages *rp = &rps[j];

		bio = r1_bio->bios[j];

		if (j < need_pages) {
			if (resync_alloc_pages(rp, gfp_flags))
				goto out_free_pages;
		} else {
			/* share rps[0]'s pages, taking extra page references */
			memcpy(rp, &rps[0], sizeof(*rp));
			resync_get_all_pages(rp);
		}

		rp->raid_bio = r1_bio;
		bio->bi_private = rp;
	}

	r1_bio->master_bio = NULL;

	return r1_bio;

out_free_pages:
	while (--j >= 0)
		resync_free_pages(&rps[j]);

out_free_bio:
	/*
	 * Free bios above index j; when falling through from
	 * out_free_pages, j is -1 so every bio is freed.
	 */
	while (++j < pi->raid_disks) {
		bio_uninit(r1_bio->bios[j]);
		kfree(r1_bio->bios[j]);
	}
	kfree(rps);

out_free_r1bio:
	rbio_pool_free(r1_bio, data);
	return NULL;
}
22062306a36Sopenharmony_ci
/*
 * mempool destructor matching r1buf_pool_alloc(): releases each disk's
 * data pages and bio, the resync_pages array (reachable via the first
 * bio's bi_private), and the r1bio itself.
 */
static void r1buf_pool_free(void *__r1_bio, void *data)
{
	struct pool_info *pi = data;
	int i;
	struct r1bio *r1bio = __r1_bio;
	struct resync_pages *rp = NULL;

	for (i = pi->raid_disks; i--; ) {
		rp = get_resync_pages(r1bio->bios[i]);
		resync_free_pages(rp);
		bio_uninit(r1bio->bios[i]);
		kfree(r1bio->bios[i]);
	}

	/* resync pages array stored in the 1st bio's .bi_private */
	kfree(rp);

	rbio_pool_free(r1bio, data);
}
24062306a36Sopenharmony_ci
24162306a36Sopenharmony_cistatic void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
24262306a36Sopenharmony_ci{
24362306a36Sopenharmony_ci	int i;
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks * 2; i++) {
24662306a36Sopenharmony_ci		struct bio **bio = r1_bio->bios + i;
24762306a36Sopenharmony_ci		if (!BIO_SPECIAL(*bio))
24862306a36Sopenharmony_ci			bio_put(*bio);
24962306a36Sopenharmony_ci		*bio = NULL;
25062306a36Sopenharmony_ci	}
25162306a36Sopenharmony_ci}
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_cistatic void free_r1bio(struct r1bio *r1_bio)
25462306a36Sopenharmony_ci{
25562306a36Sopenharmony_ci	struct r1conf *conf = r1_bio->mddev->private;
25662306a36Sopenharmony_ci
25762306a36Sopenharmony_ci	put_all_bios(conf, r1_bio);
25862306a36Sopenharmony_ci	mempool_free(r1_bio, &conf->r1bio_pool);
25962306a36Sopenharmony_ci}
26062306a36Sopenharmony_ci
/*
 * Release a resync buffer r1bio: drop the rdev reference taken for
 * each leg that was actually issued (bi_end_io set), return the
 * buffer to the resync pool, and lower the resync barrier for its
 * barrier unit.
 */
static void put_buf(struct r1bio *r1_bio)
{
	struct r1conf *conf = r1_bio->mddev->private;
	sector_t sect = r1_bio->sector;	/* saved: r1_bio is freed below */
	int i;

	for (i = 0; i < conf->raid_disks * 2; i++) {
		struct bio *bio = r1_bio->bios[i];
		if (bio->bi_end_io)
			rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
	}

	mempool_free(r1_bio, &conf->r1buf_pool);

	lower_barrier(conf, sect);
}
27762306a36Sopenharmony_ci
/*
 * Hand @r1_bio to the raid1d thread by queueing it on conf->retry_list,
 * accounting it in the per-barrier-bucket nr_queued counter, and waking
 * both barrier waiters and the md thread.
 */
static void reschedule_retry(struct r1bio *r1_bio)
{
	unsigned long flags;
	struct mddev *mddev = r1_bio->mddev;
	struct r1conf *conf = mddev->private;
	int idx;

	idx = sector_to_idx(r1_bio->sector);
	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r1_bio->retry_list, &conf->retry_list);
	atomic_inc(&conf->nr_queued[idx]);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	wake_up(&conf->wait_barrier);
	md_wakeup_thread(mddev->thread);
}
29462306a36Sopenharmony_ci
29562306a36Sopenharmony_ci/*
29662306a36Sopenharmony_ci * raid_end_bio_io() is called when we have finished servicing a mirrored
29762306a36Sopenharmony_ci * operation and are ready to return a success/failure code to the buffer
29862306a36Sopenharmony_ci * cache layer.
29962306a36Sopenharmony_ci */
30062306a36Sopenharmony_cistatic void call_bio_endio(struct r1bio *r1_bio)
30162306a36Sopenharmony_ci{
30262306a36Sopenharmony_ci	struct bio *bio = r1_bio->master_bio;
30362306a36Sopenharmony_ci
30462306a36Sopenharmony_ci	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
30562306a36Sopenharmony_ci		bio->bi_status = BLK_STS_IOERR;
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_ci	bio_endio(bio);
30862306a36Sopenharmony_ci}
30962306a36Sopenharmony_ci
static void raid_end_bio_io(struct r1bio *r1_bio)
{
	struct bio *bio = r1_bio->master_bio;
	struct r1conf *conf = r1_bio->mddev->private;
	sector_t sector = r1_bio->sector;	/* saved: r1_bio is freed below */

	/* if nobody has done the final endio yet, do it now */
	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
		pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
			 (bio_data_dir(bio) == WRITE) ? "write" : "read",
			 (unsigned long long) bio->bi_iter.bi_sector,
			 (unsigned long long) bio_end_sector(bio) - 1);

		call_bio_endio(r1_bio);
	}

	free_r1bio(r1_bio);
	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.  All I/Os, even write-behind writes, are done.
	 */
	allow_barrier(conf, sector);
}
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci/*
33562306a36Sopenharmony_ci * Update disk head position estimator based on IRQ completion info.
33662306a36Sopenharmony_ci */
33762306a36Sopenharmony_cistatic inline void update_head_pos(int disk, struct r1bio *r1_bio)
33862306a36Sopenharmony_ci{
33962306a36Sopenharmony_ci	struct r1conf *conf = r1_bio->mddev->private;
34062306a36Sopenharmony_ci
34162306a36Sopenharmony_ci	conf->mirrors[disk].head_position =
34262306a36Sopenharmony_ci		r1_bio->sector + (r1_bio->sectors);
34362306a36Sopenharmony_ci}
34462306a36Sopenharmony_ci
34562306a36Sopenharmony_ci/*
34662306a36Sopenharmony_ci * Find the disk number which triggered given bio
34762306a36Sopenharmony_ci */
34862306a36Sopenharmony_cistatic int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
34962306a36Sopenharmony_ci{
35062306a36Sopenharmony_ci	int mirror;
35162306a36Sopenharmony_ci	struct r1conf *conf = r1_bio->mddev->private;
35262306a36Sopenharmony_ci	int raid_disks = conf->raid_disks;
35362306a36Sopenharmony_ci
35462306a36Sopenharmony_ci	for (mirror = 0; mirror < raid_disks * 2; mirror++)
35562306a36Sopenharmony_ci		if (r1_bio->bios[mirror] == bio)
35662306a36Sopenharmony_ci			break;
35762306a36Sopenharmony_ci
35862306a36Sopenharmony_ci	BUG_ON(mirror == raid_disks * 2);
35962306a36Sopenharmony_ci	update_head_pos(mirror, r1_bio);
36062306a36Sopenharmony_ci
36162306a36Sopenharmony_ci	return mirror;
36262306a36Sopenharmony_ci}
36362306a36Sopenharmony_ci
/*
 * Completion handler for a normal (non-resync) read.
 *
 * On success (or when no other mirror could serve the data) the master
 * bio is completed; otherwise the r1_bio is handed to raid1d via
 * reschedule_retry() so the read can be retried on another mirror.
 * In the retry case the reference on the read rdev is deliberately
 * kept until the retry path decides what to do with it.
 */
static void raid1_end_read_request(struct bio *bio)
{
	int uptodate = !bio->bi_status;
	struct r1bio *r1_bio = bio->bi_private;
	struct r1conf *conf = r1_bio->mddev->private;
	struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;

	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(r1_bio->read_disk, r1_bio);

	if (uptodate)
		set_bit(R1BIO_Uptodate, &r1_bio->state);
	else if (test_bit(FailFast, &rdev->flags) &&
		 test_bit(R1BIO_FailFast, &r1_bio->state))
		/* This was a fail-fast read so we definitely
		 * want to retry */
		;
	else {
		/* If all other devices have failed, we want to return
		 * the error upwards rather than fail the last device.
		 * Here we redefine "uptodate" to mean "Don't want to retry"
		 */
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		if (r1_bio->mddev->degraded == conf->raid_disks ||
		    (r1_bio->mddev->degraded == conf->raid_disks-1 &&
		     test_bit(In_sync, &rdev->flags)))
			uptodate = 1;
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}

	if (uptodate) {
		raid_end_bio_io(r1_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error:
		 */
		pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   rdev->bdev,
				   (unsigned long long)r1_bio->sector);
		set_bit(R1BIO_ReadError, &r1_bio->state);
		reschedule_retry(r1_bio);
		/* don't drop the reference on read_disk yet */
	}
}
41362306a36Sopenharmony_ci
/*
 * Final accounting once every leg of a mirrored write has completed:
 * free write-behind pages, update the write-intent bitmap, and drop
 * the md-level write count.
 */
static void close_write(struct r1bio *r1_bio)
{
	/* it really is the end of this request */
	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
		bio_free_pages(r1_bio->behind_master_bio);
		bio_put(r1_bio->behind_master_bio);
		r1_bio->behind_master_bio = NULL;
	}
	/* clear the bitmap if all writes complete successfully */
	md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
			   r1_bio->sectors,
			   !test_bit(R1BIO_Degraded, &r1_bio->state),
			   test_bit(R1BIO_BehindIO, &r1_bio->state));
	md_write_end(r1_bio->mddev);
}
42962306a36Sopenharmony_ci
43062306a36Sopenharmony_cistatic void r1_bio_write_done(struct r1bio *r1_bio)
43162306a36Sopenharmony_ci{
43262306a36Sopenharmony_ci	if (!atomic_dec_and_test(&r1_bio->remaining))
43362306a36Sopenharmony_ci		return;
43462306a36Sopenharmony_ci
43562306a36Sopenharmony_ci	if (test_bit(R1BIO_WriteError, &r1_bio->state))
43662306a36Sopenharmony_ci		reschedule_retry(r1_bio);
43762306a36Sopenharmony_ci	else {
43862306a36Sopenharmony_ci		close_write(r1_bio);
43962306a36Sopenharmony_ci		if (test_bit(R1BIO_MadeGood, &r1_bio->state))
44062306a36Sopenharmony_ci			reschedule_retry(r1_bio);
44162306a36Sopenharmony_ci		else
44262306a36Sopenharmony_ci			raid_end_bio_io(r1_bio);
44362306a36Sopenharmony_ci	}
44462306a36Sopenharmony_ci}
44562306a36Sopenharmony_ci
/*
 * Completion handler for one leg of a normal (non-resync) mirrored
 * write.
 *
 * Records write errors (possibly failing the device for fail-fast
 * writes), marks successfully overwritten bad blocks for clearing,
 * performs write-behind completion accounting, and finally drops this
 * leg's contribution via r1_bio_write_done().
 */
static void raid1_end_write_request(struct bio *bio)
{
	struct r1bio *r1_bio = bio->bi_private;
	int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
	struct r1conf *conf = r1_bio->mddev->private;
	struct bio *to_put = NULL;
	int mirror = find_bio_disk(r1_bio, bio);
	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
	bool discard_error;
	sector_t lo = r1_bio->sector;
	sector_t hi = r1_bio->sector + r1_bio->sectors;

	/* a failed discard is not treated as a mirror write error */
	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;

	/*
	 * 'one mirror IO has finished' event handler:
	 */
	if (bio->bi_status && !discard_error) {
		set_bit(WriteErrorSeen,	&rdev->flags);
		if (!test_and_set_bit(WantReplacement, &rdev->flags))
			set_bit(MD_RECOVERY_NEEDED, &
				conf->mddev->recovery);

		if (test_bit(FailFast, &rdev->flags) &&
		    (bio->bi_opf & MD_FAILFAST) &&
		    /* We never try FailFast to WriteMostly devices */
		    !test_bit(WriteMostly, &rdev->flags)) {
			md_error(r1_bio->mddev, rdev);
		}

		/*
		 * When the device is faulty, it is not necessary to
		 * handle write error.
		 */
		if (!test_bit(Faulty, &rdev->flags))
			set_bit(R1BIO_WriteError, &r1_bio->state);
		else {
			/* Fail the request */
			set_bit(R1BIO_Degraded, &r1_bio->state);
			/* Finished with this branch */
			r1_bio->bios[mirror] = NULL;
			to_put = bio;
		}
	} else {
		/*
		 * Set R1BIO_Uptodate in our master bio, so that we
		 * will return a good error code for to the higher
		 * levels even if IO on some other mirrored buffer
		 * fails.
		 *
		 * The 'master' represents the composite IO operation
		 * to user-side. So if something waits for IO, then it
		 * will wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		r1_bio->bios[mirror] = NULL;
		to_put = bio;
		/*
		 * Do not set R1BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have felt
		 * before rdev->recovery_offset, but for simplicity we don't
		 * check this here.
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R1BIO_Uptodate, &r1_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
				&first_bad, &bad_sectors) && !discard_error) {
			r1_bio->bios[mirror] = IO_MADE_GOOD;
			set_bit(R1BIO_MadeGood, &r1_bio->state);
		}
	}

	if (behind) {
		if (test_bit(CollisionCheck, &rdev->flags))
			remove_serial(rdev, lo, hi);
		if (test_bit(WriteMostly, &rdev->flags))
			atomic_dec(&r1_bio->behind_remaining);

		/*
		 * In behind mode, we ACK the master bio once the I/O
		 * has safely reached all non-writemostly
		 * disks. Setting the Returned bit ensures that this
		 * gets done only once -- we don't ever want to return
		 * -EIO here, instead we'll wait
		 */
		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
			/* Maybe we can return now */
			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
				struct bio *mbio = r1_bio->master_bio;
				pr_debug("raid1: behind end write sectors"
					 " %llu-%llu\n",
					 (unsigned long long) mbio->bi_iter.bi_sector,
					 (unsigned long long) bio_end_sector(mbio) - 1);
				call_bio_endio(r1_bio);
			}
		}
	} else if (rdev->mddev->serialize_policy)
		remove_serial(rdev, lo, hi);

	/* a NULL slot means this leg is done with the rdev */
	if (r1_bio->bios[mirror] == NULL)
		rdev_dec_pending(rdev, conf->mddev);

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	r1_bio_write_done(r1_bio);

	if (to_put)
		bio_put(to_put);
}
56462306a36Sopenharmony_ci
56562306a36Sopenharmony_cistatic sector_t align_to_barrier_unit_end(sector_t start_sector,
56662306a36Sopenharmony_ci					  sector_t sectors)
56762306a36Sopenharmony_ci{
56862306a36Sopenharmony_ci	sector_t len;
56962306a36Sopenharmony_ci
57062306a36Sopenharmony_ci	WARN_ON(sectors == 0);
57162306a36Sopenharmony_ci	/*
57262306a36Sopenharmony_ci	 * len is the number of sectors from start_sector to end of the
57362306a36Sopenharmony_ci	 * barrier unit which start_sector belongs to.
57462306a36Sopenharmony_ci	 */
57562306a36Sopenharmony_ci	len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
57662306a36Sopenharmony_ci	      start_sector;
57762306a36Sopenharmony_ci
57862306a36Sopenharmony_ci	if (len > sectors)
57962306a36Sopenharmony_ci		len = sectors;
58062306a36Sopenharmony_ci
58162306a36Sopenharmony_ci	return len;
58262306a36Sopenharmony_ci}
58362306a36Sopenharmony_ci
58462306a36Sopenharmony_ci/*
58562306a36Sopenharmony_ci * This routine returns the disk from which the requested read should
58662306a36Sopenharmony_ci * be done. There is a per-array 'next expected sequential IO' sector
58762306a36Sopenharmony_ci * number - if this matches on the next IO then we use the last disk.
58862306a36Sopenharmony_ci * There is also a per-disk 'last know head position' sector that is
58962306a36Sopenharmony_ci * maintained from IRQ contexts, both the normal and the resync IO
59062306a36Sopenharmony_ci * completion handlers update this position correctly. If there is no
59162306a36Sopenharmony_ci * perfect sequential match then we pick the disk whose head is closest.
59262306a36Sopenharmony_ci *
59362306a36Sopenharmony_ci * If there are 2 mirrors in the same 2 devices, performance degrades
59462306a36Sopenharmony_ci * because position is mirror, not device based.
59562306a36Sopenharmony_ci *
59662306a36Sopenharmony_ci * The rdev for the device selected will have nr_pending incremented.
59762306a36Sopenharmony_ci */
59862306a36Sopenharmony_cistatic int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
59962306a36Sopenharmony_ci{
60062306a36Sopenharmony_ci	const sector_t this_sector = r1_bio->sector;
60162306a36Sopenharmony_ci	int sectors;
60262306a36Sopenharmony_ci	int best_good_sectors;
60362306a36Sopenharmony_ci	int best_disk, best_dist_disk, best_pending_disk;
60462306a36Sopenharmony_ci	int has_nonrot_disk;
60562306a36Sopenharmony_ci	int disk;
60662306a36Sopenharmony_ci	sector_t best_dist;
60762306a36Sopenharmony_ci	unsigned int min_pending;
60862306a36Sopenharmony_ci	struct md_rdev *rdev;
60962306a36Sopenharmony_ci	int choose_first;
61062306a36Sopenharmony_ci	int choose_next_idle;
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci	rcu_read_lock();
61362306a36Sopenharmony_ci	/*
61462306a36Sopenharmony_ci	 * Check if we can balance. We can balance on the whole
61562306a36Sopenharmony_ci	 * device if no resync is going on, or below the resync window.
61662306a36Sopenharmony_ci	 * We take the first readable disk when above the resync window.
61762306a36Sopenharmony_ci	 */
61862306a36Sopenharmony_ci retry:
61962306a36Sopenharmony_ci	sectors = r1_bio->sectors;
62062306a36Sopenharmony_ci	best_disk = -1;
62162306a36Sopenharmony_ci	best_dist_disk = -1;
62262306a36Sopenharmony_ci	best_dist = MaxSector;
62362306a36Sopenharmony_ci	best_pending_disk = -1;
62462306a36Sopenharmony_ci	min_pending = UINT_MAX;
62562306a36Sopenharmony_ci	best_good_sectors = 0;
62662306a36Sopenharmony_ci	has_nonrot_disk = 0;
62762306a36Sopenharmony_ci	choose_next_idle = 0;
62862306a36Sopenharmony_ci	clear_bit(R1BIO_FailFast, &r1_bio->state);
62962306a36Sopenharmony_ci
63062306a36Sopenharmony_ci	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
63162306a36Sopenharmony_ci	    (mddev_is_clustered(conf->mddev) &&
63262306a36Sopenharmony_ci	    md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
63362306a36Sopenharmony_ci		    this_sector + sectors)))
63462306a36Sopenharmony_ci		choose_first = 1;
63562306a36Sopenharmony_ci	else
63662306a36Sopenharmony_ci		choose_first = 0;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
63962306a36Sopenharmony_ci		sector_t dist;
64062306a36Sopenharmony_ci		sector_t first_bad;
64162306a36Sopenharmony_ci		int bad_sectors;
64262306a36Sopenharmony_ci		unsigned int pending;
64362306a36Sopenharmony_ci		bool nonrot;
64462306a36Sopenharmony_ci
64562306a36Sopenharmony_ci		rdev = rcu_dereference(conf->mirrors[disk].rdev);
64662306a36Sopenharmony_ci		if (r1_bio->bios[disk] == IO_BLOCKED
64762306a36Sopenharmony_ci		    || rdev == NULL
64862306a36Sopenharmony_ci		    || test_bit(Faulty, &rdev->flags))
64962306a36Sopenharmony_ci			continue;
65062306a36Sopenharmony_ci		if (!test_bit(In_sync, &rdev->flags) &&
65162306a36Sopenharmony_ci		    rdev->recovery_offset < this_sector + sectors)
65262306a36Sopenharmony_ci			continue;
65362306a36Sopenharmony_ci		if (test_bit(WriteMostly, &rdev->flags)) {
65462306a36Sopenharmony_ci			/* Don't balance among write-mostly, just
65562306a36Sopenharmony_ci			 * use the first as a last resort */
65662306a36Sopenharmony_ci			if (best_dist_disk < 0) {
65762306a36Sopenharmony_ci				if (is_badblock(rdev, this_sector, sectors,
65862306a36Sopenharmony_ci						&first_bad, &bad_sectors)) {
65962306a36Sopenharmony_ci					if (first_bad <= this_sector)
66062306a36Sopenharmony_ci						/* Cannot use this */
66162306a36Sopenharmony_ci						continue;
66262306a36Sopenharmony_ci					best_good_sectors = first_bad - this_sector;
66362306a36Sopenharmony_ci				} else
66462306a36Sopenharmony_ci					best_good_sectors = sectors;
66562306a36Sopenharmony_ci				best_dist_disk = disk;
66662306a36Sopenharmony_ci				best_pending_disk = disk;
66762306a36Sopenharmony_ci			}
66862306a36Sopenharmony_ci			continue;
66962306a36Sopenharmony_ci		}
67062306a36Sopenharmony_ci		/* This is a reasonable device to use.  It might
67162306a36Sopenharmony_ci		 * even be best.
67262306a36Sopenharmony_ci		 */
67362306a36Sopenharmony_ci		if (is_badblock(rdev, this_sector, sectors,
67462306a36Sopenharmony_ci				&first_bad, &bad_sectors)) {
67562306a36Sopenharmony_ci			if (best_dist < MaxSector)
67662306a36Sopenharmony_ci				/* already have a better device */
67762306a36Sopenharmony_ci				continue;
67862306a36Sopenharmony_ci			if (first_bad <= this_sector) {
67962306a36Sopenharmony_ci				/* cannot read here. If this is the 'primary'
68062306a36Sopenharmony_ci				 * device, then we must not read beyond
68162306a36Sopenharmony_ci				 * bad_sectors from another device..
68262306a36Sopenharmony_ci				 */
68362306a36Sopenharmony_ci				bad_sectors -= (this_sector - first_bad);
68462306a36Sopenharmony_ci				if (choose_first && sectors > bad_sectors)
68562306a36Sopenharmony_ci					sectors = bad_sectors;
68662306a36Sopenharmony_ci				if (best_good_sectors > sectors)
68762306a36Sopenharmony_ci					best_good_sectors = sectors;
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci			} else {
69062306a36Sopenharmony_ci				sector_t good_sectors = first_bad - this_sector;
69162306a36Sopenharmony_ci				if (good_sectors > best_good_sectors) {
69262306a36Sopenharmony_ci					best_good_sectors = good_sectors;
69362306a36Sopenharmony_ci					best_disk = disk;
69462306a36Sopenharmony_ci				}
69562306a36Sopenharmony_ci				if (choose_first)
69662306a36Sopenharmony_ci					break;
69762306a36Sopenharmony_ci			}
69862306a36Sopenharmony_ci			continue;
69962306a36Sopenharmony_ci		} else {
70062306a36Sopenharmony_ci			if ((sectors > best_good_sectors) && (best_disk >= 0))
70162306a36Sopenharmony_ci				best_disk = -1;
70262306a36Sopenharmony_ci			best_good_sectors = sectors;
70362306a36Sopenharmony_ci		}
70462306a36Sopenharmony_ci
70562306a36Sopenharmony_ci		if (best_disk >= 0)
70662306a36Sopenharmony_ci			/* At least two disks to choose from so failfast is OK */
70762306a36Sopenharmony_ci			set_bit(R1BIO_FailFast, &r1_bio->state);
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci		nonrot = bdev_nonrot(rdev->bdev);
71062306a36Sopenharmony_ci		has_nonrot_disk |= nonrot;
71162306a36Sopenharmony_ci		pending = atomic_read(&rdev->nr_pending);
71262306a36Sopenharmony_ci		dist = abs(this_sector - conf->mirrors[disk].head_position);
71362306a36Sopenharmony_ci		if (choose_first) {
71462306a36Sopenharmony_ci			best_disk = disk;
71562306a36Sopenharmony_ci			break;
71662306a36Sopenharmony_ci		}
71762306a36Sopenharmony_ci		/* Don't change to another disk for sequential reads */
71862306a36Sopenharmony_ci		if (conf->mirrors[disk].next_seq_sect == this_sector
71962306a36Sopenharmony_ci		    || dist == 0) {
72062306a36Sopenharmony_ci			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
72162306a36Sopenharmony_ci			struct raid1_info *mirror = &conf->mirrors[disk];
72262306a36Sopenharmony_ci
72362306a36Sopenharmony_ci			best_disk = disk;
72462306a36Sopenharmony_ci			/*
72562306a36Sopenharmony_ci			 * If buffered sequential IO size exceeds optimal
72662306a36Sopenharmony_ci			 * iosize, check if there is idle disk. If yes, choose
72762306a36Sopenharmony_ci			 * the idle disk. read_balance could already choose an
72862306a36Sopenharmony_ci			 * idle disk before noticing it's a sequential IO in
72962306a36Sopenharmony_ci			 * this disk. This doesn't matter because this disk
73062306a36Sopenharmony_ci			 * will idle, next time it will be utilized after the
73162306a36Sopenharmony_ci			 * first disk has IO size exceeds optimal iosize. In
73262306a36Sopenharmony_ci			 * this way, iosize of the first disk will be optimal
73362306a36Sopenharmony_ci			 * iosize at least. iosize of the second disk might be
73462306a36Sopenharmony_ci			 * small, but not a big deal since when the second disk
73562306a36Sopenharmony_ci			 * starts IO, the first disk is likely still busy.
73662306a36Sopenharmony_ci			 */
73762306a36Sopenharmony_ci			if (nonrot && opt_iosize > 0 &&
73862306a36Sopenharmony_ci			    mirror->seq_start != MaxSector &&
73962306a36Sopenharmony_ci			    mirror->next_seq_sect > opt_iosize &&
74062306a36Sopenharmony_ci			    mirror->next_seq_sect - opt_iosize >=
74162306a36Sopenharmony_ci			    mirror->seq_start) {
74262306a36Sopenharmony_ci				choose_next_idle = 1;
74362306a36Sopenharmony_ci				continue;
74462306a36Sopenharmony_ci			}
74562306a36Sopenharmony_ci			break;
74662306a36Sopenharmony_ci		}
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci		if (choose_next_idle)
74962306a36Sopenharmony_ci			continue;
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci		if (min_pending > pending) {
75262306a36Sopenharmony_ci			min_pending = pending;
75362306a36Sopenharmony_ci			best_pending_disk = disk;
75462306a36Sopenharmony_ci		}
75562306a36Sopenharmony_ci
75662306a36Sopenharmony_ci		if (dist < best_dist) {
75762306a36Sopenharmony_ci			best_dist = dist;
75862306a36Sopenharmony_ci			best_dist_disk = disk;
75962306a36Sopenharmony_ci		}
76062306a36Sopenharmony_ci	}
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_ci	/*
76362306a36Sopenharmony_ci	 * If all disks are rotational, choose the closest disk. If any disk is
76462306a36Sopenharmony_ci	 * non-rotational, choose the disk with less pending request even the
76562306a36Sopenharmony_ci	 * disk is rotational, which might/might not be optimal for raids with
76662306a36Sopenharmony_ci	 * mixed ratation/non-rotational disks depending on workload.
76762306a36Sopenharmony_ci	 */
76862306a36Sopenharmony_ci	if (best_disk == -1) {
76962306a36Sopenharmony_ci		if (has_nonrot_disk || min_pending == 0)
77062306a36Sopenharmony_ci			best_disk = best_pending_disk;
77162306a36Sopenharmony_ci		else
77262306a36Sopenharmony_ci			best_disk = best_dist_disk;
77362306a36Sopenharmony_ci	}
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci	if (best_disk >= 0) {
77662306a36Sopenharmony_ci		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
77762306a36Sopenharmony_ci		if (!rdev)
77862306a36Sopenharmony_ci			goto retry;
77962306a36Sopenharmony_ci		atomic_inc(&rdev->nr_pending);
78062306a36Sopenharmony_ci		sectors = best_good_sectors;
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_ci		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
78362306a36Sopenharmony_ci			conf->mirrors[best_disk].seq_start = this_sector;
78462306a36Sopenharmony_ci
78562306a36Sopenharmony_ci		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
78662306a36Sopenharmony_ci	}
78762306a36Sopenharmony_ci	rcu_read_unlock();
78862306a36Sopenharmony_ci	*max_sectors = sectors;
78962306a36Sopenharmony_ci
79062306a36Sopenharmony_ci	return best_disk;
79162306a36Sopenharmony_ci}
79262306a36Sopenharmony_ci
79362306a36Sopenharmony_cistatic void wake_up_barrier(struct r1conf *conf)
79462306a36Sopenharmony_ci{
79562306a36Sopenharmony_ci	if (wq_has_sleeper(&conf->wait_barrier))
79662306a36Sopenharmony_ci		wake_up(&conf->wait_barrier);
79762306a36Sopenharmony_ci}
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_cistatic void flush_bio_list(struct r1conf *conf, struct bio *bio)
80062306a36Sopenharmony_ci{
80162306a36Sopenharmony_ci	/* flush any pending bitmap writes to disk before proceeding w/ I/O */
80262306a36Sopenharmony_ci	raid1_prepare_flush_writes(conf->mddev->bitmap);
80362306a36Sopenharmony_ci	wake_up_barrier(conf);
80462306a36Sopenharmony_ci
80562306a36Sopenharmony_ci	while (bio) { /* submit pending writes */
80662306a36Sopenharmony_ci		struct bio *next = bio->bi_next;
80762306a36Sopenharmony_ci
80862306a36Sopenharmony_ci		raid1_submit_write(bio);
80962306a36Sopenharmony_ci		bio = next;
81062306a36Sopenharmony_ci		cond_resched();
81162306a36Sopenharmony_ci	}
81262306a36Sopenharmony_ci}
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_cistatic void flush_pending_writes(struct r1conf *conf)
81562306a36Sopenharmony_ci{
81662306a36Sopenharmony_ci	/* Any writes that have been queued but are awaiting
81762306a36Sopenharmony_ci	 * bitmap updates get flushed here.
81862306a36Sopenharmony_ci	 */
81962306a36Sopenharmony_ci	spin_lock_irq(&conf->device_lock);
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci	if (conf->pending_bio_list.head) {
82262306a36Sopenharmony_ci		struct blk_plug plug;
82362306a36Sopenharmony_ci		struct bio *bio;
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_ci		bio = bio_list_get(&conf->pending_bio_list);
82662306a36Sopenharmony_ci		spin_unlock_irq(&conf->device_lock);
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci		/*
82962306a36Sopenharmony_ci		 * As this is called in a wait_event() loop (see freeze_array),
83062306a36Sopenharmony_ci		 * current->state might be TASK_UNINTERRUPTIBLE which will
83162306a36Sopenharmony_ci		 * cause a warning when we prepare to wait again.  As it is
83262306a36Sopenharmony_ci		 * rare that this path is taken, it is perfectly safe to force
83362306a36Sopenharmony_ci		 * us to go around the wait_event() loop again, so the warning
83462306a36Sopenharmony_ci		 * is a false-positive.  Silence the warning by resetting
83562306a36Sopenharmony_ci		 * thread state
83662306a36Sopenharmony_ci		 */
83762306a36Sopenharmony_ci		__set_current_state(TASK_RUNNING);
83862306a36Sopenharmony_ci		blk_start_plug(&plug);
83962306a36Sopenharmony_ci		flush_bio_list(conf, bio);
84062306a36Sopenharmony_ci		blk_finish_plug(&plug);
84162306a36Sopenharmony_ci	} else
84262306a36Sopenharmony_ci		spin_unlock_irq(&conf->device_lock);
84362306a36Sopenharmony_ci}
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci/* Barriers....
84662306a36Sopenharmony_ci * Sometimes we need to suspend IO while we do something else,
84762306a36Sopenharmony_ci * either some resync/recovery, or reconfigure the array.
84862306a36Sopenharmony_ci * To do this we raise a 'barrier'.
84962306a36Sopenharmony_ci * The 'barrier' is a counter that can be raised multiple times
85062306a36Sopenharmony_ci * to count how many activities are happening which preclude
85162306a36Sopenharmony_ci * normal IO.
85262306a36Sopenharmony_ci * We can only raise the barrier if there is no pending IO.
85362306a36Sopenharmony_ci * i.e. if nr_pending == 0.
85462306a36Sopenharmony_ci * We choose only to raise the barrier if no-one is waiting for the
85562306a36Sopenharmony_ci * barrier to go down.  This means that as soon as an IO request
85662306a36Sopenharmony_ci * is ready, no other operations which require a barrier will start
85762306a36Sopenharmony_ci * until the IO request has had a chance.
85862306a36Sopenharmony_ci *
85962306a36Sopenharmony_ci * So: regular IO calls 'wait_barrier'.  When that returns there
86062306a36Sopenharmony_ci *    is no backgroup IO happening,  It must arrange to call
86162306a36Sopenharmony_ci *    allow_barrier when it has finished its IO.
86262306a36Sopenharmony_ci * backgroup IO calls must call raise_barrier.  Once that returns
86362306a36Sopenharmony_ci *    there is no normal IO happeing.  It must arrange to call
86462306a36Sopenharmony_ci *    lower_barrier when the particular background IO completes.
86562306a36Sopenharmony_ci *
86662306a36Sopenharmony_ci * If resync/recovery is interrupted, returns -EINTR;
86762306a36Sopenharmony_ci * Otherwise, returns 0.
86862306a36Sopenharmony_ci */
86962306a36Sopenharmony_cistatic int raise_barrier(struct r1conf *conf, sector_t sector_nr)
87062306a36Sopenharmony_ci{
87162306a36Sopenharmony_ci	int idx = sector_to_idx(sector_nr);
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci	spin_lock_irq(&conf->resync_lock);
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci	/* Wait until no block IO is waiting */
87662306a36Sopenharmony_ci	wait_event_lock_irq(conf->wait_barrier,
87762306a36Sopenharmony_ci			    !atomic_read(&conf->nr_waiting[idx]),
87862306a36Sopenharmony_ci			    conf->resync_lock);
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci	/* block any new IO from starting */
88162306a36Sopenharmony_ci	atomic_inc(&conf->barrier[idx]);
88262306a36Sopenharmony_ci	/*
88362306a36Sopenharmony_ci	 * In raise_barrier() we firstly increase conf->barrier[idx] then
88462306a36Sopenharmony_ci	 * check conf->nr_pending[idx]. In _wait_barrier() we firstly
88562306a36Sopenharmony_ci	 * increase conf->nr_pending[idx] then check conf->barrier[idx].
88662306a36Sopenharmony_ci	 * A memory barrier here to make sure conf->nr_pending[idx] won't
88762306a36Sopenharmony_ci	 * be fetched before conf->barrier[idx] is increased. Otherwise
88862306a36Sopenharmony_ci	 * there will be a race between raise_barrier() and _wait_barrier().
88962306a36Sopenharmony_ci	 */
89062306a36Sopenharmony_ci	smp_mb__after_atomic();
89162306a36Sopenharmony_ci
89262306a36Sopenharmony_ci	/* For these conditions we must wait:
89362306a36Sopenharmony_ci	 * A: while the array is in frozen state
89462306a36Sopenharmony_ci	 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
89562306a36Sopenharmony_ci	 *    existing in corresponding I/O barrier bucket.
89662306a36Sopenharmony_ci	 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
89762306a36Sopenharmony_ci	 *    max resync count which allowed on current I/O barrier bucket.
89862306a36Sopenharmony_ci	 */
89962306a36Sopenharmony_ci	wait_event_lock_irq(conf->wait_barrier,
90062306a36Sopenharmony_ci			    (!conf->array_frozen &&
90162306a36Sopenharmony_ci			     !atomic_read(&conf->nr_pending[idx]) &&
90262306a36Sopenharmony_ci			     atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) ||
90362306a36Sopenharmony_ci				test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
90462306a36Sopenharmony_ci			    conf->resync_lock);
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
90762306a36Sopenharmony_ci		atomic_dec(&conf->barrier[idx]);
90862306a36Sopenharmony_ci		spin_unlock_irq(&conf->resync_lock);
90962306a36Sopenharmony_ci		wake_up(&conf->wait_barrier);
91062306a36Sopenharmony_ci		return -EINTR;
91162306a36Sopenharmony_ci	}
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_ci	atomic_inc(&conf->nr_sync_pending);
91462306a36Sopenharmony_ci	spin_unlock_irq(&conf->resync_lock);
91562306a36Sopenharmony_ci
91662306a36Sopenharmony_ci	return 0;
91762306a36Sopenharmony_ci}
91862306a36Sopenharmony_ci
91962306a36Sopenharmony_cistatic void lower_barrier(struct r1conf *conf, sector_t sector_nr)
92062306a36Sopenharmony_ci{
92162306a36Sopenharmony_ci	int idx = sector_to_idx(sector_nr);
92262306a36Sopenharmony_ci
92362306a36Sopenharmony_ci	BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
92462306a36Sopenharmony_ci
92562306a36Sopenharmony_ci	atomic_dec(&conf->barrier[idx]);
92662306a36Sopenharmony_ci	atomic_dec(&conf->nr_sync_pending);
92762306a36Sopenharmony_ci	wake_up(&conf->wait_barrier);
92862306a36Sopenharmony_ci}
92962306a36Sopenharmony_ci
93062306a36Sopenharmony_cistatic bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
93162306a36Sopenharmony_ci{
93262306a36Sopenharmony_ci	bool ret = true;
93362306a36Sopenharmony_ci
93462306a36Sopenharmony_ci	/*
93562306a36Sopenharmony_ci	 * We need to increase conf->nr_pending[idx] very early here,
93662306a36Sopenharmony_ci	 * then raise_barrier() can be blocked when it waits for
93762306a36Sopenharmony_ci	 * conf->nr_pending[idx] to be 0. Then we can avoid holding
93862306a36Sopenharmony_ci	 * conf->resync_lock when there is no barrier raised in same
93962306a36Sopenharmony_ci	 * barrier unit bucket. Also if the array is frozen, I/O
94062306a36Sopenharmony_ci	 * should be blocked until array is unfrozen.
94162306a36Sopenharmony_ci	 */
94262306a36Sopenharmony_ci	atomic_inc(&conf->nr_pending[idx]);
94362306a36Sopenharmony_ci	/*
94462306a36Sopenharmony_ci	 * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
94562306a36Sopenharmony_ci	 * check conf->barrier[idx]. In raise_barrier() we firstly increase
94662306a36Sopenharmony_ci	 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
94762306a36Sopenharmony_ci	 * barrier is necessary here to make sure conf->barrier[idx] won't be
94862306a36Sopenharmony_ci	 * fetched before conf->nr_pending[idx] is increased. Otherwise there
94962306a36Sopenharmony_ci	 * will be a race between _wait_barrier() and raise_barrier().
95062306a36Sopenharmony_ci	 */
95162306a36Sopenharmony_ci	smp_mb__after_atomic();
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_ci	/*
95462306a36Sopenharmony_ci	 * Don't worry about checking two atomic_t variables at same time
95562306a36Sopenharmony_ci	 * here. If during we check conf->barrier[idx], the array is
95662306a36Sopenharmony_ci	 * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is
95762306a36Sopenharmony_ci	 * 0, it is safe to return and make the I/O continue. Because the
95862306a36Sopenharmony_ci	 * array is frozen, all I/O returned here will eventually complete
95962306a36Sopenharmony_ci	 * or be queued, no race will happen. See code comment in
96062306a36Sopenharmony_ci	 * frozen_array().
96162306a36Sopenharmony_ci	 */
96262306a36Sopenharmony_ci	if (!READ_ONCE(conf->array_frozen) &&
96362306a36Sopenharmony_ci	    !atomic_read(&conf->barrier[idx]))
96462306a36Sopenharmony_ci		return ret;
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci	/*
96762306a36Sopenharmony_ci	 * After holding conf->resync_lock, conf->nr_pending[idx]
96862306a36Sopenharmony_ci	 * should be decreased before waiting for barrier to drop.
96962306a36Sopenharmony_ci	 * Otherwise, we may encounter a race condition because
97062306a36Sopenharmony_ci	 * raise_barrer() might be waiting for conf->nr_pending[idx]
97162306a36Sopenharmony_ci	 * to be 0 at same time.
97262306a36Sopenharmony_ci	 */
97362306a36Sopenharmony_ci	spin_lock_irq(&conf->resync_lock);
97462306a36Sopenharmony_ci	atomic_inc(&conf->nr_waiting[idx]);
97562306a36Sopenharmony_ci	atomic_dec(&conf->nr_pending[idx]);
97662306a36Sopenharmony_ci	/*
97762306a36Sopenharmony_ci	 * In case freeze_array() is waiting for
97862306a36Sopenharmony_ci	 * get_unqueued_pending() == extra
97962306a36Sopenharmony_ci	 */
98062306a36Sopenharmony_ci	wake_up_barrier(conf);
98162306a36Sopenharmony_ci	/* Wait for the barrier in same barrier unit bucket to drop. */
98262306a36Sopenharmony_ci
98362306a36Sopenharmony_ci	/* Return false when nowait flag is set */
98462306a36Sopenharmony_ci	if (nowait) {
98562306a36Sopenharmony_ci		ret = false;
98662306a36Sopenharmony_ci	} else {
98762306a36Sopenharmony_ci		wait_event_lock_irq(conf->wait_barrier,
98862306a36Sopenharmony_ci				!conf->array_frozen &&
98962306a36Sopenharmony_ci				!atomic_read(&conf->barrier[idx]),
99062306a36Sopenharmony_ci				conf->resync_lock);
99162306a36Sopenharmony_ci		atomic_inc(&conf->nr_pending[idx]);
99262306a36Sopenharmony_ci	}
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci	atomic_dec(&conf->nr_waiting[idx]);
99562306a36Sopenharmony_ci	spin_unlock_irq(&conf->resync_lock);
99662306a36Sopenharmony_ci	return ret;
99762306a36Sopenharmony_ci}
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_cistatic bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
100062306a36Sopenharmony_ci{
100162306a36Sopenharmony_ci	int idx = sector_to_idx(sector_nr);
100262306a36Sopenharmony_ci	bool ret = true;
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci	/*
100562306a36Sopenharmony_ci	 * Very similar to _wait_barrier(). The difference is, for read
100662306a36Sopenharmony_ci	 * I/O we don't need wait for sync I/O, but if the whole array
100762306a36Sopenharmony_ci	 * is frozen, the read I/O still has to wait until the array is
100862306a36Sopenharmony_ci	 * unfrozen. Since there is no ordering requirement with
100962306a36Sopenharmony_ci	 * conf->barrier[idx] here, memory barrier is unnecessary as well.
101062306a36Sopenharmony_ci	 */
101162306a36Sopenharmony_ci	atomic_inc(&conf->nr_pending[idx]);
101262306a36Sopenharmony_ci
101362306a36Sopenharmony_ci	if (!READ_ONCE(conf->array_frozen))
101462306a36Sopenharmony_ci		return ret;
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci	spin_lock_irq(&conf->resync_lock);
101762306a36Sopenharmony_ci	atomic_inc(&conf->nr_waiting[idx]);
101862306a36Sopenharmony_ci	atomic_dec(&conf->nr_pending[idx]);
101962306a36Sopenharmony_ci	/*
102062306a36Sopenharmony_ci	 * In case freeze_array() is waiting for
102162306a36Sopenharmony_ci	 * get_unqueued_pending() == extra
102262306a36Sopenharmony_ci	 */
102362306a36Sopenharmony_ci	wake_up_barrier(conf);
102462306a36Sopenharmony_ci	/* Wait for array to be unfrozen */
102562306a36Sopenharmony_ci
102662306a36Sopenharmony_ci	/* Return false when nowait flag is set */
102762306a36Sopenharmony_ci	if (nowait) {
102862306a36Sopenharmony_ci		/* Return false when nowait flag is set */
102962306a36Sopenharmony_ci		ret = false;
103062306a36Sopenharmony_ci	} else {
103162306a36Sopenharmony_ci		wait_event_lock_irq(conf->wait_barrier,
103262306a36Sopenharmony_ci				!conf->array_frozen,
103362306a36Sopenharmony_ci				conf->resync_lock);
103462306a36Sopenharmony_ci		atomic_inc(&conf->nr_pending[idx]);
103562306a36Sopenharmony_ci	}
103662306a36Sopenharmony_ci
103762306a36Sopenharmony_ci	atomic_dec(&conf->nr_waiting[idx]);
103862306a36Sopenharmony_ci	spin_unlock_irq(&conf->resync_lock);
103962306a36Sopenharmony_ci	return ret;
104062306a36Sopenharmony_ci}
104162306a36Sopenharmony_ci
104262306a36Sopenharmony_cistatic bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
104362306a36Sopenharmony_ci{
104462306a36Sopenharmony_ci	int idx = sector_to_idx(sector_nr);
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci	return _wait_barrier(conf, idx, nowait);
104762306a36Sopenharmony_ci}
104862306a36Sopenharmony_ci
104962306a36Sopenharmony_cistatic void _allow_barrier(struct r1conf *conf, int idx)
105062306a36Sopenharmony_ci{
105162306a36Sopenharmony_ci	atomic_dec(&conf->nr_pending[idx]);
105262306a36Sopenharmony_ci	wake_up_barrier(conf);
105362306a36Sopenharmony_ci}
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_cistatic void allow_barrier(struct r1conf *conf, sector_t sector_nr)
105662306a36Sopenharmony_ci{
105762306a36Sopenharmony_ci	int idx = sector_to_idx(sector_nr);
105862306a36Sopenharmony_ci
105962306a36Sopenharmony_ci	_allow_barrier(conf, idx);
106062306a36Sopenharmony_ci}
106162306a36Sopenharmony_ci
106262306a36Sopenharmony_ci/* conf->resync_lock should be held */
106362306a36Sopenharmony_cistatic int get_unqueued_pending(struct r1conf *conf)
106462306a36Sopenharmony_ci{
106562306a36Sopenharmony_ci	int idx, ret;
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci	ret = atomic_read(&conf->nr_sync_pending);
106862306a36Sopenharmony_ci	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
106962306a36Sopenharmony_ci		ret += atomic_read(&conf->nr_pending[idx]) -
107062306a36Sopenharmony_ci			atomic_read(&conf->nr_queued[idx]);
107162306a36Sopenharmony_ci
107262306a36Sopenharmony_ci	return ret;
107362306a36Sopenharmony_ci}
107462306a36Sopenharmony_ci
107562306a36Sopenharmony_cistatic void freeze_array(struct r1conf *conf, int extra)
107662306a36Sopenharmony_ci{
107762306a36Sopenharmony_ci	/* Stop sync I/O and normal I/O and wait for everything to
107862306a36Sopenharmony_ci	 * go quiet.
107962306a36Sopenharmony_ci	 * This is called in two situations:
108062306a36Sopenharmony_ci	 * 1) management command handlers (reshape, remove disk, quiesce).
108162306a36Sopenharmony_ci	 * 2) one normal I/O request failed.
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci	 * After array_frozen is set to 1, new sync IO will be blocked at
108462306a36Sopenharmony_ci	 * raise_barrier(), and new normal I/O will blocked at _wait_barrier()
108562306a36Sopenharmony_ci	 * or wait_read_barrier(). The flying I/Os will either complete or be
108662306a36Sopenharmony_ci	 * queued. When everything goes quite, there are only queued I/Os left.
108762306a36Sopenharmony_ci
108862306a36Sopenharmony_ci	 * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
108962306a36Sopenharmony_ci	 * barrier bucket index which this I/O request hits. When all sync and
109062306a36Sopenharmony_ci	 * normal I/O are queued, sum of all conf->nr_pending[] will match sum
109162306a36Sopenharmony_ci	 * of all conf->nr_queued[]. But normal I/O failure is an exception,
109262306a36Sopenharmony_ci	 * in handle_read_error(), we may call freeze_array() before trying to
109362306a36Sopenharmony_ci	 * fix the read error. In this case, the error read I/O is not queued,
109462306a36Sopenharmony_ci	 * so get_unqueued_pending() == 1.
109562306a36Sopenharmony_ci	 *
109662306a36Sopenharmony_ci	 * Therefore before this function returns, we need to wait until
109762306a36Sopenharmony_ci	 * get_unqueued_pendings(conf) gets equal to extra. For
109862306a36Sopenharmony_ci	 * normal I/O context, extra is 1, in rested situations extra is 0.
109962306a36Sopenharmony_ci	 */
110062306a36Sopenharmony_ci	spin_lock_irq(&conf->resync_lock);
110162306a36Sopenharmony_ci	conf->array_frozen = 1;
110262306a36Sopenharmony_ci	raid1_log(conf->mddev, "wait freeze");
110362306a36Sopenharmony_ci	wait_event_lock_irq_cmd(
110462306a36Sopenharmony_ci		conf->wait_barrier,
110562306a36Sopenharmony_ci		get_unqueued_pending(conf) == extra,
110662306a36Sopenharmony_ci		conf->resync_lock,
110762306a36Sopenharmony_ci		flush_pending_writes(conf));
110862306a36Sopenharmony_ci	spin_unlock_irq(&conf->resync_lock);
110962306a36Sopenharmony_ci}
111062306a36Sopenharmony_cistatic void unfreeze_array(struct r1conf *conf)
111162306a36Sopenharmony_ci{
111262306a36Sopenharmony_ci	/* reverse the effect of the freeze */
111362306a36Sopenharmony_ci	spin_lock_irq(&conf->resync_lock);
111462306a36Sopenharmony_ci	conf->array_frozen = 0;
111562306a36Sopenharmony_ci	spin_unlock_irq(&conf->resync_lock);
111662306a36Sopenharmony_ci	wake_up(&conf->wait_barrier);
111762306a36Sopenharmony_ci}
111862306a36Sopenharmony_ci
111962306a36Sopenharmony_cistatic void alloc_behind_master_bio(struct r1bio *r1_bio,
112062306a36Sopenharmony_ci					   struct bio *bio)
112162306a36Sopenharmony_ci{
112262306a36Sopenharmony_ci	int size = bio->bi_iter.bi_size;
112362306a36Sopenharmony_ci	unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
112462306a36Sopenharmony_ci	int i = 0;
112562306a36Sopenharmony_ci	struct bio *behind_bio = NULL;
112662306a36Sopenharmony_ci
112762306a36Sopenharmony_ci	behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
112862306a36Sopenharmony_ci				      &r1_bio->mddev->bio_set);
112962306a36Sopenharmony_ci	if (!behind_bio)
113062306a36Sopenharmony_ci		return;
113162306a36Sopenharmony_ci
113262306a36Sopenharmony_ci	/* discard op, we don't support writezero/writesame yet */
113362306a36Sopenharmony_ci	if (!bio_has_data(bio)) {
113462306a36Sopenharmony_ci		behind_bio->bi_iter.bi_size = size;
113562306a36Sopenharmony_ci		goto skip_copy;
113662306a36Sopenharmony_ci	}
113762306a36Sopenharmony_ci
113862306a36Sopenharmony_ci	while (i < vcnt && size) {
113962306a36Sopenharmony_ci		struct page *page;
114062306a36Sopenharmony_ci		int len = min_t(int, PAGE_SIZE, size);
114162306a36Sopenharmony_ci
114262306a36Sopenharmony_ci		page = alloc_page(GFP_NOIO);
114362306a36Sopenharmony_ci		if (unlikely(!page))
114462306a36Sopenharmony_ci			goto free_pages;
114562306a36Sopenharmony_ci
114662306a36Sopenharmony_ci		if (!bio_add_page(behind_bio, page, len, 0)) {
114762306a36Sopenharmony_ci			put_page(page);
114862306a36Sopenharmony_ci			goto free_pages;
114962306a36Sopenharmony_ci		}
115062306a36Sopenharmony_ci
115162306a36Sopenharmony_ci		size -= len;
115262306a36Sopenharmony_ci		i++;
115362306a36Sopenharmony_ci	}
115462306a36Sopenharmony_ci
115562306a36Sopenharmony_ci	bio_copy_data(behind_bio, bio);
115662306a36Sopenharmony_ciskip_copy:
115762306a36Sopenharmony_ci	r1_bio->behind_master_bio = behind_bio;
115862306a36Sopenharmony_ci	set_bit(R1BIO_BehindIO, &r1_bio->state);
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci	return;
116162306a36Sopenharmony_ci
116262306a36Sopenharmony_cifree_pages:
116362306a36Sopenharmony_ci	pr_debug("%dB behind alloc failed, doing sync I/O\n",
116462306a36Sopenharmony_ci		 bio->bi_iter.bi_size);
116562306a36Sopenharmony_ci	bio_free_pages(behind_bio);
116662306a36Sopenharmony_ci	bio_put(behind_bio);
116762306a36Sopenharmony_ci}
116862306a36Sopenharmony_ci
116962306a36Sopenharmony_cistatic void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
117062306a36Sopenharmony_ci{
117162306a36Sopenharmony_ci	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
117262306a36Sopenharmony_ci						  cb);
117362306a36Sopenharmony_ci	struct mddev *mddev = plug->cb.data;
117462306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
117562306a36Sopenharmony_ci	struct bio *bio;
117662306a36Sopenharmony_ci
117762306a36Sopenharmony_ci	if (from_schedule) {
117862306a36Sopenharmony_ci		spin_lock_irq(&conf->device_lock);
117962306a36Sopenharmony_ci		bio_list_merge(&conf->pending_bio_list, &plug->pending);
118062306a36Sopenharmony_ci		spin_unlock_irq(&conf->device_lock);
118162306a36Sopenharmony_ci		wake_up_barrier(conf);
118262306a36Sopenharmony_ci		md_wakeup_thread(mddev->thread);
118362306a36Sopenharmony_ci		kfree(plug);
118462306a36Sopenharmony_ci		return;
118562306a36Sopenharmony_ci	}
118662306a36Sopenharmony_ci
118762306a36Sopenharmony_ci	/* we aren't scheduling, so we can do the write-out directly. */
118862306a36Sopenharmony_ci	bio = bio_list_get(&plug->pending);
118962306a36Sopenharmony_ci	flush_bio_list(conf, bio);
119062306a36Sopenharmony_ci	kfree(plug);
119162306a36Sopenharmony_ci}
119262306a36Sopenharmony_ci
119362306a36Sopenharmony_cistatic void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
119462306a36Sopenharmony_ci{
119562306a36Sopenharmony_ci	r1_bio->master_bio = bio;
119662306a36Sopenharmony_ci	r1_bio->sectors = bio_sectors(bio);
119762306a36Sopenharmony_ci	r1_bio->state = 0;
119862306a36Sopenharmony_ci	r1_bio->mddev = mddev;
119962306a36Sopenharmony_ci	r1_bio->sector = bio->bi_iter.bi_sector;
120062306a36Sopenharmony_ci}
120162306a36Sopenharmony_ci
120262306a36Sopenharmony_cistatic inline struct r1bio *
120362306a36Sopenharmony_cialloc_r1bio(struct mddev *mddev, struct bio *bio)
120462306a36Sopenharmony_ci{
120562306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
120662306a36Sopenharmony_ci	struct r1bio *r1_bio;
120762306a36Sopenharmony_ci
120862306a36Sopenharmony_ci	r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
120962306a36Sopenharmony_ci	/* Ensure no bio records IO_BLOCKED */
121062306a36Sopenharmony_ci	memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
121162306a36Sopenharmony_ci	init_r1bio(r1_bio, mddev, bio);
121262306a36Sopenharmony_ci	return r1_bio;
121362306a36Sopenharmony_ci}
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_cistatic void raid1_read_request(struct mddev *mddev, struct bio *bio,
121662306a36Sopenharmony_ci			       int max_read_sectors, struct r1bio *r1_bio)
121762306a36Sopenharmony_ci{
121862306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
121962306a36Sopenharmony_ci	struct raid1_info *mirror;
122062306a36Sopenharmony_ci	struct bio *read_bio;
122162306a36Sopenharmony_ci	struct bitmap *bitmap = mddev->bitmap;
122262306a36Sopenharmony_ci	const enum req_op op = bio_op(bio);
122362306a36Sopenharmony_ci	const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
122462306a36Sopenharmony_ci	int max_sectors;
122562306a36Sopenharmony_ci	int rdisk;
122662306a36Sopenharmony_ci	bool r1bio_existed = !!r1_bio;
122762306a36Sopenharmony_ci	char b[BDEVNAME_SIZE];
122862306a36Sopenharmony_ci
122962306a36Sopenharmony_ci	/*
123062306a36Sopenharmony_ci	 * If r1_bio is set, we are blocking the raid1d thread
123162306a36Sopenharmony_ci	 * so there is a tiny risk of deadlock.  So ask for
123262306a36Sopenharmony_ci	 * emergency memory if needed.
123362306a36Sopenharmony_ci	 */
123462306a36Sopenharmony_ci	gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
123562306a36Sopenharmony_ci
123662306a36Sopenharmony_ci	if (r1bio_existed) {
123762306a36Sopenharmony_ci		/* Need to get the block device name carefully */
123862306a36Sopenharmony_ci		struct md_rdev *rdev;
123962306a36Sopenharmony_ci		rcu_read_lock();
124062306a36Sopenharmony_ci		rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
124162306a36Sopenharmony_ci		if (rdev)
124262306a36Sopenharmony_ci			snprintf(b, sizeof(b), "%pg", rdev->bdev);
124362306a36Sopenharmony_ci		else
124462306a36Sopenharmony_ci			strcpy(b, "???");
124562306a36Sopenharmony_ci		rcu_read_unlock();
124662306a36Sopenharmony_ci	}
124762306a36Sopenharmony_ci
124862306a36Sopenharmony_ci	/*
124962306a36Sopenharmony_ci	 * Still need barrier for READ in case that whole
125062306a36Sopenharmony_ci	 * array is frozen.
125162306a36Sopenharmony_ci	 */
125262306a36Sopenharmony_ci	if (!wait_read_barrier(conf, bio->bi_iter.bi_sector,
125362306a36Sopenharmony_ci				bio->bi_opf & REQ_NOWAIT)) {
125462306a36Sopenharmony_ci		bio_wouldblock_error(bio);
125562306a36Sopenharmony_ci		return;
125662306a36Sopenharmony_ci	}
125762306a36Sopenharmony_ci
125862306a36Sopenharmony_ci	if (!r1_bio)
125962306a36Sopenharmony_ci		r1_bio = alloc_r1bio(mddev, bio);
126062306a36Sopenharmony_ci	else
126162306a36Sopenharmony_ci		init_r1bio(r1_bio, mddev, bio);
126262306a36Sopenharmony_ci	r1_bio->sectors = max_read_sectors;
126362306a36Sopenharmony_ci
126462306a36Sopenharmony_ci	/*
126562306a36Sopenharmony_ci	 * make_request() can abort the operation when read-ahead is being
126662306a36Sopenharmony_ci	 * used and no empty request is available.
126762306a36Sopenharmony_ci	 */
126862306a36Sopenharmony_ci	rdisk = read_balance(conf, r1_bio, &max_sectors);
126962306a36Sopenharmony_ci
127062306a36Sopenharmony_ci	if (rdisk < 0) {
127162306a36Sopenharmony_ci		/* couldn't find anywhere to read from */
127262306a36Sopenharmony_ci		if (r1bio_existed) {
127362306a36Sopenharmony_ci			pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
127462306a36Sopenharmony_ci					    mdname(mddev),
127562306a36Sopenharmony_ci					    b,
127662306a36Sopenharmony_ci					    (unsigned long long)r1_bio->sector);
127762306a36Sopenharmony_ci		}
127862306a36Sopenharmony_ci		raid_end_bio_io(r1_bio);
127962306a36Sopenharmony_ci		return;
128062306a36Sopenharmony_ci	}
128162306a36Sopenharmony_ci	mirror = conf->mirrors + rdisk;
128262306a36Sopenharmony_ci
128362306a36Sopenharmony_ci	if (r1bio_existed)
128462306a36Sopenharmony_ci		pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n",
128562306a36Sopenharmony_ci				    mdname(mddev),
128662306a36Sopenharmony_ci				    (unsigned long long)r1_bio->sector,
128762306a36Sopenharmony_ci				    mirror->rdev->bdev);
128862306a36Sopenharmony_ci
128962306a36Sopenharmony_ci	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
129062306a36Sopenharmony_ci	    bitmap) {
129162306a36Sopenharmony_ci		/*
129262306a36Sopenharmony_ci		 * Reading from a write-mostly device must take care not to
129362306a36Sopenharmony_ci		 * over-take any writes that are 'behind'
129462306a36Sopenharmony_ci		 */
129562306a36Sopenharmony_ci		raid1_log(mddev, "wait behind writes");
129662306a36Sopenharmony_ci		wait_event(bitmap->behind_wait,
129762306a36Sopenharmony_ci			   atomic_read(&bitmap->behind_writes) == 0);
129862306a36Sopenharmony_ci	}
129962306a36Sopenharmony_ci
130062306a36Sopenharmony_ci	if (max_sectors < bio_sectors(bio)) {
130162306a36Sopenharmony_ci		struct bio *split = bio_split(bio, max_sectors,
130262306a36Sopenharmony_ci					      gfp, &conf->bio_split);
130362306a36Sopenharmony_ci		bio_chain(split, bio);
130462306a36Sopenharmony_ci		submit_bio_noacct(bio);
130562306a36Sopenharmony_ci		bio = split;
130662306a36Sopenharmony_ci		r1_bio->master_bio = bio;
130762306a36Sopenharmony_ci		r1_bio->sectors = max_sectors;
130862306a36Sopenharmony_ci	}
130962306a36Sopenharmony_ci
131062306a36Sopenharmony_ci	r1_bio->read_disk = rdisk;
131162306a36Sopenharmony_ci	if (!r1bio_existed) {
131262306a36Sopenharmony_ci		md_account_bio(mddev, &bio);
131362306a36Sopenharmony_ci		r1_bio->master_bio = bio;
131462306a36Sopenharmony_ci	}
131562306a36Sopenharmony_ci	read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
131662306a36Sopenharmony_ci				   &mddev->bio_set);
131762306a36Sopenharmony_ci
131862306a36Sopenharmony_ci	r1_bio->bios[rdisk] = read_bio;
131962306a36Sopenharmony_ci
132062306a36Sopenharmony_ci	read_bio->bi_iter.bi_sector = r1_bio->sector +
132162306a36Sopenharmony_ci		mirror->rdev->data_offset;
132262306a36Sopenharmony_ci	read_bio->bi_end_io = raid1_end_read_request;
132362306a36Sopenharmony_ci	read_bio->bi_opf = op | do_sync;
132462306a36Sopenharmony_ci	if (test_bit(FailFast, &mirror->rdev->flags) &&
132562306a36Sopenharmony_ci	    test_bit(R1BIO_FailFast, &r1_bio->state))
132662306a36Sopenharmony_ci	        read_bio->bi_opf |= MD_FAILFAST;
132762306a36Sopenharmony_ci	read_bio->bi_private = r1_bio;
132862306a36Sopenharmony_ci
132962306a36Sopenharmony_ci	if (mddev->gendisk)
133062306a36Sopenharmony_ci	        trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
133162306a36Sopenharmony_ci				      r1_bio->sector);
133262306a36Sopenharmony_ci
133362306a36Sopenharmony_ci	submit_bio_noacct(read_bio);
133462306a36Sopenharmony_ci}
133562306a36Sopenharmony_ci
133662306a36Sopenharmony_cistatic void raid1_write_request(struct mddev *mddev, struct bio *bio,
133762306a36Sopenharmony_ci				int max_write_sectors)
133862306a36Sopenharmony_ci{
133962306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
134062306a36Sopenharmony_ci	struct r1bio *r1_bio;
134162306a36Sopenharmony_ci	int i, disks;
134262306a36Sopenharmony_ci	struct bitmap *bitmap = mddev->bitmap;
134362306a36Sopenharmony_ci	unsigned long flags;
134462306a36Sopenharmony_ci	struct md_rdev *blocked_rdev;
134562306a36Sopenharmony_ci	int first_clone;
134662306a36Sopenharmony_ci	int max_sectors;
134762306a36Sopenharmony_ci	bool write_behind = false;
134862306a36Sopenharmony_ci
134962306a36Sopenharmony_ci	if (mddev_is_clustered(mddev) &&
135062306a36Sopenharmony_ci	     md_cluster_ops->area_resyncing(mddev, WRITE,
135162306a36Sopenharmony_ci		     bio->bi_iter.bi_sector, bio_end_sector(bio))) {
135262306a36Sopenharmony_ci
135362306a36Sopenharmony_ci		DEFINE_WAIT(w);
135462306a36Sopenharmony_ci		if (bio->bi_opf & REQ_NOWAIT) {
135562306a36Sopenharmony_ci			bio_wouldblock_error(bio);
135662306a36Sopenharmony_ci			return;
135762306a36Sopenharmony_ci		}
135862306a36Sopenharmony_ci		for (;;) {
135962306a36Sopenharmony_ci			prepare_to_wait(&conf->wait_barrier,
136062306a36Sopenharmony_ci					&w, TASK_IDLE);
136162306a36Sopenharmony_ci			if (!md_cluster_ops->area_resyncing(mddev, WRITE,
136262306a36Sopenharmony_ci							bio->bi_iter.bi_sector,
136362306a36Sopenharmony_ci							bio_end_sector(bio)))
136462306a36Sopenharmony_ci				break;
136562306a36Sopenharmony_ci			schedule();
136662306a36Sopenharmony_ci		}
136762306a36Sopenharmony_ci		finish_wait(&conf->wait_barrier, &w);
136862306a36Sopenharmony_ci	}
136962306a36Sopenharmony_ci
137062306a36Sopenharmony_ci	/*
137162306a36Sopenharmony_ci	 * Register the new request and wait if the reconstruction
137262306a36Sopenharmony_ci	 * thread has put up a bar for new requests.
137362306a36Sopenharmony_ci	 * Continue immediately if no resync is active currently.
137462306a36Sopenharmony_ci	 */
137562306a36Sopenharmony_ci	if (!wait_barrier(conf, bio->bi_iter.bi_sector,
137662306a36Sopenharmony_ci				bio->bi_opf & REQ_NOWAIT)) {
137762306a36Sopenharmony_ci		bio_wouldblock_error(bio);
137862306a36Sopenharmony_ci		return;
137962306a36Sopenharmony_ci	}
138062306a36Sopenharmony_ci
138162306a36Sopenharmony_ci retry_write:
138262306a36Sopenharmony_ci	r1_bio = alloc_r1bio(mddev, bio);
138362306a36Sopenharmony_ci	r1_bio->sectors = max_write_sectors;
138462306a36Sopenharmony_ci
138562306a36Sopenharmony_ci	/* first select target devices under rcu_lock and
138662306a36Sopenharmony_ci	 * inc refcount on their rdev.  Record them by setting
138762306a36Sopenharmony_ci	 * bios[x] to bio
138862306a36Sopenharmony_ci	 * If there are known/acknowledged bad blocks on any device on
138962306a36Sopenharmony_ci	 * which we have seen a write error, we want to avoid writing those
139062306a36Sopenharmony_ci	 * blocks.
139162306a36Sopenharmony_ci	 * This potentially requires several writes to write around
139262306a36Sopenharmony_ci	 * the bad blocks.  Each set of writes gets it's own r1bio
139362306a36Sopenharmony_ci	 * with a set of bios attached.
139462306a36Sopenharmony_ci	 */
139562306a36Sopenharmony_ci
139662306a36Sopenharmony_ci	disks = conf->raid_disks * 2;
139762306a36Sopenharmony_ci	blocked_rdev = NULL;
139862306a36Sopenharmony_ci	rcu_read_lock();
139962306a36Sopenharmony_ci	max_sectors = r1_bio->sectors;
140062306a36Sopenharmony_ci	for (i = 0;  i < disks; i++) {
140162306a36Sopenharmony_ci		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
140262306a36Sopenharmony_ci
140362306a36Sopenharmony_ci		/*
140462306a36Sopenharmony_ci		 * The write-behind io is only attempted on drives marked as
140562306a36Sopenharmony_ci		 * write-mostly, which means we could allocate write behind
140662306a36Sopenharmony_ci		 * bio later.
140762306a36Sopenharmony_ci		 */
140862306a36Sopenharmony_ci		if (rdev && test_bit(WriteMostly, &rdev->flags))
140962306a36Sopenharmony_ci			write_behind = true;
141062306a36Sopenharmony_ci
141162306a36Sopenharmony_ci		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
141262306a36Sopenharmony_ci			atomic_inc(&rdev->nr_pending);
141362306a36Sopenharmony_ci			blocked_rdev = rdev;
141462306a36Sopenharmony_ci			break;
141562306a36Sopenharmony_ci		}
141662306a36Sopenharmony_ci		r1_bio->bios[i] = NULL;
141762306a36Sopenharmony_ci		if (!rdev || test_bit(Faulty, &rdev->flags)) {
141862306a36Sopenharmony_ci			if (i < conf->raid_disks)
141962306a36Sopenharmony_ci				set_bit(R1BIO_Degraded, &r1_bio->state);
142062306a36Sopenharmony_ci			continue;
142162306a36Sopenharmony_ci		}
142262306a36Sopenharmony_ci
142362306a36Sopenharmony_ci		atomic_inc(&rdev->nr_pending);
142462306a36Sopenharmony_ci		if (test_bit(WriteErrorSeen, &rdev->flags)) {
142562306a36Sopenharmony_ci			sector_t first_bad;
142662306a36Sopenharmony_ci			int bad_sectors;
142762306a36Sopenharmony_ci			int is_bad;
142862306a36Sopenharmony_ci
142962306a36Sopenharmony_ci			is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
143062306a36Sopenharmony_ci					     &first_bad, &bad_sectors);
143162306a36Sopenharmony_ci			if (is_bad < 0) {
143262306a36Sopenharmony_ci				/* mustn't write here until the bad block is
143362306a36Sopenharmony_ci				 * acknowledged*/
143462306a36Sopenharmony_ci				set_bit(BlockedBadBlocks, &rdev->flags);
143562306a36Sopenharmony_ci				blocked_rdev = rdev;
143662306a36Sopenharmony_ci				break;
143762306a36Sopenharmony_ci			}
143862306a36Sopenharmony_ci			if (is_bad && first_bad <= r1_bio->sector) {
143962306a36Sopenharmony_ci				/* Cannot write here at all */
144062306a36Sopenharmony_ci				bad_sectors -= (r1_bio->sector - first_bad);
144162306a36Sopenharmony_ci				if (bad_sectors < max_sectors)
144262306a36Sopenharmony_ci					/* mustn't write more than bad_sectors
144362306a36Sopenharmony_ci					 * to other devices yet
144462306a36Sopenharmony_ci					 */
144562306a36Sopenharmony_ci					max_sectors = bad_sectors;
144662306a36Sopenharmony_ci				rdev_dec_pending(rdev, mddev);
144762306a36Sopenharmony_ci				/* We don't set R1BIO_Degraded as that
144862306a36Sopenharmony_ci				 * only applies if the disk is
144962306a36Sopenharmony_ci				 * missing, so it might be re-added,
145062306a36Sopenharmony_ci				 * and we want to know to recover this
145162306a36Sopenharmony_ci				 * chunk.
145262306a36Sopenharmony_ci				 * In this case the device is here,
145362306a36Sopenharmony_ci				 * and the fact that this chunk is not
145462306a36Sopenharmony_ci				 * in-sync is recorded in the bad
145562306a36Sopenharmony_ci				 * block log
145662306a36Sopenharmony_ci				 */
145762306a36Sopenharmony_ci				continue;
145862306a36Sopenharmony_ci			}
145962306a36Sopenharmony_ci			if (is_bad) {
146062306a36Sopenharmony_ci				int good_sectors = first_bad - r1_bio->sector;
146162306a36Sopenharmony_ci				if (good_sectors < max_sectors)
146262306a36Sopenharmony_ci					max_sectors = good_sectors;
146362306a36Sopenharmony_ci			}
146462306a36Sopenharmony_ci		}
146562306a36Sopenharmony_ci		r1_bio->bios[i] = bio;
146662306a36Sopenharmony_ci	}
146762306a36Sopenharmony_ci	rcu_read_unlock();
146862306a36Sopenharmony_ci
146962306a36Sopenharmony_ci	if (unlikely(blocked_rdev)) {
147062306a36Sopenharmony_ci		/* Wait for this device to become unblocked */
147162306a36Sopenharmony_ci		int j;
147262306a36Sopenharmony_ci
147362306a36Sopenharmony_ci		for (j = 0; j < i; j++)
147462306a36Sopenharmony_ci			if (r1_bio->bios[j])
147562306a36Sopenharmony_ci				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
147662306a36Sopenharmony_ci		free_r1bio(r1_bio);
147762306a36Sopenharmony_ci		allow_barrier(conf, bio->bi_iter.bi_sector);
147862306a36Sopenharmony_ci
147962306a36Sopenharmony_ci		if (bio->bi_opf & REQ_NOWAIT) {
148062306a36Sopenharmony_ci			bio_wouldblock_error(bio);
148162306a36Sopenharmony_ci			return;
148262306a36Sopenharmony_ci		}
148362306a36Sopenharmony_ci		raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
148462306a36Sopenharmony_ci		md_wait_for_blocked_rdev(blocked_rdev, mddev);
148562306a36Sopenharmony_ci		wait_barrier(conf, bio->bi_iter.bi_sector, false);
148662306a36Sopenharmony_ci		goto retry_write;
148762306a36Sopenharmony_ci	}
148862306a36Sopenharmony_ci
148962306a36Sopenharmony_ci	/*
149062306a36Sopenharmony_ci	 * When using a bitmap, we may call alloc_behind_master_bio below.
149162306a36Sopenharmony_ci	 * alloc_behind_master_bio allocates a copy of the data payload a page
149262306a36Sopenharmony_ci	 * at a time and thus needs a new bio that can fit the whole payload
149362306a36Sopenharmony_ci	 * this bio in page sized chunks.
149462306a36Sopenharmony_ci	 */
149562306a36Sopenharmony_ci	if (write_behind && bitmap)
149662306a36Sopenharmony_ci		max_sectors = min_t(int, max_sectors,
149762306a36Sopenharmony_ci				    BIO_MAX_VECS * (PAGE_SIZE >> 9));
149862306a36Sopenharmony_ci	if (max_sectors < bio_sectors(bio)) {
149962306a36Sopenharmony_ci		struct bio *split = bio_split(bio, max_sectors,
150062306a36Sopenharmony_ci					      GFP_NOIO, &conf->bio_split);
150162306a36Sopenharmony_ci		bio_chain(split, bio);
150262306a36Sopenharmony_ci		submit_bio_noacct(bio);
150362306a36Sopenharmony_ci		bio = split;
150462306a36Sopenharmony_ci		r1_bio->master_bio = bio;
150562306a36Sopenharmony_ci		r1_bio->sectors = max_sectors;
150662306a36Sopenharmony_ci	}
150762306a36Sopenharmony_ci
150862306a36Sopenharmony_ci	md_account_bio(mddev, &bio);
150962306a36Sopenharmony_ci	r1_bio->master_bio = bio;
151062306a36Sopenharmony_ci	atomic_set(&r1_bio->remaining, 1);
151162306a36Sopenharmony_ci	atomic_set(&r1_bio->behind_remaining, 0);
151262306a36Sopenharmony_ci
151362306a36Sopenharmony_ci	first_clone = 1;
151462306a36Sopenharmony_ci
151562306a36Sopenharmony_ci	for (i = 0; i < disks; i++) {
151662306a36Sopenharmony_ci		struct bio *mbio = NULL;
151762306a36Sopenharmony_ci		struct md_rdev *rdev = conf->mirrors[i].rdev;
151862306a36Sopenharmony_ci		if (!r1_bio->bios[i])
151962306a36Sopenharmony_ci			continue;
152062306a36Sopenharmony_ci
152162306a36Sopenharmony_ci		if (first_clone) {
152262306a36Sopenharmony_ci			/* do behind I/O ?
152362306a36Sopenharmony_ci			 * Not if there are too many, or cannot
152462306a36Sopenharmony_ci			 * allocate memory, or a reader on WriteMostly
152562306a36Sopenharmony_ci			 * is waiting for behind writes to flush */
152662306a36Sopenharmony_ci			if (bitmap && write_behind &&
152762306a36Sopenharmony_ci			    (atomic_read(&bitmap->behind_writes)
152862306a36Sopenharmony_ci			     < mddev->bitmap_info.max_write_behind) &&
152962306a36Sopenharmony_ci			    !waitqueue_active(&bitmap->behind_wait)) {
153062306a36Sopenharmony_ci				alloc_behind_master_bio(r1_bio, bio);
153162306a36Sopenharmony_ci			}
153262306a36Sopenharmony_ci
153362306a36Sopenharmony_ci			md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
153462306a36Sopenharmony_ci					     test_bit(R1BIO_BehindIO, &r1_bio->state));
153562306a36Sopenharmony_ci			first_clone = 0;
153662306a36Sopenharmony_ci		}
153762306a36Sopenharmony_ci
153862306a36Sopenharmony_ci		if (r1_bio->behind_master_bio) {
153962306a36Sopenharmony_ci			mbio = bio_alloc_clone(rdev->bdev,
154062306a36Sopenharmony_ci					       r1_bio->behind_master_bio,
154162306a36Sopenharmony_ci					       GFP_NOIO, &mddev->bio_set);
154262306a36Sopenharmony_ci			if (test_bit(CollisionCheck, &rdev->flags))
154362306a36Sopenharmony_ci				wait_for_serialization(rdev, r1_bio);
154462306a36Sopenharmony_ci			if (test_bit(WriteMostly, &rdev->flags))
154562306a36Sopenharmony_ci				atomic_inc(&r1_bio->behind_remaining);
154662306a36Sopenharmony_ci		} else {
154762306a36Sopenharmony_ci			mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO,
154862306a36Sopenharmony_ci					       &mddev->bio_set);
154962306a36Sopenharmony_ci
155062306a36Sopenharmony_ci			if (mddev->serialize_policy)
155162306a36Sopenharmony_ci				wait_for_serialization(rdev, r1_bio);
155262306a36Sopenharmony_ci		}
155362306a36Sopenharmony_ci
155462306a36Sopenharmony_ci		r1_bio->bios[i] = mbio;
155562306a36Sopenharmony_ci
155662306a36Sopenharmony_ci		mbio->bi_iter.bi_sector	= (r1_bio->sector + rdev->data_offset);
155762306a36Sopenharmony_ci		mbio->bi_end_io	= raid1_end_write_request;
155862306a36Sopenharmony_ci		mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
155962306a36Sopenharmony_ci		if (test_bit(FailFast, &rdev->flags) &&
156062306a36Sopenharmony_ci		    !test_bit(WriteMostly, &rdev->flags) &&
156162306a36Sopenharmony_ci		    conf->raid_disks - mddev->degraded > 1)
156262306a36Sopenharmony_ci			mbio->bi_opf |= MD_FAILFAST;
156362306a36Sopenharmony_ci		mbio->bi_private = r1_bio;
156462306a36Sopenharmony_ci
156562306a36Sopenharmony_ci		atomic_inc(&r1_bio->remaining);
156662306a36Sopenharmony_ci
156762306a36Sopenharmony_ci		if (mddev->gendisk)
156862306a36Sopenharmony_ci			trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
156962306a36Sopenharmony_ci					      r1_bio->sector);
157062306a36Sopenharmony_ci		/* flush_pending_writes() needs access to the rdev so...*/
157162306a36Sopenharmony_ci		mbio->bi_bdev = (void *)rdev;
157262306a36Sopenharmony_ci		if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
157362306a36Sopenharmony_ci			spin_lock_irqsave(&conf->device_lock, flags);
157462306a36Sopenharmony_ci			bio_list_add(&conf->pending_bio_list, mbio);
157562306a36Sopenharmony_ci			spin_unlock_irqrestore(&conf->device_lock, flags);
157662306a36Sopenharmony_ci			md_wakeup_thread(mddev->thread);
157762306a36Sopenharmony_ci		}
157862306a36Sopenharmony_ci	}
157962306a36Sopenharmony_ci
158062306a36Sopenharmony_ci	r1_bio_write_done(r1_bio);
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci	/* In case raid1d snuck in to freeze_array */
158362306a36Sopenharmony_ci	wake_up_barrier(conf);
158462306a36Sopenharmony_ci}
158562306a36Sopenharmony_ci
158662306a36Sopenharmony_cistatic bool raid1_make_request(struct mddev *mddev, struct bio *bio)
158762306a36Sopenharmony_ci{
158862306a36Sopenharmony_ci	sector_t sectors;
158962306a36Sopenharmony_ci
159062306a36Sopenharmony_ci	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
159162306a36Sopenharmony_ci	    && md_flush_request(mddev, bio))
159262306a36Sopenharmony_ci		return true;
159362306a36Sopenharmony_ci
159462306a36Sopenharmony_ci	/*
159562306a36Sopenharmony_ci	 * There is a limit to the maximum size, but
159662306a36Sopenharmony_ci	 * the read/write handler might find a lower limit
159762306a36Sopenharmony_ci	 * due to bad blocks.  To avoid multiple splits,
159862306a36Sopenharmony_ci	 * we pass the maximum number of sectors down
159962306a36Sopenharmony_ci	 * and let the lower level perform the split.
160062306a36Sopenharmony_ci	 */
160162306a36Sopenharmony_ci	sectors = align_to_barrier_unit_end(
160262306a36Sopenharmony_ci		bio->bi_iter.bi_sector, bio_sectors(bio));
160362306a36Sopenharmony_ci
160462306a36Sopenharmony_ci	if (bio_data_dir(bio) == READ)
160562306a36Sopenharmony_ci		raid1_read_request(mddev, bio, sectors, NULL);
160662306a36Sopenharmony_ci	else {
160762306a36Sopenharmony_ci		if (!md_write_start(mddev,bio))
160862306a36Sopenharmony_ci			return false;
160962306a36Sopenharmony_ci		raid1_write_request(mddev, bio, sectors);
161062306a36Sopenharmony_ci	}
161162306a36Sopenharmony_ci	return true;
161262306a36Sopenharmony_ci}
161362306a36Sopenharmony_ci
161462306a36Sopenharmony_cistatic void raid1_status(struct seq_file *seq, struct mddev *mddev)
161562306a36Sopenharmony_ci{
161662306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
161762306a36Sopenharmony_ci	int i;
161862306a36Sopenharmony_ci
161962306a36Sopenharmony_ci	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
162062306a36Sopenharmony_ci		   conf->raid_disks - mddev->degraded);
162162306a36Sopenharmony_ci	rcu_read_lock();
162262306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks; i++) {
162362306a36Sopenharmony_ci		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
162462306a36Sopenharmony_ci		seq_printf(seq, "%s",
162562306a36Sopenharmony_ci			   rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
162662306a36Sopenharmony_ci	}
162762306a36Sopenharmony_ci	rcu_read_unlock();
162862306a36Sopenharmony_ci	seq_printf(seq, "]");
162962306a36Sopenharmony_ci}
163062306a36Sopenharmony_ci
163162306a36Sopenharmony_ci/**
163262306a36Sopenharmony_ci * raid1_error() - RAID1 error handler.
163362306a36Sopenharmony_ci * @mddev: affected md device.
163462306a36Sopenharmony_ci * @rdev: member device to fail.
163562306a36Sopenharmony_ci *
163662306a36Sopenharmony_ci * The routine acknowledges &rdev failure and determines new @mddev state.
163762306a36Sopenharmony_ci * If it failed, then:
163862306a36Sopenharmony_ci *	- &MD_BROKEN flag is set in &mddev->flags.
163962306a36Sopenharmony_ci *	- recovery is disabled.
164062306a36Sopenharmony_ci * Otherwise, it must be degraded:
164162306a36Sopenharmony_ci *	- recovery is interrupted.
164262306a36Sopenharmony_ci *	- &mddev->degraded is bumped.
164362306a36Sopenharmony_ci *
164462306a36Sopenharmony_ci * @rdev is marked as &Faulty excluding case when array is failed and
164562306a36Sopenharmony_ci * &mddev->fail_last_dev is off.
164662306a36Sopenharmony_ci */
164762306a36Sopenharmony_cistatic void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
164862306a36Sopenharmony_ci{
164962306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
165062306a36Sopenharmony_ci	unsigned long flags;
165162306a36Sopenharmony_ci
165262306a36Sopenharmony_ci	spin_lock_irqsave(&conf->device_lock, flags);
165362306a36Sopenharmony_ci
165462306a36Sopenharmony_ci	if (test_bit(In_sync, &rdev->flags) &&
165562306a36Sopenharmony_ci	    (conf->raid_disks - mddev->degraded) == 1) {
165662306a36Sopenharmony_ci		set_bit(MD_BROKEN, &mddev->flags);
165762306a36Sopenharmony_ci
165862306a36Sopenharmony_ci		if (!mddev->fail_last_dev) {
165962306a36Sopenharmony_ci			conf->recovery_disabled = mddev->recovery_disabled;
166062306a36Sopenharmony_ci			spin_unlock_irqrestore(&conf->device_lock, flags);
166162306a36Sopenharmony_ci			return;
166262306a36Sopenharmony_ci		}
166362306a36Sopenharmony_ci	}
166462306a36Sopenharmony_ci	set_bit(Blocked, &rdev->flags);
166562306a36Sopenharmony_ci	if (test_and_clear_bit(In_sync, &rdev->flags))
166662306a36Sopenharmony_ci		mddev->degraded++;
166762306a36Sopenharmony_ci	set_bit(Faulty, &rdev->flags);
166862306a36Sopenharmony_ci	spin_unlock_irqrestore(&conf->device_lock, flags);
166962306a36Sopenharmony_ci	/*
167062306a36Sopenharmony_ci	 * if recovery is running, make sure it aborts.
167162306a36Sopenharmony_ci	 */
167262306a36Sopenharmony_ci	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
167362306a36Sopenharmony_ci	set_mask_bits(&mddev->sb_flags, 0,
167462306a36Sopenharmony_ci		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
167562306a36Sopenharmony_ci	pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n"
167662306a36Sopenharmony_ci		"md/raid1:%s: Operation continuing on %d devices.\n",
167762306a36Sopenharmony_ci		mdname(mddev), rdev->bdev,
167862306a36Sopenharmony_ci		mdname(mddev), conf->raid_disks - mddev->degraded);
167962306a36Sopenharmony_ci}
168062306a36Sopenharmony_ci
168162306a36Sopenharmony_cistatic void print_conf(struct r1conf *conf)
168262306a36Sopenharmony_ci{
168362306a36Sopenharmony_ci	int i;
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_ci	pr_debug("RAID1 conf printout:\n");
168662306a36Sopenharmony_ci	if (!conf) {
168762306a36Sopenharmony_ci		pr_debug("(!conf)\n");
168862306a36Sopenharmony_ci		return;
168962306a36Sopenharmony_ci	}
169062306a36Sopenharmony_ci	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
169162306a36Sopenharmony_ci		 conf->raid_disks);
169262306a36Sopenharmony_ci
169362306a36Sopenharmony_ci	rcu_read_lock();
169462306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks; i++) {
169562306a36Sopenharmony_ci		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
169662306a36Sopenharmony_ci		if (rdev)
169762306a36Sopenharmony_ci			pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
169862306a36Sopenharmony_ci				 i, !test_bit(In_sync, &rdev->flags),
169962306a36Sopenharmony_ci				 !test_bit(Faulty, &rdev->flags),
170062306a36Sopenharmony_ci				 rdev->bdev);
170162306a36Sopenharmony_ci	}
170262306a36Sopenharmony_ci	rcu_read_unlock();
170362306a36Sopenharmony_ci}
170462306a36Sopenharmony_ci
170562306a36Sopenharmony_cistatic void close_sync(struct r1conf *conf)
170662306a36Sopenharmony_ci{
170762306a36Sopenharmony_ci	int idx;
170862306a36Sopenharmony_ci
170962306a36Sopenharmony_ci	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
171062306a36Sopenharmony_ci		_wait_barrier(conf, idx, false);
171162306a36Sopenharmony_ci		_allow_barrier(conf, idx);
171262306a36Sopenharmony_ci	}
171362306a36Sopenharmony_ci
171462306a36Sopenharmony_ci	mempool_exit(&conf->r1buf_pool);
171562306a36Sopenharmony_ci}
171662306a36Sopenharmony_ci
171762306a36Sopenharmony_cistatic int raid1_spare_active(struct mddev *mddev)
171862306a36Sopenharmony_ci{
171962306a36Sopenharmony_ci	int i;
172062306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
172162306a36Sopenharmony_ci	int count = 0;
172262306a36Sopenharmony_ci	unsigned long flags;
172362306a36Sopenharmony_ci
172462306a36Sopenharmony_ci	/*
172562306a36Sopenharmony_ci	 * Find all failed disks within the RAID1 configuration
172662306a36Sopenharmony_ci	 * and mark them readable.
172762306a36Sopenharmony_ci	 * Called under mddev lock, so rcu protection not needed.
172862306a36Sopenharmony_ci	 * device_lock used to avoid races with raid1_end_read_request
172962306a36Sopenharmony_ci	 * which expects 'In_sync' flags and ->degraded to be consistent.
173062306a36Sopenharmony_ci	 */
173162306a36Sopenharmony_ci	spin_lock_irqsave(&conf->device_lock, flags);
173262306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks; i++) {
173362306a36Sopenharmony_ci		struct md_rdev *rdev = conf->mirrors[i].rdev;
173462306a36Sopenharmony_ci		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
173562306a36Sopenharmony_ci		if (repl
173662306a36Sopenharmony_ci		    && !test_bit(Candidate, &repl->flags)
173762306a36Sopenharmony_ci		    && repl->recovery_offset == MaxSector
173862306a36Sopenharmony_ci		    && !test_bit(Faulty, &repl->flags)
173962306a36Sopenharmony_ci		    && !test_and_set_bit(In_sync, &repl->flags)) {
174062306a36Sopenharmony_ci			/* replacement has just become active */
174162306a36Sopenharmony_ci			if (!rdev ||
174262306a36Sopenharmony_ci			    !test_and_clear_bit(In_sync, &rdev->flags))
174362306a36Sopenharmony_ci				count++;
174462306a36Sopenharmony_ci			if (rdev) {
174562306a36Sopenharmony_ci				/* Replaced device not technically
174662306a36Sopenharmony_ci				 * faulty, but we need to be sure
174762306a36Sopenharmony_ci				 * it gets removed and never re-added
174862306a36Sopenharmony_ci				 */
174962306a36Sopenharmony_ci				set_bit(Faulty, &rdev->flags);
175062306a36Sopenharmony_ci				sysfs_notify_dirent_safe(
175162306a36Sopenharmony_ci					rdev->sysfs_state);
175262306a36Sopenharmony_ci			}
175362306a36Sopenharmony_ci		}
175462306a36Sopenharmony_ci		if (rdev
175562306a36Sopenharmony_ci		    && rdev->recovery_offset == MaxSector
175662306a36Sopenharmony_ci		    && !test_bit(Faulty, &rdev->flags)
175762306a36Sopenharmony_ci		    && !test_and_set_bit(In_sync, &rdev->flags)) {
175862306a36Sopenharmony_ci			count++;
175962306a36Sopenharmony_ci			sysfs_notify_dirent_safe(rdev->sysfs_state);
176062306a36Sopenharmony_ci		}
176162306a36Sopenharmony_ci	}
176262306a36Sopenharmony_ci	mddev->degraded -= count;
176362306a36Sopenharmony_ci	spin_unlock_irqrestore(&conf->device_lock, flags);
176462306a36Sopenharmony_ci
176562306a36Sopenharmony_ci	print_conf(conf);
176662306a36Sopenharmony_ci	return count;
176762306a36Sopenharmony_ci}
176862306a36Sopenharmony_ci
176962306a36Sopenharmony_cistatic int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
177062306a36Sopenharmony_ci{
177162306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
177262306a36Sopenharmony_ci	int err = -EEXIST;
177362306a36Sopenharmony_ci	int mirror = 0, repl_slot = -1;
177462306a36Sopenharmony_ci	struct raid1_info *p;
177562306a36Sopenharmony_ci	int first = 0;
177662306a36Sopenharmony_ci	int last = conf->raid_disks - 1;
177762306a36Sopenharmony_ci
177862306a36Sopenharmony_ci	if (mddev->recovery_disabled == conf->recovery_disabled)
177962306a36Sopenharmony_ci		return -EBUSY;
178062306a36Sopenharmony_ci
178162306a36Sopenharmony_ci	if (md_integrity_add_rdev(rdev, mddev))
178262306a36Sopenharmony_ci		return -ENXIO;
178362306a36Sopenharmony_ci
178462306a36Sopenharmony_ci	if (rdev->raid_disk >= 0)
178562306a36Sopenharmony_ci		first = last = rdev->raid_disk;
178662306a36Sopenharmony_ci
178762306a36Sopenharmony_ci	/*
178862306a36Sopenharmony_ci	 * find the disk ... but prefer rdev->saved_raid_disk
178962306a36Sopenharmony_ci	 * if possible.
179062306a36Sopenharmony_ci	 */
179162306a36Sopenharmony_ci	if (rdev->saved_raid_disk >= 0 &&
179262306a36Sopenharmony_ci	    rdev->saved_raid_disk >= first &&
179362306a36Sopenharmony_ci	    rdev->saved_raid_disk < conf->raid_disks &&
179462306a36Sopenharmony_ci	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
179562306a36Sopenharmony_ci		first = last = rdev->saved_raid_disk;
179662306a36Sopenharmony_ci
179762306a36Sopenharmony_ci	for (mirror = first; mirror <= last; mirror++) {
179862306a36Sopenharmony_ci		p = conf->mirrors + mirror;
179962306a36Sopenharmony_ci		if (!p->rdev) {
180062306a36Sopenharmony_ci			if (mddev->gendisk)
180162306a36Sopenharmony_ci				disk_stack_limits(mddev->gendisk, rdev->bdev,
180262306a36Sopenharmony_ci						  rdev->data_offset << 9);
180362306a36Sopenharmony_ci
180462306a36Sopenharmony_ci			p->head_position = 0;
180562306a36Sopenharmony_ci			rdev->raid_disk = mirror;
180662306a36Sopenharmony_ci			err = 0;
180762306a36Sopenharmony_ci			/* As all devices are equivalent, we don't need a full recovery
180862306a36Sopenharmony_ci			 * if this was recently any drive of the array
180962306a36Sopenharmony_ci			 */
181062306a36Sopenharmony_ci			if (rdev->saved_raid_disk < 0)
181162306a36Sopenharmony_ci				conf->fullsync = 1;
181262306a36Sopenharmony_ci			rcu_assign_pointer(p->rdev, rdev);
181362306a36Sopenharmony_ci			break;
181462306a36Sopenharmony_ci		}
181562306a36Sopenharmony_ci		if (test_bit(WantReplacement, &p->rdev->flags) &&
181662306a36Sopenharmony_ci		    p[conf->raid_disks].rdev == NULL && repl_slot < 0)
181762306a36Sopenharmony_ci			repl_slot = mirror;
181862306a36Sopenharmony_ci	}
181962306a36Sopenharmony_ci
182062306a36Sopenharmony_ci	if (err && repl_slot >= 0) {
182162306a36Sopenharmony_ci		/* Add this device as a replacement */
182262306a36Sopenharmony_ci		p = conf->mirrors + repl_slot;
182362306a36Sopenharmony_ci		clear_bit(In_sync, &rdev->flags);
182462306a36Sopenharmony_ci		set_bit(Replacement, &rdev->flags);
182562306a36Sopenharmony_ci		rdev->raid_disk = repl_slot;
182662306a36Sopenharmony_ci		err = 0;
182762306a36Sopenharmony_ci		conf->fullsync = 1;
182862306a36Sopenharmony_ci		rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
182962306a36Sopenharmony_ci	}
183062306a36Sopenharmony_ci
183162306a36Sopenharmony_ci	print_conf(conf);
183262306a36Sopenharmony_ci	return err;
183362306a36Sopenharmony_ci}
183462306a36Sopenharmony_ci
183562306a36Sopenharmony_cistatic int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
183662306a36Sopenharmony_ci{
183762306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
183862306a36Sopenharmony_ci	int err = 0;
183962306a36Sopenharmony_ci	int number = rdev->raid_disk;
184062306a36Sopenharmony_ci	struct raid1_info *p = conf->mirrors + number;
184162306a36Sopenharmony_ci
184262306a36Sopenharmony_ci	if (unlikely(number >= conf->raid_disks))
184362306a36Sopenharmony_ci		goto abort;
184462306a36Sopenharmony_ci
184562306a36Sopenharmony_ci	if (rdev != p->rdev)
184662306a36Sopenharmony_ci		p = conf->mirrors + conf->raid_disks + number;
184762306a36Sopenharmony_ci
184862306a36Sopenharmony_ci	print_conf(conf);
184962306a36Sopenharmony_ci	if (rdev == p->rdev) {
185062306a36Sopenharmony_ci		if (test_bit(In_sync, &rdev->flags) ||
185162306a36Sopenharmony_ci		    atomic_read(&rdev->nr_pending)) {
185262306a36Sopenharmony_ci			err = -EBUSY;
185362306a36Sopenharmony_ci			goto abort;
185462306a36Sopenharmony_ci		}
185562306a36Sopenharmony_ci		/* Only remove non-faulty devices if recovery
185662306a36Sopenharmony_ci		 * is not possible.
185762306a36Sopenharmony_ci		 */
185862306a36Sopenharmony_ci		if (!test_bit(Faulty, &rdev->flags) &&
185962306a36Sopenharmony_ci		    mddev->recovery_disabled != conf->recovery_disabled &&
186062306a36Sopenharmony_ci		    mddev->degraded < conf->raid_disks) {
186162306a36Sopenharmony_ci			err = -EBUSY;
186262306a36Sopenharmony_ci			goto abort;
186362306a36Sopenharmony_ci		}
186462306a36Sopenharmony_ci		p->rdev = NULL;
186562306a36Sopenharmony_ci		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
186662306a36Sopenharmony_ci			synchronize_rcu();
186762306a36Sopenharmony_ci			if (atomic_read(&rdev->nr_pending)) {
186862306a36Sopenharmony_ci				/* lost the race, try later */
186962306a36Sopenharmony_ci				err = -EBUSY;
187062306a36Sopenharmony_ci				p->rdev = rdev;
187162306a36Sopenharmony_ci				goto abort;
187262306a36Sopenharmony_ci			}
187362306a36Sopenharmony_ci		}
187462306a36Sopenharmony_ci		if (conf->mirrors[conf->raid_disks + number].rdev) {
187562306a36Sopenharmony_ci			/* We just removed a device that is being replaced.
187662306a36Sopenharmony_ci			 * Move down the replacement.  We drain all IO before
187762306a36Sopenharmony_ci			 * doing this to avoid confusion.
187862306a36Sopenharmony_ci			 */
187962306a36Sopenharmony_ci			struct md_rdev *repl =
188062306a36Sopenharmony_ci				conf->mirrors[conf->raid_disks + number].rdev;
188162306a36Sopenharmony_ci			freeze_array(conf, 0);
188262306a36Sopenharmony_ci			if (atomic_read(&repl->nr_pending)) {
188362306a36Sopenharmony_ci				/* It means that some queued IO of retry_list
188462306a36Sopenharmony_ci				 * hold repl. Thus, we cannot set replacement
188562306a36Sopenharmony_ci				 * as NULL, avoiding rdev NULL pointer
188662306a36Sopenharmony_ci				 * dereference in sync_request_write and
188762306a36Sopenharmony_ci				 * handle_write_finished.
188862306a36Sopenharmony_ci				 */
188962306a36Sopenharmony_ci				err = -EBUSY;
189062306a36Sopenharmony_ci				unfreeze_array(conf);
189162306a36Sopenharmony_ci				goto abort;
189262306a36Sopenharmony_ci			}
189362306a36Sopenharmony_ci			clear_bit(Replacement, &repl->flags);
189462306a36Sopenharmony_ci			p->rdev = repl;
189562306a36Sopenharmony_ci			conf->mirrors[conf->raid_disks + number].rdev = NULL;
189662306a36Sopenharmony_ci			unfreeze_array(conf);
189762306a36Sopenharmony_ci		}
189862306a36Sopenharmony_ci
189962306a36Sopenharmony_ci		clear_bit(WantReplacement, &rdev->flags);
190062306a36Sopenharmony_ci		err = md_integrity_register(mddev);
190162306a36Sopenharmony_ci	}
190262306a36Sopenharmony_ciabort:
190362306a36Sopenharmony_ci
190462306a36Sopenharmony_ci	print_conf(conf);
190562306a36Sopenharmony_ci	return err;
190662306a36Sopenharmony_ci}
190762306a36Sopenharmony_ci
190862306a36Sopenharmony_cistatic void end_sync_read(struct bio *bio)
190962306a36Sopenharmony_ci{
191062306a36Sopenharmony_ci	struct r1bio *r1_bio = get_resync_r1bio(bio);
191162306a36Sopenharmony_ci
191262306a36Sopenharmony_ci	update_head_pos(r1_bio->read_disk, r1_bio);
191362306a36Sopenharmony_ci
191462306a36Sopenharmony_ci	/*
191562306a36Sopenharmony_ci	 * we have read a block, now it needs to be re-written,
191662306a36Sopenharmony_ci	 * or re-read if the read failed.
191762306a36Sopenharmony_ci	 * We don't do much here, just schedule handling by raid1d
191862306a36Sopenharmony_ci	 */
191962306a36Sopenharmony_ci	if (!bio->bi_status)
192062306a36Sopenharmony_ci		set_bit(R1BIO_Uptodate, &r1_bio->state);
192162306a36Sopenharmony_ci
192262306a36Sopenharmony_ci	if (atomic_dec_and_test(&r1_bio->remaining))
192362306a36Sopenharmony_ci		reschedule_retry(r1_bio);
192462306a36Sopenharmony_ci}
192562306a36Sopenharmony_ci
192662306a36Sopenharmony_cistatic void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
192762306a36Sopenharmony_ci{
192862306a36Sopenharmony_ci	sector_t sync_blocks = 0;
192962306a36Sopenharmony_ci	sector_t s = r1_bio->sector;
193062306a36Sopenharmony_ci	long sectors_to_go = r1_bio->sectors;
193162306a36Sopenharmony_ci
193262306a36Sopenharmony_ci	/* make sure these bits don't get cleared. */
193362306a36Sopenharmony_ci	do {
193462306a36Sopenharmony_ci		md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
193562306a36Sopenharmony_ci		s += sync_blocks;
193662306a36Sopenharmony_ci		sectors_to_go -= sync_blocks;
193762306a36Sopenharmony_ci	} while (sectors_to_go > 0);
193862306a36Sopenharmony_ci}
193962306a36Sopenharmony_ci
194062306a36Sopenharmony_cistatic void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
194162306a36Sopenharmony_ci{
194262306a36Sopenharmony_ci	if (atomic_dec_and_test(&r1_bio->remaining)) {
194362306a36Sopenharmony_ci		struct mddev *mddev = r1_bio->mddev;
194462306a36Sopenharmony_ci		int s = r1_bio->sectors;
194562306a36Sopenharmony_ci
194662306a36Sopenharmony_ci		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
194762306a36Sopenharmony_ci		    test_bit(R1BIO_WriteError, &r1_bio->state))
194862306a36Sopenharmony_ci			reschedule_retry(r1_bio);
194962306a36Sopenharmony_ci		else {
195062306a36Sopenharmony_ci			put_buf(r1_bio);
195162306a36Sopenharmony_ci			md_done_sync(mddev, s, uptodate);
195262306a36Sopenharmony_ci		}
195362306a36Sopenharmony_ci	}
195462306a36Sopenharmony_ci}
195562306a36Sopenharmony_ci
195662306a36Sopenharmony_cistatic void end_sync_write(struct bio *bio)
195762306a36Sopenharmony_ci{
195862306a36Sopenharmony_ci	int uptodate = !bio->bi_status;
195962306a36Sopenharmony_ci	struct r1bio *r1_bio = get_resync_r1bio(bio);
196062306a36Sopenharmony_ci	struct mddev *mddev = r1_bio->mddev;
196162306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
196262306a36Sopenharmony_ci	sector_t first_bad;
196362306a36Sopenharmony_ci	int bad_sectors;
196462306a36Sopenharmony_ci	struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
196562306a36Sopenharmony_ci
196662306a36Sopenharmony_ci	if (!uptodate) {
196762306a36Sopenharmony_ci		abort_sync_write(mddev, r1_bio);
196862306a36Sopenharmony_ci		set_bit(WriteErrorSeen, &rdev->flags);
196962306a36Sopenharmony_ci		if (!test_and_set_bit(WantReplacement, &rdev->flags))
197062306a36Sopenharmony_ci			set_bit(MD_RECOVERY_NEEDED, &
197162306a36Sopenharmony_ci				mddev->recovery);
197262306a36Sopenharmony_ci		set_bit(R1BIO_WriteError, &r1_bio->state);
197362306a36Sopenharmony_ci	} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
197462306a36Sopenharmony_ci			       &first_bad, &bad_sectors) &&
197562306a36Sopenharmony_ci		   !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
197662306a36Sopenharmony_ci				r1_bio->sector,
197762306a36Sopenharmony_ci				r1_bio->sectors,
197862306a36Sopenharmony_ci				&first_bad, &bad_sectors)
197962306a36Sopenharmony_ci		)
198062306a36Sopenharmony_ci		set_bit(R1BIO_MadeGood, &r1_bio->state);
198162306a36Sopenharmony_ci
198262306a36Sopenharmony_ci	put_sync_write_buf(r1_bio, uptodate);
198362306a36Sopenharmony_ci}
198462306a36Sopenharmony_ci
198562306a36Sopenharmony_cistatic int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
198662306a36Sopenharmony_ci			   int sectors, struct page *page, blk_opf_t rw)
198762306a36Sopenharmony_ci{
198862306a36Sopenharmony_ci	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
198962306a36Sopenharmony_ci		/* success */
199062306a36Sopenharmony_ci		return 1;
199162306a36Sopenharmony_ci	if (rw == REQ_OP_WRITE) {
199262306a36Sopenharmony_ci		set_bit(WriteErrorSeen, &rdev->flags);
199362306a36Sopenharmony_ci		if (!test_and_set_bit(WantReplacement,
199462306a36Sopenharmony_ci				      &rdev->flags))
199562306a36Sopenharmony_ci			set_bit(MD_RECOVERY_NEEDED, &
199662306a36Sopenharmony_ci				rdev->mddev->recovery);
199762306a36Sopenharmony_ci	}
199862306a36Sopenharmony_ci	/* need to record an error - either for the block or the device */
199962306a36Sopenharmony_ci	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
200062306a36Sopenharmony_ci		md_error(rdev->mddev, rdev);
200162306a36Sopenharmony_ci	return 0;
200262306a36Sopenharmony_ci}
200362306a36Sopenharmony_ci
200462306a36Sopenharmony_cistatic int fix_sync_read_error(struct r1bio *r1_bio)
200562306a36Sopenharmony_ci{
200662306a36Sopenharmony_ci	/* Try some synchronous reads of other devices to get
200762306a36Sopenharmony_ci	 * good data, much like with normal read errors.  Only
200862306a36Sopenharmony_ci	 * read into the pages we already have so we don't
200962306a36Sopenharmony_ci	 * need to re-issue the read request.
201062306a36Sopenharmony_ci	 * We don't need to freeze the array, because being in an
201162306a36Sopenharmony_ci	 * active sync request, there is no normal IO, and
201262306a36Sopenharmony_ci	 * no overlapping syncs.
201362306a36Sopenharmony_ci	 * We don't need to check is_badblock() again as we
201462306a36Sopenharmony_ci	 * made sure that anything with a bad block in range
201562306a36Sopenharmony_ci	 * will have bi_end_io clear.
201662306a36Sopenharmony_ci	 */
201762306a36Sopenharmony_ci	struct mddev *mddev = r1_bio->mddev;
201862306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
201962306a36Sopenharmony_ci	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
202062306a36Sopenharmony_ci	struct page **pages = get_resync_pages(bio)->pages;
202162306a36Sopenharmony_ci	sector_t sect = r1_bio->sector;
202262306a36Sopenharmony_ci	int sectors = r1_bio->sectors;
202362306a36Sopenharmony_ci	int idx = 0;
202462306a36Sopenharmony_ci	struct md_rdev *rdev;
202562306a36Sopenharmony_ci
202662306a36Sopenharmony_ci	rdev = conf->mirrors[r1_bio->read_disk].rdev;
202762306a36Sopenharmony_ci	if (test_bit(FailFast, &rdev->flags)) {
202862306a36Sopenharmony_ci		/* Don't try recovering from here - just fail it
202962306a36Sopenharmony_ci		 * ... unless it is the last working device of course */
203062306a36Sopenharmony_ci		md_error(mddev, rdev);
203162306a36Sopenharmony_ci		if (test_bit(Faulty, &rdev->flags))
203262306a36Sopenharmony_ci			/* Don't try to read from here, but make sure
203362306a36Sopenharmony_ci			 * put_buf does it's thing
203462306a36Sopenharmony_ci			 */
203562306a36Sopenharmony_ci			bio->bi_end_io = end_sync_write;
203662306a36Sopenharmony_ci	}
203762306a36Sopenharmony_ci
203862306a36Sopenharmony_ci	while(sectors) {
203962306a36Sopenharmony_ci		int s = sectors;
204062306a36Sopenharmony_ci		int d = r1_bio->read_disk;
204162306a36Sopenharmony_ci		int success = 0;
204262306a36Sopenharmony_ci		int start;
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci		if (s > (PAGE_SIZE>>9))
204562306a36Sopenharmony_ci			s = PAGE_SIZE >> 9;
204662306a36Sopenharmony_ci		do {
204762306a36Sopenharmony_ci			if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
204862306a36Sopenharmony_ci				/* No rcu protection needed here devices
204962306a36Sopenharmony_ci				 * can only be removed when no resync is
205062306a36Sopenharmony_ci				 * active, and resync is currently active
205162306a36Sopenharmony_ci				 */
205262306a36Sopenharmony_ci				rdev = conf->mirrors[d].rdev;
205362306a36Sopenharmony_ci				if (sync_page_io(rdev, sect, s<<9,
205462306a36Sopenharmony_ci						 pages[idx],
205562306a36Sopenharmony_ci						 REQ_OP_READ, false)) {
205662306a36Sopenharmony_ci					success = 1;
205762306a36Sopenharmony_ci					break;
205862306a36Sopenharmony_ci				}
205962306a36Sopenharmony_ci			}
206062306a36Sopenharmony_ci			d++;
206162306a36Sopenharmony_ci			if (d == conf->raid_disks * 2)
206262306a36Sopenharmony_ci				d = 0;
206362306a36Sopenharmony_ci		} while (!success && d != r1_bio->read_disk);
206462306a36Sopenharmony_ci
206562306a36Sopenharmony_ci		if (!success) {
206662306a36Sopenharmony_ci			int abort = 0;
206762306a36Sopenharmony_ci			/* Cannot read from anywhere, this block is lost.
206862306a36Sopenharmony_ci			 * Record a bad block on each device.  If that doesn't
206962306a36Sopenharmony_ci			 * work just disable and interrupt the recovery.
207062306a36Sopenharmony_ci			 * Don't fail devices as that won't really help.
207162306a36Sopenharmony_ci			 */
207262306a36Sopenharmony_ci			pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
207362306a36Sopenharmony_ci					    mdname(mddev), bio->bi_bdev,
207462306a36Sopenharmony_ci					    (unsigned long long)r1_bio->sector);
207562306a36Sopenharmony_ci			for (d = 0; d < conf->raid_disks * 2; d++) {
207662306a36Sopenharmony_ci				rdev = conf->mirrors[d].rdev;
207762306a36Sopenharmony_ci				if (!rdev || test_bit(Faulty, &rdev->flags))
207862306a36Sopenharmony_ci					continue;
207962306a36Sopenharmony_ci				if (!rdev_set_badblocks(rdev, sect, s, 0))
208062306a36Sopenharmony_ci					abort = 1;
208162306a36Sopenharmony_ci			}
208262306a36Sopenharmony_ci			if (abort) {
208362306a36Sopenharmony_ci				conf->recovery_disabled =
208462306a36Sopenharmony_ci					mddev->recovery_disabled;
208562306a36Sopenharmony_ci				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
208662306a36Sopenharmony_ci				md_done_sync(mddev, r1_bio->sectors, 0);
208762306a36Sopenharmony_ci				put_buf(r1_bio);
208862306a36Sopenharmony_ci				return 0;
208962306a36Sopenharmony_ci			}
209062306a36Sopenharmony_ci			/* Try next page */
209162306a36Sopenharmony_ci			sectors -= s;
209262306a36Sopenharmony_ci			sect += s;
209362306a36Sopenharmony_ci			idx++;
209462306a36Sopenharmony_ci			continue;
209562306a36Sopenharmony_ci		}
209662306a36Sopenharmony_ci
209762306a36Sopenharmony_ci		start = d;
209862306a36Sopenharmony_ci		/* write it back and re-read */
209962306a36Sopenharmony_ci		while (d != r1_bio->read_disk) {
210062306a36Sopenharmony_ci			if (d == 0)
210162306a36Sopenharmony_ci				d = conf->raid_disks * 2;
210262306a36Sopenharmony_ci			d--;
210362306a36Sopenharmony_ci			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
210462306a36Sopenharmony_ci				continue;
210562306a36Sopenharmony_ci			rdev = conf->mirrors[d].rdev;
210662306a36Sopenharmony_ci			if (r1_sync_page_io(rdev, sect, s,
210762306a36Sopenharmony_ci					    pages[idx],
210862306a36Sopenharmony_ci					    REQ_OP_WRITE) == 0) {
210962306a36Sopenharmony_ci				r1_bio->bios[d]->bi_end_io = NULL;
211062306a36Sopenharmony_ci				rdev_dec_pending(rdev, mddev);
211162306a36Sopenharmony_ci			}
211262306a36Sopenharmony_ci		}
211362306a36Sopenharmony_ci		d = start;
211462306a36Sopenharmony_ci		while (d != r1_bio->read_disk) {
211562306a36Sopenharmony_ci			if (d == 0)
211662306a36Sopenharmony_ci				d = conf->raid_disks * 2;
211762306a36Sopenharmony_ci			d--;
211862306a36Sopenharmony_ci			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
211962306a36Sopenharmony_ci				continue;
212062306a36Sopenharmony_ci			rdev = conf->mirrors[d].rdev;
212162306a36Sopenharmony_ci			if (r1_sync_page_io(rdev, sect, s,
212262306a36Sopenharmony_ci					    pages[idx],
212362306a36Sopenharmony_ci					    REQ_OP_READ) != 0)
212462306a36Sopenharmony_ci				atomic_add(s, &rdev->corrected_errors);
212562306a36Sopenharmony_ci		}
212662306a36Sopenharmony_ci		sectors -= s;
212762306a36Sopenharmony_ci		sect += s;
212862306a36Sopenharmony_ci		idx ++;
212962306a36Sopenharmony_ci	}
213062306a36Sopenharmony_ci	set_bit(R1BIO_Uptodate, &r1_bio->state);
213162306a36Sopenharmony_ci	bio->bi_status = 0;
213262306a36Sopenharmony_ci	return 1;
213362306a36Sopenharmony_ci}
213462306a36Sopenharmony_ci
213562306a36Sopenharmony_cistatic void process_checks(struct r1bio *r1_bio)
213662306a36Sopenharmony_ci{
213762306a36Sopenharmony_ci	/* We have read all readable devices.  If we haven't
213862306a36Sopenharmony_ci	 * got the block, then there is no hope left.
213962306a36Sopenharmony_ci	 * If we have, then we want to do a comparison
214062306a36Sopenharmony_ci	 * and skip the write if everything is the same.
214162306a36Sopenharmony_ci	 * If any blocks failed to read, then we need to
214262306a36Sopenharmony_ci	 * attempt an over-write
214362306a36Sopenharmony_ci	 */
214462306a36Sopenharmony_ci	struct mddev *mddev = r1_bio->mddev;
214562306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
214662306a36Sopenharmony_ci	int primary;
214762306a36Sopenharmony_ci	int i;
214862306a36Sopenharmony_ci	int vcnt;
214962306a36Sopenharmony_ci
215062306a36Sopenharmony_ci	/* Fix variable parts of all bios */
215162306a36Sopenharmony_ci	vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
215262306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks * 2; i++) {
215362306a36Sopenharmony_ci		blk_status_t status;
215462306a36Sopenharmony_ci		struct bio *b = r1_bio->bios[i];
215562306a36Sopenharmony_ci		struct resync_pages *rp = get_resync_pages(b);
215662306a36Sopenharmony_ci		if (b->bi_end_io != end_sync_read)
215762306a36Sopenharmony_ci			continue;
215862306a36Sopenharmony_ci		/* fixup the bio for reuse, but preserve errno */
215962306a36Sopenharmony_ci		status = b->bi_status;
216062306a36Sopenharmony_ci		bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ);
216162306a36Sopenharmony_ci		b->bi_status = status;
216262306a36Sopenharmony_ci		b->bi_iter.bi_sector = r1_bio->sector +
216362306a36Sopenharmony_ci			conf->mirrors[i].rdev->data_offset;
216462306a36Sopenharmony_ci		b->bi_end_io = end_sync_read;
216562306a36Sopenharmony_ci		rp->raid_bio = r1_bio;
216662306a36Sopenharmony_ci		b->bi_private = rp;
216762306a36Sopenharmony_ci
216862306a36Sopenharmony_ci		/* initialize bvec table again */
216962306a36Sopenharmony_ci		md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9);
217062306a36Sopenharmony_ci	}
217162306a36Sopenharmony_ci	for (primary = 0; primary < conf->raid_disks * 2; primary++)
217262306a36Sopenharmony_ci		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
217362306a36Sopenharmony_ci		    !r1_bio->bios[primary]->bi_status) {
217462306a36Sopenharmony_ci			r1_bio->bios[primary]->bi_end_io = NULL;
217562306a36Sopenharmony_ci			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
217662306a36Sopenharmony_ci			break;
217762306a36Sopenharmony_ci		}
217862306a36Sopenharmony_ci	r1_bio->read_disk = primary;
217962306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks * 2; i++) {
218062306a36Sopenharmony_ci		int j = 0;
218162306a36Sopenharmony_ci		struct bio *pbio = r1_bio->bios[primary];
218262306a36Sopenharmony_ci		struct bio *sbio = r1_bio->bios[i];
218362306a36Sopenharmony_ci		blk_status_t status = sbio->bi_status;
218462306a36Sopenharmony_ci		struct page **ppages = get_resync_pages(pbio)->pages;
218562306a36Sopenharmony_ci		struct page **spages = get_resync_pages(sbio)->pages;
218662306a36Sopenharmony_ci		struct bio_vec *bi;
218762306a36Sopenharmony_ci		int page_len[RESYNC_PAGES] = { 0 };
218862306a36Sopenharmony_ci		struct bvec_iter_all iter_all;
218962306a36Sopenharmony_ci
219062306a36Sopenharmony_ci		if (sbio->bi_end_io != end_sync_read)
219162306a36Sopenharmony_ci			continue;
219262306a36Sopenharmony_ci		/* Now we can 'fixup' the error value */
219362306a36Sopenharmony_ci		sbio->bi_status = 0;
219462306a36Sopenharmony_ci
219562306a36Sopenharmony_ci		bio_for_each_segment_all(bi, sbio, iter_all)
219662306a36Sopenharmony_ci			page_len[j++] = bi->bv_len;
219762306a36Sopenharmony_ci
219862306a36Sopenharmony_ci		if (!status) {
219962306a36Sopenharmony_ci			for (j = vcnt; j-- ; ) {
220062306a36Sopenharmony_ci				if (memcmp(page_address(ppages[j]),
220162306a36Sopenharmony_ci					   page_address(spages[j]),
220262306a36Sopenharmony_ci					   page_len[j]))
220362306a36Sopenharmony_ci					break;
220462306a36Sopenharmony_ci			}
220562306a36Sopenharmony_ci		} else
220662306a36Sopenharmony_ci			j = 0;
220762306a36Sopenharmony_ci		if (j >= 0)
220862306a36Sopenharmony_ci			atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
220962306a36Sopenharmony_ci		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
221062306a36Sopenharmony_ci			      && !status)) {
221162306a36Sopenharmony_ci			/* No need to write to this device. */
221262306a36Sopenharmony_ci			sbio->bi_end_io = NULL;
221362306a36Sopenharmony_ci			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
221462306a36Sopenharmony_ci			continue;
221562306a36Sopenharmony_ci		}
221662306a36Sopenharmony_ci
221762306a36Sopenharmony_ci		bio_copy_data(sbio, pbio);
221862306a36Sopenharmony_ci	}
221962306a36Sopenharmony_ci}
222062306a36Sopenharmony_ci
222162306a36Sopenharmony_cistatic void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
222262306a36Sopenharmony_ci{
222362306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
222462306a36Sopenharmony_ci	int i;
222562306a36Sopenharmony_ci	int disks = conf->raid_disks * 2;
222662306a36Sopenharmony_ci	struct bio *wbio;
222762306a36Sopenharmony_ci
222862306a36Sopenharmony_ci	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
222962306a36Sopenharmony_ci		/* ouch - failed to read all of that. */
223062306a36Sopenharmony_ci		if (!fix_sync_read_error(r1_bio))
223162306a36Sopenharmony_ci			return;
223262306a36Sopenharmony_ci
223362306a36Sopenharmony_ci	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
223462306a36Sopenharmony_ci		process_checks(r1_bio);
223562306a36Sopenharmony_ci
223662306a36Sopenharmony_ci	/*
223762306a36Sopenharmony_ci	 * schedule writes
223862306a36Sopenharmony_ci	 */
223962306a36Sopenharmony_ci	atomic_set(&r1_bio->remaining, 1);
224062306a36Sopenharmony_ci	for (i = 0; i < disks ; i++) {
224162306a36Sopenharmony_ci		wbio = r1_bio->bios[i];
224262306a36Sopenharmony_ci		if (wbio->bi_end_io == NULL ||
224362306a36Sopenharmony_ci		    (wbio->bi_end_io == end_sync_read &&
224462306a36Sopenharmony_ci		     (i == r1_bio->read_disk ||
224562306a36Sopenharmony_ci		      !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
224662306a36Sopenharmony_ci			continue;
224762306a36Sopenharmony_ci		if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) {
224862306a36Sopenharmony_ci			abort_sync_write(mddev, r1_bio);
224962306a36Sopenharmony_ci			continue;
225062306a36Sopenharmony_ci		}
225162306a36Sopenharmony_ci
225262306a36Sopenharmony_ci		wbio->bi_opf = REQ_OP_WRITE;
225362306a36Sopenharmony_ci		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
225462306a36Sopenharmony_ci			wbio->bi_opf |= MD_FAILFAST;
225562306a36Sopenharmony_ci
225662306a36Sopenharmony_ci		wbio->bi_end_io = end_sync_write;
225762306a36Sopenharmony_ci		atomic_inc(&r1_bio->remaining);
225862306a36Sopenharmony_ci		md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
225962306a36Sopenharmony_ci
226062306a36Sopenharmony_ci		submit_bio_noacct(wbio);
226162306a36Sopenharmony_ci	}
226262306a36Sopenharmony_ci
226362306a36Sopenharmony_ci	put_sync_write_buf(r1_bio, 1);
226462306a36Sopenharmony_ci}
226562306a36Sopenharmony_ci
226662306a36Sopenharmony_ci/*
226762306a36Sopenharmony_ci * This is a kernel thread which:
226862306a36Sopenharmony_ci *
226962306a36Sopenharmony_ci *	1.	Retries failed read operations on working mirrors.
227062306a36Sopenharmony_ci *	2.	Updates the raid superblock when problems encounter.
227162306a36Sopenharmony_ci *	3.	Performs writes following reads for array synchronising.
227262306a36Sopenharmony_ci */
227362306a36Sopenharmony_ci
227462306a36Sopenharmony_cistatic void fix_read_error(struct r1conf *conf, int read_disk,
227562306a36Sopenharmony_ci			   sector_t sect, int sectors)
227662306a36Sopenharmony_ci{
227762306a36Sopenharmony_ci	struct mddev *mddev = conf->mddev;
227862306a36Sopenharmony_ci	while(sectors) {
227962306a36Sopenharmony_ci		int s = sectors;
228062306a36Sopenharmony_ci		int d = read_disk;
228162306a36Sopenharmony_ci		int success = 0;
228262306a36Sopenharmony_ci		int start;
228362306a36Sopenharmony_ci		struct md_rdev *rdev;
228462306a36Sopenharmony_ci
228562306a36Sopenharmony_ci		if (s > (PAGE_SIZE>>9))
228662306a36Sopenharmony_ci			s = PAGE_SIZE >> 9;
228762306a36Sopenharmony_ci
228862306a36Sopenharmony_ci		do {
228962306a36Sopenharmony_ci			sector_t first_bad;
229062306a36Sopenharmony_ci			int bad_sectors;
229162306a36Sopenharmony_ci
229262306a36Sopenharmony_ci			rcu_read_lock();
229362306a36Sopenharmony_ci			rdev = rcu_dereference(conf->mirrors[d].rdev);
229462306a36Sopenharmony_ci			if (rdev &&
229562306a36Sopenharmony_ci			    (test_bit(In_sync, &rdev->flags) ||
229662306a36Sopenharmony_ci			     (!test_bit(Faulty, &rdev->flags) &&
229762306a36Sopenharmony_ci			      rdev->recovery_offset >= sect + s)) &&
229862306a36Sopenharmony_ci			    is_badblock(rdev, sect, s,
229962306a36Sopenharmony_ci					&first_bad, &bad_sectors) == 0) {
230062306a36Sopenharmony_ci				atomic_inc(&rdev->nr_pending);
230162306a36Sopenharmony_ci				rcu_read_unlock();
230262306a36Sopenharmony_ci				if (sync_page_io(rdev, sect, s<<9,
230362306a36Sopenharmony_ci					 conf->tmppage, REQ_OP_READ, false))
230462306a36Sopenharmony_ci					success = 1;
230562306a36Sopenharmony_ci				rdev_dec_pending(rdev, mddev);
230662306a36Sopenharmony_ci				if (success)
230762306a36Sopenharmony_ci					break;
230862306a36Sopenharmony_ci			} else
230962306a36Sopenharmony_ci				rcu_read_unlock();
231062306a36Sopenharmony_ci			d++;
231162306a36Sopenharmony_ci			if (d == conf->raid_disks * 2)
231262306a36Sopenharmony_ci				d = 0;
231362306a36Sopenharmony_ci		} while (d != read_disk);
231462306a36Sopenharmony_ci
231562306a36Sopenharmony_ci		if (!success) {
231662306a36Sopenharmony_ci			/* Cannot read from anywhere - mark it bad */
231762306a36Sopenharmony_ci			struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
231862306a36Sopenharmony_ci			if (!rdev_set_badblocks(rdev, sect, s, 0))
231962306a36Sopenharmony_ci				md_error(mddev, rdev);
232062306a36Sopenharmony_ci			break;
232162306a36Sopenharmony_ci		}
232262306a36Sopenharmony_ci		/* write it back and re-read */
232362306a36Sopenharmony_ci		start = d;
232462306a36Sopenharmony_ci		while (d != read_disk) {
232562306a36Sopenharmony_ci			if (d==0)
232662306a36Sopenharmony_ci				d = conf->raid_disks * 2;
232762306a36Sopenharmony_ci			d--;
232862306a36Sopenharmony_ci			rcu_read_lock();
232962306a36Sopenharmony_ci			rdev = rcu_dereference(conf->mirrors[d].rdev);
233062306a36Sopenharmony_ci			if (rdev &&
233162306a36Sopenharmony_ci			    !test_bit(Faulty, &rdev->flags)) {
233262306a36Sopenharmony_ci				atomic_inc(&rdev->nr_pending);
233362306a36Sopenharmony_ci				rcu_read_unlock();
233462306a36Sopenharmony_ci				r1_sync_page_io(rdev, sect, s,
233562306a36Sopenharmony_ci						conf->tmppage, REQ_OP_WRITE);
233662306a36Sopenharmony_ci				rdev_dec_pending(rdev, mddev);
233762306a36Sopenharmony_ci			} else
233862306a36Sopenharmony_ci				rcu_read_unlock();
233962306a36Sopenharmony_ci		}
234062306a36Sopenharmony_ci		d = start;
234162306a36Sopenharmony_ci		while (d != read_disk) {
234262306a36Sopenharmony_ci			if (d==0)
234362306a36Sopenharmony_ci				d = conf->raid_disks * 2;
234462306a36Sopenharmony_ci			d--;
234562306a36Sopenharmony_ci			rcu_read_lock();
234662306a36Sopenharmony_ci			rdev = rcu_dereference(conf->mirrors[d].rdev);
234762306a36Sopenharmony_ci			if (rdev &&
234862306a36Sopenharmony_ci			    !test_bit(Faulty, &rdev->flags)) {
234962306a36Sopenharmony_ci				atomic_inc(&rdev->nr_pending);
235062306a36Sopenharmony_ci				rcu_read_unlock();
235162306a36Sopenharmony_ci				if (r1_sync_page_io(rdev, sect, s,
235262306a36Sopenharmony_ci						conf->tmppage, REQ_OP_READ)) {
235362306a36Sopenharmony_ci					atomic_add(s, &rdev->corrected_errors);
235462306a36Sopenharmony_ci					pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n",
235562306a36Sopenharmony_ci						mdname(mddev), s,
235662306a36Sopenharmony_ci						(unsigned long long)(sect +
235762306a36Sopenharmony_ci								     rdev->data_offset),
235862306a36Sopenharmony_ci						rdev->bdev);
235962306a36Sopenharmony_ci				}
236062306a36Sopenharmony_ci				rdev_dec_pending(rdev, mddev);
236162306a36Sopenharmony_ci			} else
236262306a36Sopenharmony_ci				rcu_read_unlock();
236362306a36Sopenharmony_ci		}
236462306a36Sopenharmony_ci		sectors -= s;
236562306a36Sopenharmony_ci		sect += s;
236662306a36Sopenharmony_ci	}
236762306a36Sopenharmony_ci}
236862306a36Sopenharmony_ci
236962306a36Sopenharmony_cistatic int narrow_write_error(struct r1bio *r1_bio, int i)
237062306a36Sopenharmony_ci{
237162306a36Sopenharmony_ci	struct mddev *mddev = r1_bio->mddev;
237262306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
237362306a36Sopenharmony_ci	struct md_rdev *rdev = conf->mirrors[i].rdev;
237462306a36Sopenharmony_ci
237562306a36Sopenharmony_ci	/* bio has the data to be written to device 'i' where
237662306a36Sopenharmony_ci	 * we just recently had a write error.
237762306a36Sopenharmony_ci	 * We repeatedly clone the bio and trim down to one block,
237862306a36Sopenharmony_ci	 * then try the write.  Where the write fails we record
237962306a36Sopenharmony_ci	 * a bad block.
238062306a36Sopenharmony_ci	 * It is conceivable that the bio doesn't exactly align with
238162306a36Sopenharmony_ci	 * blocks.  We must handle this somehow.
238262306a36Sopenharmony_ci	 *
238362306a36Sopenharmony_ci	 * We currently own a reference on the rdev.
238462306a36Sopenharmony_ci	 */
238562306a36Sopenharmony_ci
238662306a36Sopenharmony_ci	int block_sectors;
238762306a36Sopenharmony_ci	sector_t sector;
238862306a36Sopenharmony_ci	int sectors;
238962306a36Sopenharmony_ci	int sect_to_write = r1_bio->sectors;
239062306a36Sopenharmony_ci	int ok = 1;
239162306a36Sopenharmony_ci
239262306a36Sopenharmony_ci	if (rdev->badblocks.shift < 0)
239362306a36Sopenharmony_ci		return 0;
239462306a36Sopenharmony_ci
239562306a36Sopenharmony_ci	block_sectors = roundup(1 << rdev->badblocks.shift,
239662306a36Sopenharmony_ci				bdev_logical_block_size(rdev->bdev) >> 9);
239762306a36Sopenharmony_ci	sector = r1_bio->sector;
239862306a36Sopenharmony_ci	sectors = ((sector + block_sectors)
239962306a36Sopenharmony_ci		   & ~(sector_t)(block_sectors - 1))
240062306a36Sopenharmony_ci		- sector;
240162306a36Sopenharmony_ci
240262306a36Sopenharmony_ci	while (sect_to_write) {
240362306a36Sopenharmony_ci		struct bio *wbio;
240462306a36Sopenharmony_ci		if (sectors > sect_to_write)
240562306a36Sopenharmony_ci			sectors = sect_to_write;
240662306a36Sopenharmony_ci		/* Write at 'sector' for 'sectors'*/
240762306a36Sopenharmony_ci
240862306a36Sopenharmony_ci		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
240962306a36Sopenharmony_ci			wbio = bio_alloc_clone(rdev->bdev,
241062306a36Sopenharmony_ci					       r1_bio->behind_master_bio,
241162306a36Sopenharmony_ci					       GFP_NOIO, &mddev->bio_set);
241262306a36Sopenharmony_ci		} else {
241362306a36Sopenharmony_ci			wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio,
241462306a36Sopenharmony_ci					       GFP_NOIO, &mddev->bio_set);
241562306a36Sopenharmony_ci		}
241662306a36Sopenharmony_ci
241762306a36Sopenharmony_ci		wbio->bi_opf = REQ_OP_WRITE;
241862306a36Sopenharmony_ci		wbio->bi_iter.bi_sector = r1_bio->sector;
241962306a36Sopenharmony_ci		wbio->bi_iter.bi_size = r1_bio->sectors << 9;
242062306a36Sopenharmony_ci
242162306a36Sopenharmony_ci		bio_trim(wbio, sector - r1_bio->sector, sectors);
242262306a36Sopenharmony_ci		wbio->bi_iter.bi_sector += rdev->data_offset;
242362306a36Sopenharmony_ci
242462306a36Sopenharmony_ci		if (submit_bio_wait(wbio) < 0)
242562306a36Sopenharmony_ci			/* failure! */
242662306a36Sopenharmony_ci			ok = rdev_set_badblocks(rdev, sector,
242762306a36Sopenharmony_ci						sectors, 0)
242862306a36Sopenharmony_ci				&& ok;
242962306a36Sopenharmony_ci
243062306a36Sopenharmony_ci		bio_put(wbio);
243162306a36Sopenharmony_ci		sect_to_write -= sectors;
243262306a36Sopenharmony_ci		sector += sectors;
243362306a36Sopenharmony_ci		sectors = block_sectors;
243462306a36Sopenharmony_ci	}
243562306a36Sopenharmony_ci	return ok;
243662306a36Sopenharmony_ci}
243762306a36Sopenharmony_ci
243862306a36Sopenharmony_cistatic void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
243962306a36Sopenharmony_ci{
244062306a36Sopenharmony_ci	int m;
244162306a36Sopenharmony_ci	int s = r1_bio->sectors;
244262306a36Sopenharmony_ci	for (m = 0; m < conf->raid_disks * 2 ; m++) {
244362306a36Sopenharmony_ci		struct md_rdev *rdev = conf->mirrors[m].rdev;
244462306a36Sopenharmony_ci		struct bio *bio = r1_bio->bios[m];
244562306a36Sopenharmony_ci		if (bio->bi_end_io == NULL)
244662306a36Sopenharmony_ci			continue;
244762306a36Sopenharmony_ci		if (!bio->bi_status &&
244862306a36Sopenharmony_ci		    test_bit(R1BIO_MadeGood, &r1_bio->state)) {
244962306a36Sopenharmony_ci			rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
245062306a36Sopenharmony_ci		}
245162306a36Sopenharmony_ci		if (bio->bi_status &&
245262306a36Sopenharmony_ci		    test_bit(R1BIO_WriteError, &r1_bio->state)) {
245362306a36Sopenharmony_ci			if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
245462306a36Sopenharmony_ci				md_error(conf->mddev, rdev);
245562306a36Sopenharmony_ci		}
245662306a36Sopenharmony_ci	}
245762306a36Sopenharmony_ci	put_buf(r1_bio);
245862306a36Sopenharmony_ci	md_done_sync(conf->mddev, s, 1);
245962306a36Sopenharmony_ci}
246062306a36Sopenharmony_ci
246162306a36Sopenharmony_cistatic void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
246262306a36Sopenharmony_ci{
246362306a36Sopenharmony_ci	int m, idx;
246462306a36Sopenharmony_ci	bool fail = false;
246562306a36Sopenharmony_ci
246662306a36Sopenharmony_ci	for (m = 0; m < conf->raid_disks * 2 ; m++)
246762306a36Sopenharmony_ci		if (r1_bio->bios[m] == IO_MADE_GOOD) {
246862306a36Sopenharmony_ci			struct md_rdev *rdev = conf->mirrors[m].rdev;
246962306a36Sopenharmony_ci			rdev_clear_badblocks(rdev,
247062306a36Sopenharmony_ci					     r1_bio->sector,
247162306a36Sopenharmony_ci					     r1_bio->sectors, 0);
247262306a36Sopenharmony_ci			rdev_dec_pending(rdev, conf->mddev);
247362306a36Sopenharmony_ci		} else if (r1_bio->bios[m] != NULL) {
247462306a36Sopenharmony_ci			/* This drive got a write error.  We need to
247562306a36Sopenharmony_ci			 * narrow down and record precise write
247662306a36Sopenharmony_ci			 * errors.
247762306a36Sopenharmony_ci			 */
247862306a36Sopenharmony_ci			fail = true;
247962306a36Sopenharmony_ci			if (!narrow_write_error(r1_bio, m)) {
248062306a36Sopenharmony_ci				md_error(conf->mddev,
248162306a36Sopenharmony_ci					 conf->mirrors[m].rdev);
248262306a36Sopenharmony_ci				/* an I/O failed, we can't clear the bitmap */
248362306a36Sopenharmony_ci				set_bit(R1BIO_Degraded, &r1_bio->state);
248462306a36Sopenharmony_ci			}
248562306a36Sopenharmony_ci			rdev_dec_pending(conf->mirrors[m].rdev,
248662306a36Sopenharmony_ci					 conf->mddev);
248762306a36Sopenharmony_ci		}
248862306a36Sopenharmony_ci	if (fail) {
248962306a36Sopenharmony_ci		spin_lock_irq(&conf->device_lock);
249062306a36Sopenharmony_ci		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
249162306a36Sopenharmony_ci		idx = sector_to_idx(r1_bio->sector);
249262306a36Sopenharmony_ci		atomic_inc(&conf->nr_queued[idx]);
249362306a36Sopenharmony_ci		spin_unlock_irq(&conf->device_lock);
249462306a36Sopenharmony_ci		/*
249562306a36Sopenharmony_ci		 * In case freeze_array() is waiting for condition
249662306a36Sopenharmony_ci		 * get_unqueued_pending() == extra to be true.
249762306a36Sopenharmony_ci		 */
249862306a36Sopenharmony_ci		wake_up(&conf->wait_barrier);
249962306a36Sopenharmony_ci		md_wakeup_thread(conf->mddev->thread);
250062306a36Sopenharmony_ci	} else {
250162306a36Sopenharmony_ci		if (test_bit(R1BIO_WriteError, &r1_bio->state))
250262306a36Sopenharmony_ci			close_write(r1_bio);
250362306a36Sopenharmony_ci		raid_end_bio_io(r1_bio);
250462306a36Sopenharmony_ci	}
250562306a36Sopenharmony_ci}
250662306a36Sopenharmony_ci
250762306a36Sopenharmony_cistatic void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
250862306a36Sopenharmony_ci{
250962306a36Sopenharmony_ci	struct mddev *mddev = conf->mddev;
251062306a36Sopenharmony_ci	struct bio *bio;
251162306a36Sopenharmony_ci	struct md_rdev *rdev;
251262306a36Sopenharmony_ci	sector_t sector;
251362306a36Sopenharmony_ci
251462306a36Sopenharmony_ci	clear_bit(R1BIO_ReadError, &r1_bio->state);
251562306a36Sopenharmony_ci	/* we got a read error. Maybe the drive is bad.  Maybe just
251662306a36Sopenharmony_ci	 * the block and we can fix it.
251762306a36Sopenharmony_ci	 * We freeze all other IO, and try reading the block from
251862306a36Sopenharmony_ci	 * other devices.  When we find one, we re-write
251962306a36Sopenharmony_ci	 * and check it that fixes the read error.
252062306a36Sopenharmony_ci	 * This is all done synchronously while the array is
252162306a36Sopenharmony_ci	 * frozen
252262306a36Sopenharmony_ci	 */
252362306a36Sopenharmony_ci
252462306a36Sopenharmony_ci	bio = r1_bio->bios[r1_bio->read_disk];
252562306a36Sopenharmony_ci	bio_put(bio);
252662306a36Sopenharmony_ci	r1_bio->bios[r1_bio->read_disk] = NULL;
252762306a36Sopenharmony_ci
252862306a36Sopenharmony_ci	rdev = conf->mirrors[r1_bio->read_disk].rdev;
252962306a36Sopenharmony_ci	if (mddev->ro == 0
253062306a36Sopenharmony_ci	    && !test_bit(FailFast, &rdev->flags)) {
253162306a36Sopenharmony_ci		freeze_array(conf, 1);
253262306a36Sopenharmony_ci		fix_read_error(conf, r1_bio->read_disk,
253362306a36Sopenharmony_ci			       r1_bio->sector, r1_bio->sectors);
253462306a36Sopenharmony_ci		unfreeze_array(conf);
253562306a36Sopenharmony_ci	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
253662306a36Sopenharmony_ci		md_error(mddev, rdev);
253762306a36Sopenharmony_ci	} else {
253862306a36Sopenharmony_ci		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
253962306a36Sopenharmony_ci	}
254062306a36Sopenharmony_ci
254162306a36Sopenharmony_ci	rdev_dec_pending(rdev, conf->mddev);
254262306a36Sopenharmony_ci	sector = r1_bio->sector;
254362306a36Sopenharmony_ci	bio = r1_bio->master_bio;
254462306a36Sopenharmony_ci
254562306a36Sopenharmony_ci	/* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
254662306a36Sopenharmony_ci	r1_bio->state = 0;
254762306a36Sopenharmony_ci	raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
254862306a36Sopenharmony_ci	allow_barrier(conf, sector);
254962306a36Sopenharmony_ci}
255062306a36Sopenharmony_ci
255162306a36Sopenharmony_cistatic void raid1d(struct md_thread *thread)
255262306a36Sopenharmony_ci{
255362306a36Sopenharmony_ci	struct mddev *mddev = thread->mddev;
255462306a36Sopenharmony_ci	struct r1bio *r1_bio;
255562306a36Sopenharmony_ci	unsigned long flags;
255662306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
255762306a36Sopenharmony_ci	struct list_head *head = &conf->retry_list;
255862306a36Sopenharmony_ci	struct blk_plug plug;
255962306a36Sopenharmony_ci	int idx;
256062306a36Sopenharmony_ci
256162306a36Sopenharmony_ci	md_check_recovery(mddev);
256262306a36Sopenharmony_ci
256362306a36Sopenharmony_ci	if (!list_empty_careful(&conf->bio_end_io_list) &&
256462306a36Sopenharmony_ci	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
256562306a36Sopenharmony_ci		LIST_HEAD(tmp);
256662306a36Sopenharmony_ci		spin_lock_irqsave(&conf->device_lock, flags);
256762306a36Sopenharmony_ci		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
256862306a36Sopenharmony_ci			list_splice_init(&conf->bio_end_io_list, &tmp);
256962306a36Sopenharmony_ci		spin_unlock_irqrestore(&conf->device_lock, flags);
257062306a36Sopenharmony_ci		while (!list_empty(&tmp)) {
257162306a36Sopenharmony_ci			r1_bio = list_first_entry(&tmp, struct r1bio,
257262306a36Sopenharmony_ci						  retry_list);
257362306a36Sopenharmony_ci			list_del(&r1_bio->retry_list);
257462306a36Sopenharmony_ci			idx = sector_to_idx(r1_bio->sector);
257562306a36Sopenharmony_ci			atomic_dec(&conf->nr_queued[idx]);
257662306a36Sopenharmony_ci			if (mddev->degraded)
257762306a36Sopenharmony_ci				set_bit(R1BIO_Degraded, &r1_bio->state);
257862306a36Sopenharmony_ci			if (test_bit(R1BIO_WriteError, &r1_bio->state))
257962306a36Sopenharmony_ci				close_write(r1_bio);
258062306a36Sopenharmony_ci			raid_end_bio_io(r1_bio);
258162306a36Sopenharmony_ci		}
258262306a36Sopenharmony_ci	}
258362306a36Sopenharmony_ci
258462306a36Sopenharmony_ci	blk_start_plug(&plug);
258562306a36Sopenharmony_ci	for (;;) {
258662306a36Sopenharmony_ci
258762306a36Sopenharmony_ci		flush_pending_writes(conf);
258862306a36Sopenharmony_ci
258962306a36Sopenharmony_ci		spin_lock_irqsave(&conf->device_lock, flags);
259062306a36Sopenharmony_ci		if (list_empty(head)) {
259162306a36Sopenharmony_ci			spin_unlock_irqrestore(&conf->device_lock, flags);
259262306a36Sopenharmony_ci			break;
259362306a36Sopenharmony_ci		}
259462306a36Sopenharmony_ci		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
259562306a36Sopenharmony_ci		list_del(head->prev);
259662306a36Sopenharmony_ci		idx = sector_to_idx(r1_bio->sector);
259762306a36Sopenharmony_ci		atomic_dec(&conf->nr_queued[idx]);
259862306a36Sopenharmony_ci		spin_unlock_irqrestore(&conf->device_lock, flags);
259962306a36Sopenharmony_ci
260062306a36Sopenharmony_ci		mddev = r1_bio->mddev;
260162306a36Sopenharmony_ci		conf = mddev->private;
260262306a36Sopenharmony_ci		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
260362306a36Sopenharmony_ci			if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
260462306a36Sopenharmony_ci			    test_bit(R1BIO_WriteError, &r1_bio->state))
260562306a36Sopenharmony_ci				handle_sync_write_finished(conf, r1_bio);
260662306a36Sopenharmony_ci			else
260762306a36Sopenharmony_ci				sync_request_write(mddev, r1_bio);
260862306a36Sopenharmony_ci		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
260962306a36Sopenharmony_ci			   test_bit(R1BIO_WriteError, &r1_bio->state))
261062306a36Sopenharmony_ci			handle_write_finished(conf, r1_bio);
261162306a36Sopenharmony_ci		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
261262306a36Sopenharmony_ci			handle_read_error(conf, r1_bio);
261362306a36Sopenharmony_ci		else
261462306a36Sopenharmony_ci			WARN_ON_ONCE(1);
261562306a36Sopenharmony_ci
261662306a36Sopenharmony_ci		cond_resched();
261762306a36Sopenharmony_ci		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
261862306a36Sopenharmony_ci			md_check_recovery(mddev);
261962306a36Sopenharmony_ci	}
262062306a36Sopenharmony_ci	blk_finish_plug(&plug);
262162306a36Sopenharmony_ci}
262262306a36Sopenharmony_ci
262362306a36Sopenharmony_cistatic int init_resync(struct r1conf *conf)
262462306a36Sopenharmony_ci{
262562306a36Sopenharmony_ci	int buffs;
262662306a36Sopenharmony_ci
262762306a36Sopenharmony_ci	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
262862306a36Sopenharmony_ci	BUG_ON(mempool_initialized(&conf->r1buf_pool));
262962306a36Sopenharmony_ci
263062306a36Sopenharmony_ci	return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
263162306a36Sopenharmony_ci			    r1buf_pool_free, conf->poolinfo);
263262306a36Sopenharmony_ci}
263362306a36Sopenharmony_ci
263462306a36Sopenharmony_cistatic struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
263562306a36Sopenharmony_ci{
263662306a36Sopenharmony_ci	struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
263762306a36Sopenharmony_ci	struct resync_pages *rps;
263862306a36Sopenharmony_ci	struct bio *bio;
263962306a36Sopenharmony_ci	int i;
264062306a36Sopenharmony_ci
264162306a36Sopenharmony_ci	for (i = conf->poolinfo->raid_disks; i--; ) {
264262306a36Sopenharmony_ci		bio = r1bio->bios[i];
264362306a36Sopenharmony_ci		rps = bio->bi_private;
264462306a36Sopenharmony_ci		bio_reset(bio, NULL, 0);
264562306a36Sopenharmony_ci		bio->bi_private = rps;
264662306a36Sopenharmony_ci	}
264762306a36Sopenharmony_ci	r1bio->master_bio = NULL;
264862306a36Sopenharmony_ci	return r1bio;
264962306a36Sopenharmony_ci}
265062306a36Sopenharmony_ci
265162306a36Sopenharmony_ci/*
265262306a36Sopenharmony_ci * perform a "sync" on one "block"
265362306a36Sopenharmony_ci *
265462306a36Sopenharmony_ci * We need to make sure that no normal I/O request - particularly write
265562306a36Sopenharmony_ci * requests - conflict with active sync requests.
265662306a36Sopenharmony_ci *
265762306a36Sopenharmony_ci * This is achieved by tracking pending requests and a 'barrier' concept
265862306a36Sopenharmony_ci * that can be installed to exclude normal IO requests.
265962306a36Sopenharmony_ci */
266062306a36Sopenharmony_ci
266162306a36Sopenharmony_cistatic sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
266262306a36Sopenharmony_ci				   int *skipped)
266362306a36Sopenharmony_ci{
266462306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
266562306a36Sopenharmony_ci	struct r1bio *r1_bio;
266662306a36Sopenharmony_ci	struct bio *bio;
266762306a36Sopenharmony_ci	sector_t max_sector, nr_sectors;
266862306a36Sopenharmony_ci	int disk = -1;
266962306a36Sopenharmony_ci	int i;
267062306a36Sopenharmony_ci	int wonly = -1;
267162306a36Sopenharmony_ci	int write_targets = 0, read_targets = 0;
267262306a36Sopenharmony_ci	sector_t sync_blocks;
267362306a36Sopenharmony_ci	int still_degraded = 0;
267462306a36Sopenharmony_ci	int good_sectors = RESYNC_SECTORS;
267562306a36Sopenharmony_ci	int min_bad = 0; /* number of sectors that are bad in all devices */
267662306a36Sopenharmony_ci	int idx = sector_to_idx(sector_nr);
267762306a36Sopenharmony_ci	int page_idx = 0;
267862306a36Sopenharmony_ci
267962306a36Sopenharmony_ci	if (!mempool_initialized(&conf->r1buf_pool))
268062306a36Sopenharmony_ci		if (init_resync(conf))
268162306a36Sopenharmony_ci			return 0;
268262306a36Sopenharmony_ci
268362306a36Sopenharmony_ci	max_sector = mddev->dev_sectors;
268462306a36Sopenharmony_ci	if (sector_nr >= max_sector) {
268562306a36Sopenharmony_ci		/* If we aborted, we need to abort the
268662306a36Sopenharmony_ci		 * sync on the 'current' bitmap chunk (there will
268762306a36Sopenharmony_ci		 * only be one in raid1 resync.
268862306a36Sopenharmony_ci		 * We can find the current addess in mddev->curr_resync
268962306a36Sopenharmony_ci		 */
269062306a36Sopenharmony_ci		if (mddev->curr_resync < max_sector) /* aborted */
269162306a36Sopenharmony_ci			md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
269262306a36Sopenharmony_ci					   &sync_blocks, 1);
269362306a36Sopenharmony_ci		else /* completed sync */
269462306a36Sopenharmony_ci			conf->fullsync = 0;
269562306a36Sopenharmony_ci
269662306a36Sopenharmony_ci		md_bitmap_close_sync(mddev->bitmap);
269762306a36Sopenharmony_ci		close_sync(conf);
269862306a36Sopenharmony_ci
269962306a36Sopenharmony_ci		if (mddev_is_clustered(mddev)) {
270062306a36Sopenharmony_ci			conf->cluster_sync_low = 0;
270162306a36Sopenharmony_ci			conf->cluster_sync_high = 0;
270262306a36Sopenharmony_ci		}
270362306a36Sopenharmony_ci		return 0;
270462306a36Sopenharmony_ci	}
270562306a36Sopenharmony_ci
270662306a36Sopenharmony_ci	if (mddev->bitmap == NULL &&
270762306a36Sopenharmony_ci	    mddev->recovery_cp == MaxSector &&
270862306a36Sopenharmony_ci	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
270962306a36Sopenharmony_ci	    conf->fullsync == 0) {
271062306a36Sopenharmony_ci		*skipped = 1;
271162306a36Sopenharmony_ci		return max_sector - sector_nr;
271262306a36Sopenharmony_ci	}
271362306a36Sopenharmony_ci	/* before building a request, check if we can skip these blocks..
271462306a36Sopenharmony_ci	 * This call the bitmap_start_sync doesn't actually record anything
271562306a36Sopenharmony_ci	 */
271662306a36Sopenharmony_ci	if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
271762306a36Sopenharmony_ci	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
271862306a36Sopenharmony_ci		/* We can skip this block, and probably several more */
271962306a36Sopenharmony_ci		*skipped = 1;
272062306a36Sopenharmony_ci		return sync_blocks;
272162306a36Sopenharmony_ci	}
272262306a36Sopenharmony_ci
272362306a36Sopenharmony_ci	/*
272462306a36Sopenharmony_ci	 * If there is non-resync activity waiting for a turn, then let it
272562306a36Sopenharmony_ci	 * though before starting on this new sync request.
272662306a36Sopenharmony_ci	 */
272762306a36Sopenharmony_ci	if (atomic_read(&conf->nr_waiting[idx]))
272862306a36Sopenharmony_ci		schedule_timeout_uninterruptible(1);
272962306a36Sopenharmony_ci
273062306a36Sopenharmony_ci	/* we are incrementing sector_nr below. To be safe, we check against
273162306a36Sopenharmony_ci	 * sector_nr + two times RESYNC_SECTORS
273262306a36Sopenharmony_ci	 */
273362306a36Sopenharmony_ci
273462306a36Sopenharmony_ci	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
273562306a36Sopenharmony_ci		mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
273662306a36Sopenharmony_ci
273762306a36Sopenharmony_ci
273862306a36Sopenharmony_ci	if (raise_barrier(conf, sector_nr))
273962306a36Sopenharmony_ci		return 0;
274062306a36Sopenharmony_ci
274162306a36Sopenharmony_ci	r1_bio = raid1_alloc_init_r1buf(conf);
274262306a36Sopenharmony_ci
274362306a36Sopenharmony_ci	rcu_read_lock();
274462306a36Sopenharmony_ci	/*
274562306a36Sopenharmony_ci	 * If we get a correctably read error during resync or recovery,
274662306a36Sopenharmony_ci	 * we might want to read from a different device.  So we
274762306a36Sopenharmony_ci	 * flag all drives that could conceivably be read from for READ,
274862306a36Sopenharmony_ci	 * and any others (which will be non-In_sync devices) for WRITE.
274962306a36Sopenharmony_ci	 * If a read fails, we try reading from something else for which READ
275062306a36Sopenharmony_ci	 * is OK.
275162306a36Sopenharmony_ci	 */
275262306a36Sopenharmony_ci
275362306a36Sopenharmony_ci	r1_bio->mddev = mddev;
275462306a36Sopenharmony_ci	r1_bio->sector = sector_nr;
275562306a36Sopenharmony_ci	r1_bio->state = 0;
275662306a36Sopenharmony_ci	set_bit(R1BIO_IsSync, &r1_bio->state);
275762306a36Sopenharmony_ci	/* make sure good_sectors won't go across barrier unit boundary */
275862306a36Sopenharmony_ci	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
275962306a36Sopenharmony_ci
276062306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks * 2; i++) {
276162306a36Sopenharmony_ci		struct md_rdev *rdev;
276262306a36Sopenharmony_ci		bio = r1_bio->bios[i];
276362306a36Sopenharmony_ci
276462306a36Sopenharmony_ci		rdev = rcu_dereference(conf->mirrors[i].rdev);
276562306a36Sopenharmony_ci		if (rdev == NULL ||
276662306a36Sopenharmony_ci		    test_bit(Faulty, &rdev->flags)) {
276762306a36Sopenharmony_ci			if (i < conf->raid_disks)
276862306a36Sopenharmony_ci				still_degraded = 1;
276962306a36Sopenharmony_ci		} else if (!test_bit(In_sync, &rdev->flags)) {
277062306a36Sopenharmony_ci			bio->bi_opf = REQ_OP_WRITE;
277162306a36Sopenharmony_ci			bio->bi_end_io = end_sync_write;
277262306a36Sopenharmony_ci			write_targets ++;
277362306a36Sopenharmony_ci		} else {
277462306a36Sopenharmony_ci			/* may need to read from here */
277562306a36Sopenharmony_ci			sector_t first_bad = MaxSector;
277662306a36Sopenharmony_ci			int bad_sectors;
277762306a36Sopenharmony_ci
277862306a36Sopenharmony_ci			if (is_badblock(rdev, sector_nr, good_sectors,
277962306a36Sopenharmony_ci					&first_bad, &bad_sectors)) {
278062306a36Sopenharmony_ci				if (first_bad > sector_nr)
278162306a36Sopenharmony_ci					good_sectors = first_bad - sector_nr;
278262306a36Sopenharmony_ci				else {
278362306a36Sopenharmony_ci					bad_sectors -= (sector_nr - first_bad);
278462306a36Sopenharmony_ci					if (min_bad == 0 ||
278562306a36Sopenharmony_ci					    min_bad > bad_sectors)
278662306a36Sopenharmony_ci						min_bad = bad_sectors;
278762306a36Sopenharmony_ci				}
278862306a36Sopenharmony_ci			}
278962306a36Sopenharmony_ci			if (sector_nr < first_bad) {
279062306a36Sopenharmony_ci				if (test_bit(WriteMostly, &rdev->flags)) {
279162306a36Sopenharmony_ci					if (wonly < 0)
279262306a36Sopenharmony_ci						wonly = i;
279362306a36Sopenharmony_ci				} else {
279462306a36Sopenharmony_ci					if (disk < 0)
279562306a36Sopenharmony_ci						disk = i;
279662306a36Sopenharmony_ci				}
279762306a36Sopenharmony_ci				bio->bi_opf = REQ_OP_READ;
279862306a36Sopenharmony_ci				bio->bi_end_io = end_sync_read;
279962306a36Sopenharmony_ci				read_targets++;
280062306a36Sopenharmony_ci			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
280162306a36Sopenharmony_ci				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
280262306a36Sopenharmony_ci				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
280362306a36Sopenharmony_ci				/*
280462306a36Sopenharmony_ci				 * The device is suitable for reading (InSync),
280562306a36Sopenharmony_ci				 * but has bad block(s) here. Let's try to correct them,
280662306a36Sopenharmony_ci				 * if we are doing resync or repair. Otherwise, leave
280762306a36Sopenharmony_ci				 * this device alone for this sync request.
280862306a36Sopenharmony_ci				 */
280962306a36Sopenharmony_ci				bio->bi_opf = REQ_OP_WRITE;
281062306a36Sopenharmony_ci				bio->bi_end_io = end_sync_write;
281162306a36Sopenharmony_ci				write_targets++;
281262306a36Sopenharmony_ci			}
281362306a36Sopenharmony_ci		}
281462306a36Sopenharmony_ci		if (rdev && bio->bi_end_io) {
281562306a36Sopenharmony_ci			atomic_inc(&rdev->nr_pending);
281662306a36Sopenharmony_ci			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
281762306a36Sopenharmony_ci			bio_set_dev(bio, rdev->bdev);
281862306a36Sopenharmony_ci			if (test_bit(FailFast, &rdev->flags))
281962306a36Sopenharmony_ci				bio->bi_opf |= MD_FAILFAST;
282062306a36Sopenharmony_ci		}
282162306a36Sopenharmony_ci	}
282262306a36Sopenharmony_ci	rcu_read_unlock();
282362306a36Sopenharmony_ci	if (disk < 0)
282462306a36Sopenharmony_ci		disk = wonly;
282562306a36Sopenharmony_ci	r1_bio->read_disk = disk;
282662306a36Sopenharmony_ci
282762306a36Sopenharmony_ci	if (read_targets == 0 && min_bad > 0) {
282862306a36Sopenharmony_ci		/* These sectors are bad on all InSync devices, so we
282962306a36Sopenharmony_ci		 * need to mark them bad on all write targets
283062306a36Sopenharmony_ci		 */
283162306a36Sopenharmony_ci		int ok = 1;
283262306a36Sopenharmony_ci		for (i = 0 ; i < conf->raid_disks * 2 ; i++)
283362306a36Sopenharmony_ci			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
283462306a36Sopenharmony_ci				struct md_rdev *rdev = conf->mirrors[i].rdev;
283562306a36Sopenharmony_ci				ok = rdev_set_badblocks(rdev, sector_nr,
283662306a36Sopenharmony_ci							min_bad, 0
283762306a36Sopenharmony_ci					) && ok;
283862306a36Sopenharmony_ci			}
283962306a36Sopenharmony_ci		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
284062306a36Sopenharmony_ci		*skipped = 1;
284162306a36Sopenharmony_ci		put_buf(r1_bio);
284262306a36Sopenharmony_ci
284362306a36Sopenharmony_ci		if (!ok) {
284462306a36Sopenharmony_ci			/* Cannot record the badblocks, so need to
284562306a36Sopenharmony_ci			 * abort the resync.
284662306a36Sopenharmony_ci			 * If there are multiple read targets, could just
284762306a36Sopenharmony_ci			 * fail the really bad ones ???
284862306a36Sopenharmony_ci			 */
284962306a36Sopenharmony_ci			conf->recovery_disabled = mddev->recovery_disabled;
285062306a36Sopenharmony_ci			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
285162306a36Sopenharmony_ci			return 0;
285262306a36Sopenharmony_ci		} else
285362306a36Sopenharmony_ci			return min_bad;
285462306a36Sopenharmony_ci
285562306a36Sopenharmony_ci	}
285662306a36Sopenharmony_ci	if (min_bad > 0 && min_bad < good_sectors) {
285762306a36Sopenharmony_ci		/* only resync enough to reach the next bad->good
285862306a36Sopenharmony_ci		 * transition */
285962306a36Sopenharmony_ci		good_sectors = min_bad;
286062306a36Sopenharmony_ci	}
286162306a36Sopenharmony_ci
286262306a36Sopenharmony_ci	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
286362306a36Sopenharmony_ci		/* extra read targets are also write targets */
286462306a36Sopenharmony_ci		write_targets += read_targets-1;
286562306a36Sopenharmony_ci
286662306a36Sopenharmony_ci	if (write_targets == 0 || read_targets == 0) {
286762306a36Sopenharmony_ci		/* There is nowhere to write, so all non-sync
286862306a36Sopenharmony_ci		 * drives must be failed - so we are finished
286962306a36Sopenharmony_ci		 */
287062306a36Sopenharmony_ci		sector_t rv;
287162306a36Sopenharmony_ci		if (min_bad > 0)
287262306a36Sopenharmony_ci			max_sector = sector_nr + min_bad;
287362306a36Sopenharmony_ci		rv = max_sector - sector_nr;
287462306a36Sopenharmony_ci		*skipped = 1;
287562306a36Sopenharmony_ci		put_buf(r1_bio);
287662306a36Sopenharmony_ci		return rv;
287762306a36Sopenharmony_ci	}
287862306a36Sopenharmony_ci
287962306a36Sopenharmony_ci	if (max_sector > mddev->resync_max)
288062306a36Sopenharmony_ci		max_sector = mddev->resync_max; /* Don't do IO beyond here */
288162306a36Sopenharmony_ci	if (max_sector > sector_nr + good_sectors)
288262306a36Sopenharmony_ci		max_sector = sector_nr + good_sectors;
288362306a36Sopenharmony_ci	nr_sectors = 0;
288462306a36Sopenharmony_ci	sync_blocks = 0;
288562306a36Sopenharmony_ci	do {
288662306a36Sopenharmony_ci		struct page *page;
288762306a36Sopenharmony_ci		int len = PAGE_SIZE;
288862306a36Sopenharmony_ci		if (sector_nr + (len>>9) > max_sector)
288962306a36Sopenharmony_ci			len = (max_sector - sector_nr) << 9;
289062306a36Sopenharmony_ci		if (len == 0)
289162306a36Sopenharmony_ci			break;
289262306a36Sopenharmony_ci		if (sync_blocks == 0) {
289362306a36Sopenharmony_ci			if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
289462306a36Sopenharmony_ci						  &sync_blocks, still_degraded) &&
289562306a36Sopenharmony_ci			    !conf->fullsync &&
289662306a36Sopenharmony_ci			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
289762306a36Sopenharmony_ci				break;
289862306a36Sopenharmony_ci			if ((len >> 9) > sync_blocks)
289962306a36Sopenharmony_ci				len = sync_blocks<<9;
290062306a36Sopenharmony_ci		}
290162306a36Sopenharmony_ci
290262306a36Sopenharmony_ci		for (i = 0 ; i < conf->raid_disks * 2; i++) {
290362306a36Sopenharmony_ci			struct resync_pages *rp;
290462306a36Sopenharmony_ci
290562306a36Sopenharmony_ci			bio = r1_bio->bios[i];
290662306a36Sopenharmony_ci			rp = get_resync_pages(bio);
290762306a36Sopenharmony_ci			if (bio->bi_end_io) {
290862306a36Sopenharmony_ci				page = resync_fetch_page(rp, page_idx);
290962306a36Sopenharmony_ci
291062306a36Sopenharmony_ci				/*
291162306a36Sopenharmony_ci				 * won't fail because the vec table is big
291262306a36Sopenharmony_ci				 * enough to hold all these pages
291362306a36Sopenharmony_ci				 */
291462306a36Sopenharmony_ci				__bio_add_page(bio, page, len, 0);
291562306a36Sopenharmony_ci			}
291662306a36Sopenharmony_ci		}
291762306a36Sopenharmony_ci		nr_sectors += len>>9;
291862306a36Sopenharmony_ci		sector_nr += len>>9;
291962306a36Sopenharmony_ci		sync_blocks -= (len>>9);
292062306a36Sopenharmony_ci	} while (++page_idx < RESYNC_PAGES);
292162306a36Sopenharmony_ci
292262306a36Sopenharmony_ci	r1_bio->sectors = nr_sectors;
292362306a36Sopenharmony_ci
292462306a36Sopenharmony_ci	if (mddev_is_clustered(mddev) &&
292562306a36Sopenharmony_ci			conf->cluster_sync_high < sector_nr + nr_sectors) {
292662306a36Sopenharmony_ci		conf->cluster_sync_low = mddev->curr_resync_completed;
292762306a36Sopenharmony_ci		conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
292862306a36Sopenharmony_ci		/* Send resync message */
292962306a36Sopenharmony_ci		md_cluster_ops->resync_info_update(mddev,
293062306a36Sopenharmony_ci				conf->cluster_sync_low,
293162306a36Sopenharmony_ci				conf->cluster_sync_high);
293262306a36Sopenharmony_ci	}
293362306a36Sopenharmony_ci
293462306a36Sopenharmony_ci	/* For a user-requested sync, we read all readable devices and do a
293562306a36Sopenharmony_ci	 * compare
293662306a36Sopenharmony_ci	 */
293762306a36Sopenharmony_ci	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
293862306a36Sopenharmony_ci		atomic_set(&r1_bio->remaining, read_targets);
293962306a36Sopenharmony_ci		for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
294062306a36Sopenharmony_ci			bio = r1_bio->bios[i];
294162306a36Sopenharmony_ci			if (bio->bi_end_io == end_sync_read) {
294262306a36Sopenharmony_ci				read_targets--;
294362306a36Sopenharmony_ci				md_sync_acct_bio(bio, nr_sectors);
294462306a36Sopenharmony_ci				if (read_targets == 1)
294562306a36Sopenharmony_ci					bio->bi_opf &= ~MD_FAILFAST;
294662306a36Sopenharmony_ci				submit_bio_noacct(bio);
294762306a36Sopenharmony_ci			}
294862306a36Sopenharmony_ci		}
294962306a36Sopenharmony_ci	} else {
295062306a36Sopenharmony_ci		atomic_set(&r1_bio->remaining, 1);
295162306a36Sopenharmony_ci		bio = r1_bio->bios[r1_bio->read_disk];
295262306a36Sopenharmony_ci		md_sync_acct_bio(bio, nr_sectors);
295362306a36Sopenharmony_ci		if (read_targets == 1)
295462306a36Sopenharmony_ci			bio->bi_opf &= ~MD_FAILFAST;
295562306a36Sopenharmony_ci		submit_bio_noacct(bio);
295662306a36Sopenharmony_ci	}
295762306a36Sopenharmony_ci	return nr_sectors;
295862306a36Sopenharmony_ci}
295962306a36Sopenharmony_ci
296062306a36Sopenharmony_cistatic sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
296162306a36Sopenharmony_ci{
296262306a36Sopenharmony_ci	if (sectors)
296362306a36Sopenharmony_ci		return sectors;
296462306a36Sopenharmony_ci
296562306a36Sopenharmony_ci	return mddev->dev_sectors;
296662306a36Sopenharmony_ci}
296762306a36Sopenharmony_ci
296862306a36Sopenharmony_cistatic struct r1conf *setup_conf(struct mddev *mddev)
296962306a36Sopenharmony_ci{
297062306a36Sopenharmony_ci	struct r1conf *conf;
297162306a36Sopenharmony_ci	int i;
297262306a36Sopenharmony_ci	struct raid1_info *disk;
297362306a36Sopenharmony_ci	struct md_rdev *rdev;
297462306a36Sopenharmony_ci	int err = -ENOMEM;
297562306a36Sopenharmony_ci
297662306a36Sopenharmony_ci	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
297762306a36Sopenharmony_ci	if (!conf)
297862306a36Sopenharmony_ci		goto abort;
297962306a36Sopenharmony_ci
298062306a36Sopenharmony_ci	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
298162306a36Sopenharmony_ci				   sizeof(atomic_t), GFP_KERNEL);
298262306a36Sopenharmony_ci	if (!conf->nr_pending)
298362306a36Sopenharmony_ci		goto abort;
298462306a36Sopenharmony_ci
298562306a36Sopenharmony_ci	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
298662306a36Sopenharmony_ci				   sizeof(atomic_t), GFP_KERNEL);
298762306a36Sopenharmony_ci	if (!conf->nr_waiting)
298862306a36Sopenharmony_ci		goto abort;
298962306a36Sopenharmony_ci
299062306a36Sopenharmony_ci	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
299162306a36Sopenharmony_ci				  sizeof(atomic_t), GFP_KERNEL);
299262306a36Sopenharmony_ci	if (!conf->nr_queued)
299362306a36Sopenharmony_ci		goto abort;
299462306a36Sopenharmony_ci
299562306a36Sopenharmony_ci	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
299662306a36Sopenharmony_ci				sizeof(atomic_t), GFP_KERNEL);
299762306a36Sopenharmony_ci	if (!conf->barrier)
299862306a36Sopenharmony_ci		goto abort;
299962306a36Sopenharmony_ci
300062306a36Sopenharmony_ci	conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
300162306a36Sopenharmony_ci					    mddev->raid_disks, 2),
300262306a36Sopenharmony_ci				GFP_KERNEL);
300362306a36Sopenharmony_ci	if (!conf->mirrors)
300462306a36Sopenharmony_ci		goto abort;
300562306a36Sopenharmony_ci
300662306a36Sopenharmony_ci	conf->tmppage = alloc_page(GFP_KERNEL);
300762306a36Sopenharmony_ci	if (!conf->tmppage)
300862306a36Sopenharmony_ci		goto abort;
300962306a36Sopenharmony_ci
301062306a36Sopenharmony_ci	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
301162306a36Sopenharmony_ci	if (!conf->poolinfo)
301262306a36Sopenharmony_ci		goto abort;
301362306a36Sopenharmony_ci	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
301462306a36Sopenharmony_ci	err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
301562306a36Sopenharmony_ci			   rbio_pool_free, conf->poolinfo);
301662306a36Sopenharmony_ci	if (err)
301762306a36Sopenharmony_ci		goto abort;
301862306a36Sopenharmony_ci
301962306a36Sopenharmony_ci	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
302062306a36Sopenharmony_ci	if (err)
302162306a36Sopenharmony_ci		goto abort;
302262306a36Sopenharmony_ci
302362306a36Sopenharmony_ci	conf->poolinfo->mddev = mddev;
302462306a36Sopenharmony_ci
302562306a36Sopenharmony_ci	err = -EINVAL;
302662306a36Sopenharmony_ci	spin_lock_init(&conf->device_lock);
302762306a36Sopenharmony_ci	rdev_for_each(rdev, mddev) {
302862306a36Sopenharmony_ci		int disk_idx = rdev->raid_disk;
302962306a36Sopenharmony_ci		if (disk_idx >= mddev->raid_disks
303062306a36Sopenharmony_ci		    || disk_idx < 0)
303162306a36Sopenharmony_ci			continue;
303262306a36Sopenharmony_ci		if (test_bit(Replacement, &rdev->flags))
303362306a36Sopenharmony_ci			disk = conf->mirrors + mddev->raid_disks + disk_idx;
303462306a36Sopenharmony_ci		else
303562306a36Sopenharmony_ci			disk = conf->mirrors + disk_idx;
303662306a36Sopenharmony_ci
303762306a36Sopenharmony_ci		if (disk->rdev)
303862306a36Sopenharmony_ci			goto abort;
303962306a36Sopenharmony_ci		disk->rdev = rdev;
304062306a36Sopenharmony_ci		disk->head_position = 0;
304162306a36Sopenharmony_ci		disk->seq_start = MaxSector;
304262306a36Sopenharmony_ci	}
304362306a36Sopenharmony_ci	conf->raid_disks = mddev->raid_disks;
304462306a36Sopenharmony_ci	conf->mddev = mddev;
304562306a36Sopenharmony_ci	INIT_LIST_HEAD(&conf->retry_list);
304662306a36Sopenharmony_ci	INIT_LIST_HEAD(&conf->bio_end_io_list);
304762306a36Sopenharmony_ci
304862306a36Sopenharmony_ci	spin_lock_init(&conf->resync_lock);
304962306a36Sopenharmony_ci	init_waitqueue_head(&conf->wait_barrier);
305062306a36Sopenharmony_ci
305162306a36Sopenharmony_ci	bio_list_init(&conf->pending_bio_list);
305262306a36Sopenharmony_ci	conf->recovery_disabled = mddev->recovery_disabled - 1;
305362306a36Sopenharmony_ci
305462306a36Sopenharmony_ci	err = -EIO;
305562306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks * 2; i++) {
305662306a36Sopenharmony_ci
305762306a36Sopenharmony_ci		disk = conf->mirrors + i;
305862306a36Sopenharmony_ci
305962306a36Sopenharmony_ci		if (i < conf->raid_disks &&
306062306a36Sopenharmony_ci		    disk[conf->raid_disks].rdev) {
306162306a36Sopenharmony_ci			/* This slot has a replacement. */
306262306a36Sopenharmony_ci			if (!disk->rdev) {
306362306a36Sopenharmony_ci				/* No original, just make the replacement
306462306a36Sopenharmony_ci				 * a recovering spare
306562306a36Sopenharmony_ci				 */
306662306a36Sopenharmony_ci				disk->rdev =
306762306a36Sopenharmony_ci					disk[conf->raid_disks].rdev;
306862306a36Sopenharmony_ci				disk[conf->raid_disks].rdev = NULL;
306962306a36Sopenharmony_ci			} else if (!test_bit(In_sync, &disk->rdev->flags))
307062306a36Sopenharmony_ci				/* Original is not in_sync - bad */
307162306a36Sopenharmony_ci				goto abort;
307262306a36Sopenharmony_ci		}
307362306a36Sopenharmony_ci
307462306a36Sopenharmony_ci		if (!disk->rdev ||
307562306a36Sopenharmony_ci		    !test_bit(In_sync, &disk->rdev->flags)) {
307662306a36Sopenharmony_ci			disk->head_position = 0;
307762306a36Sopenharmony_ci			if (disk->rdev &&
307862306a36Sopenharmony_ci			    (disk->rdev->saved_raid_disk < 0))
307962306a36Sopenharmony_ci				conf->fullsync = 1;
308062306a36Sopenharmony_ci		}
308162306a36Sopenharmony_ci	}
308262306a36Sopenharmony_ci
308362306a36Sopenharmony_ci	err = -ENOMEM;
308462306a36Sopenharmony_ci	rcu_assign_pointer(conf->thread,
308562306a36Sopenharmony_ci			   md_register_thread(raid1d, mddev, "raid1"));
308662306a36Sopenharmony_ci	if (!conf->thread)
308762306a36Sopenharmony_ci		goto abort;
308862306a36Sopenharmony_ci
308962306a36Sopenharmony_ci	return conf;
309062306a36Sopenharmony_ci
309162306a36Sopenharmony_ci abort:
309262306a36Sopenharmony_ci	if (conf) {
309362306a36Sopenharmony_ci		mempool_exit(&conf->r1bio_pool);
309462306a36Sopenharmony_ci		kfree(conf->mirrors);
309562306a36Sopenharmony_ci		safe_put_page(conf->tmppage);
309662306a36Sopenharmony_ci		kfree(conf->poolinfo);
309762306a36Sopenharmony_ci		kfree(conf->nr_pending);
309862306a36Sopenharmony_ci		kfree(conf->nr_waiting);
309962306a36Sopenharmony_ci		kfree(conf->nr_queued);
310062306a36Sopenharmony_ci		kfree(conf->barrier);
310162306a36Sopenharmony_ci		bioset_exit(&conf->bio_split);
310262306a36Sopenharmony_ci		kfree(conf);
310362306a36Sopenharmony_ci	}
310462306a36Sopenharmony_ci	return ERR_PTR(err);
310562306a36Sopenharmony_ci}
310662306a36Sopenharmony_ci
310762306a36Sopenharmony_cistatic void raid1_free(struct mddev *mddev, void *priv);
310862306a36Sopenharmony_cistatic int raid1_run(struct mddev *mddev)
310962306a36Sopenharmony_ci{
311062306a36Sopenharmony_ci	struct r1conf *conf;
311162306a36Sopenharmony_ci	int i;
311262306a36Sopenharmony_ci	struct md_rdev *rdev;
311362306a36Sopenharmony_ci	int ret;
311462306a36Sopenharmony_ci
311562306a36Sopenharmony_ci	if (mddev->level != 1) {
311662306a36Sopenharmony_ci		pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
311762306a36Sopenharmony_ci			mdname(mddev), mddev->level);
311862306a36Sopenharmony_ci		return -EIO;
311962306a36Sopenharmony_ci	}
312062306a36Sopenharmony_ci	if (mddev->reshape_position != MaxSector) {
312162306a36Sopenharmony_ci		pr_warn("md/raid1:%s: reshape_position set but not supported\n",
312262306a36Sopenharmony_ci			mdname(mddev));
312362306a36Sopenharmony_ci		return -EIO;
312462306a36Sopenharmony_ci	}
312562306a36Sopenharmony_ci	if (mddev_init_writes_pending(mddev) < 0)
312662306a36Sopenharmony_ci		return -ENOMEM;
312762306a36Sopenharmony_ci	/*
312862306a36Sopenharmony_ci	 * copy the already verified devices into our private RAID1
312962306a36Sopenharmony_ci	 * bookkeeping area. [whatever we allocate in run(),
313062306a36Sopenharmony_ci	 * should be freed in raid1_free()]
313162306a36Sopenharmony_ci	 */
313262306a36Sopenharmony_ci	if (mddev->private == NULL)
313362306a36Sopenharmony_ci		conf = setup_conf(mddev);
313462306a36Sopenharmony_ci	else
313562306a36Sopenharmony_ci		conf = mddev->private;
313662306a36Sopenharmony_ci
313762306a36Sopenharmony_ci	if (IS_ERR(conf))
313862306a36Sopenharmony_ci		return PTR_ERR(conf);
313962306a36Sopenharmony_ci
314062306a36Sopenharmony_ci	if (mddev->queue)
314162306a36Sopenharmony_ci		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
314262306a36Sopenharmony_ci
314362306a36Sopenharmony_ci	rdev_for_each(rdev, mddev) {
314462306a36Sopenharmony_ci		if (!mddev->gendisk)
314562306a36Sopenharmony_ci			continue;
314662306a36Sopenharmony_ci		disk_stack_limits(mddev->gendisk, rdev->bdev,
314762306a36Sopenharmony_ci				  rdev->data_offset << 9);
314862306a36Sopenharmony_ci	}
314962306a36Sopenharmony_ci
315062306a36Sopenharmony_ci	mddev->degraded = 0;
315162306a36Sopenharmony_ci	for (i = 0; i < conf->raid_disks; i++)
315262306a36Sopenharmony_ci		if (conf->mirrors[i].rdev == NULL ||
315362306a36Sopenharmony_ci		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
315462306a36Sopenharmony_ci		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
315562306a36Sopenharmony_ci			mddev->degraded++;
315662306a36Sopenharmony_ci	/*
315762306a36Sopenharmony_ci	 * RAID1 needs at least one disk in active
315862306a36Sopenharmony_ci	 */
315962306a36Sopenharmony_ci	if (conf->raid_disks - mddev->degraded < 1) {
316062306a36Sopenharmony_ci		md_unregister_thread(mddev, &conf->thread);
316162306a36Sopenharmony_ci		ret = -EINVAL;
316262306a36Sopenharmony_ci		goto abort;
316362306a36Sopenharmony_ci	}
316462306a36Sopenharmony_ci
316562306a36Sopenharmony_ci	if (conf->raid_disks - mddev->degraded == 1)
316662306a36Sopenharmony_ci		mddev->recovery_cp = MaxSector;
316762306a36Sopenharmony_ci
316862306a36Sopenharmony_ci	if (mddev->recovery_cp != MaxSector)
316962306a36Sopenharmony_ci		pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
317062306a36Sopenharmony_ci			mdname(mddev));
317162306a36Sopenharmony_ci	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
317262306a36Sopenharmony_ci		mdname(mddev), mddev->raid_disks - mddev->degraded,
317362306a36Sopenharmony_ci		mddev->raid_disks);
317462306a36Sopenharmony_ci
317562306a36Sopenharmony_ci	/*
317662306a36Sopenharmony_ci	 * Ok, everything is just fine now
317762306a36Sopenharmony_ci	 */
317862306a36Sopenharmony_ci	rcu_assign_pointer(mddev->thread, conf->thread);
317962306a36Sopenharmony_ci	rcu_assign_pointer(conf->thread, NULL);
318062306a36Sopenharmony_ci	mddev->private = conf;
318162306a36Sopenharmony_ci	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
318262306a36Sopenharmony_ci
318362306a36Sopenharmony_ci	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
318462306a36Sopenharmony_ci
318562306a36Sopenharmony_ci	ret = md_integrity_register(mddev);
318662306a36Sopenharmony_ci	if (ret) {
318762306a36Sopenharmony_ci		md_unregister_thread(mddev, &mddev->thread);
318862306a36Sopenharmony_ci		goto abort;
318962306a36Sopenharmony_ci	}
319062306a36Sopenharmony_ci	return 0;
319162306a36Sopenharmony_ci
319262306a36Sopenharmony_ciabort:
319362306a36Sopenharmony_ci	raid1_free(mddev, conf);
319462306a36Sopenharmony_ci	return ret;
319562306a36Sopenharmony_ci}
319662306a36Sopenharmony_ci
319762306a36Sopenharmony_cistatic void raid1_free(struct mddev *mddev, void *priv)
319862306a36Sopenharmony_ci{
319962306a36Sopenharmony_ci	struct r1conf *conf = priv;
320062306a36Sopenharmony_ci
320162306a36Sopenharmony_ci	mempool_exit(&conf->r1bio_pool);
320262306a36Sopenharmony_ci	kfree(conf->mirrors);
320362306a36Sopenharmony_ci	safe_put_page(conf->tmppage);
320462306a36Sopenharmony_ci	kfree(conf->poolinfo);
320562306a36Sopenharmony_ci	kfree(conf->nr_pending);
320662306a36Sopenharmony_ci	kfree(conf->nr_waiting);
320762306a36Sopenharmony_ci	kfree(conf->nr_queued);
320862306a36Sopenharmony_ci	kfree(conf->barrier);
320962306a36Sopenharmony_ci	bioset_exit(&conf->bio_split);
321062306a36Sopenharmony_ci	kfree(conf);
321162306a36Sopenharmony_ci}
321262306a36Sopenharmony_ci
321362306a36Sopenharmony_cistatic int raid1_resize(struct mddev *mddev, sector_t sectors)
321462306a36Sopenharmony_ci{
321562306a36Sopenharmony_ci	/* no resync is happening, and there is enough space
321662306a36Sopenharmony_ci	 * on all devices, so we can resize.
321762306a36Sopenharmony_ci	 * We need to make sure resync covers any new space.
321862306a36Sopenharmony_ci	 * If the array is shrinking we should possibly wait until
321962306a36Sopenharmony_ci	 * any io in the removed space completes, but it hardly seems
322062306a36Sopenharmony_ci	 * worth it.
322162306a36Sopenharmony_ci	 */
322262306a36Sopenharmony_ci	sector_t newsize = raid1_size(mddev, sectors, 0);
322362306a36Sopenharmony_ci	if (mddev->external_size &&
322462306a36Sopenharmony_ci	    mddev->array_sectors > newsize)
322562306a36Sopenharmony_ci		return -EINVAL;
322662306a36Sopenharmony_ci	if (mddev->bitmap) {
322762306a36Sopenharmony_ci		int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
322862306a36Sopenharmony_ci		if (ret)
322962306a36Sopenharmony_ci			return ret;
323062306a36Sopenharmony_ci	}
323162306a36Sopenharmony_ci	md_set_array_sectors(mddev, newsize);
323262306a36Sopenharmony_ci	if (sectors > mddev->dev_sectors &&
323362306a36Sopenharmony_ci	    mddev->recovery_cp > mddev->dev_sectors) {
323462306a36Sopenharmony_ci		mddev->recovery_cp = mddev->dev_sectors;
323562306a36Sopenharmony_ci		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
323662306a36Sopenharmony_ci	}
323762306a36Sopenharmony_ci	mddev->dev_sectors = sectors;
323862306a36Sopenharmony_ci	mddev->resync_max_sectors = sectors;
323962306a36Sopenharmony_ci	return 0;
324062306a36Sopenharmony_ci}
324162306a36Sopenharmony_ci
324262306a36Sopenharmony_cistatic int raid1_reshape(struct mddev *mddev)
324362306a36Sopenharmony_ci{
324462306a36Sopenharmony_ci	/* We need to:
324562306a36Sopenharmony_ci	 * 1/ resize the r1bio_pool
324662306a36Sopenharmony_ci	 * 2/ resize conf->mirrors
324762306a36Sopenharmony_ci	 *
324862306a36Sopenharmony_ci	 * We allocate a new r1bio_pool if we can.
324962306a36Sopenharmony_ci	 * Then raise a device barrier and wait until all IO stops.
325062306a36Sopenharmony_ci	 * Then resize conf->mirrors and swap in the new r1bio pool.
325162306a36Sopenharmony_ci	 *
325262306a36Sopenharmony_ci	 * At the same time, we "pack" the devices so that all the missing
325362306a36Sopenharmony_ci	 * devices have the higher raid_disk numbers.
325462306a36Sopenharmony_ci	 */
325562306a36Sopenharmony_ci	mempool_t newpool, oldpool;
325662306a36Sopenharmony_ci	struct pool_info *newpoolinfo;
325762306a36Sopenharmony_ci	struct raid1_info *newmirrors;
325862306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
325962306a36Sopenharmony_ci	int cnt, raid_disks;
326062306a36Sopenharmony_ci	unsigned long flags;
326162306a36Sopenharmony_ci	int d, d2;
326262306a36Sopenharmony_ci	int ret;
326362306a36Sopenharmony_ci
326462306a36Sopenharmony_ci	memset(&newpool, 0, sizeof(newpool));
326562306a36Sopenharmony_ci	memset(&oldpool, 0, sizeof(oldpool));
326662306a36Sopenharmony_ci
326762306a36Sopenharmony_ci	/* Cannot change chunk_size, layout, or level */
326862306a36Sopenharmony_ci	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
326962306a36Sopenharmony_ci	    mddev->layout != mddev->new_layout ||
327062306a36Sopenharmony_ci	    mddev->level != mddev->new_level) {
327162306a36Sopenharmony_ci		mddev->new_chunk_sectors = mddev->chunk_sectors;
327262306a36Sopenharmony_ci		mddev->new_layout = mddev->layout;
327362306a36Sopenharmony_ci		mddev->new_level = mddev->level;
327462306a36Sopenharmony_ci		return -EINVAL;
327562306a36Sopenharmony_ci	}
327662306a36Sopenharmony_ci
327762306a36Sopenharmony_ci	if (!mddev_is_clustered(mddev))
327862306a36Sopenharmony_ci		md_allow_write(mddev);
327962306a36Sopenharmony_ci
328062306a36Sopenharmony_ci	raid_disks = mddev->raid_disks + mddev->delta_disks;
328162306a36Sopenharmony_ci
328262306a36Sopenharmony_ci	if (raid_disks < conf->raid_disks) {
328362306a36Sopenharmony_ci		cnt=0;
328462306a36Sopenharmony_ci		for (d= 0; d < conf->raid_disks; d++)
328562306a36Sopenharmony_ci			if (conf->mirrors[d].rdev)
328662306a36Sopenharmony_ci				cnt++;
328762306a36Sopenharmony_ci		if (cnt > raid_disks)
328862306a36Sopenharmony_ci			return -EBUSY;
328962306a36Sopenharmony_ci	}
329062306a36Sopenharmony_ci
329162306a36Sopenharmony_ci	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
329262306a36Sopenharmony_ci	if (!newpoolinfo)
329362306a36Sopenharmony_ci		return -ENOMEM;
329462306a36Sopenharmony_ci	newpoolinfo->mddev = mddev;
329562306a36Sopenharmony_ci	newpoolinfo->raid_disks = raid_disks * 2;
329662306a36Sopenharmony_ci
329762306a36Sopenharmony_ci	ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
329862306a36Sopenharmony_ci			   rbio_pool_free, newpoolinfo);
329962306a36Sopenharmony_ci	if (ret) {
330062306a36Sopenharmony_ci		kfree(newpoolinfo);
330162306a36Sopenharmony_ci		return ret;
330262306a36Sopenharmony_ci	}
330362306a36Sopenharmony_ci	newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
330462306a36Sopenharmony_ci					 raid_disks, 2),
330562306a36Sopenharmony_ci			     GFP_KERNEL);
330662306a36Sopenharmony_ci	if (!newmirrors) {
330762306a36Sopenharmony_ci		kfree(newpoolinfo);
330862306a36Sopenharmony_ci		mempool_exit(&newpool);
330962306a36Sopenharmony_ci		return -ENOMEM;
331062306a36Sopenharmony_ci	}
331162306a36Sopenharmony_ci
331262306a36Sopenharmony_ci	freeze_array(conf, 0);
331362306a36Sopenharmony_ci
331462306a36Sopenharmony_ci	/* ok, everything is stopped */
331562306a36Sopenharmony_ci	oldpool = conf->r1bio_pool;
331662306a36Sopenharmony_ci	conf->r1bio_pool = newpool;
331762306a36Sopenharmony_ci
331862306a36Sopenharmony_ci	for (d = d2 = 0; d < conf->raid_disks; d++) {
331962306a36Sopenharmony_ci		struct md_rdev *rdev = conf->mirrors[d].rdev;
332062306a36Sopenharmony_ci		if (rdev && rdev->raid_disk != d2) {
332162306a36Sopenharmony_ci			sysfs_unlink_rdev(mddev, rdev);
332262306a36Sopenharmony_ci			rdev->raid_disk = d2;
332362306a36Sopenharmony_ci			sysfs_unlink_rdev(mddev, rdev);
332462306a36Sopenharmony_ci			if (sysfs_link_rdev(mddev, rdev))
332562306a36Sopenharmony_ci				pr_warn("md/raid1:%s: cannot register rd%d\n",
332662306a36Sopenharmony_ci					mdname(mddev), rdev->raid_disk);
332762306a36Sopenharmony_ci		}
332862306a36Sopenharmony_ci		if (rdev)
332962306a36Sopenharmony_ci			newmirrors[d2++].rdev = rdev;
333062306a36Sopenharmony_ci	}
333162306a36Sopenharmony_ci	kfree(conf->mirrors);
333262306a36Sopenharmony_ci	conf->mirrors = newmirrors;
333362306a36Sopenharmony_ci	kfree(conf->poolinfo);
333462306a36Sopenharmony_ci	conf->poolinfo = newpoolinfo;
333562306a36Sopenharmony_ci
333662306a36Sopenharmony_ci	spin_lock_irqsave(&conf->device_lock, flags);
333762306a36Sopenharmony_ci	mddev->degraded += (raid_disks - conf->raid_disks);
333862306a36Sopenharmony_ci	spin_unlock_irqrestore(&conf->device_lock, flags);
333962306a36Sopenharmony_ci	conf->raid_disks = mddev->raid_disks = raid_disks;
334062306a36Sopenharmony_ci	mddev->delta_disks = 0;
334162306a36Sopenharmony_ci
334262306a36Sopenharmony_ci	unfreeze_array(conf);
334362306a36Sopenharmony_ci
334462306a36Sopenharmony_ci	set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
334562306a36Sopenharmony_ci	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
334662306a36Sopenharmony_ci	md_wakeup_thread(mddev->thread);
334762306a36Sopenharmony_ci
334862306a36Sopenharmony_ci	mempool_exit(&oldpool);
334962306a36Sopenharmony_ci	return 0;
335062306a36Sopenharmony_ci}
335162306a36Sopenharmony_ci
335262306a36Sopenharmony_cistatic void raid1_quiesce(struct mddev *mddev, int quiesce)
335362306a36Sopenharmony_ci{
335462306a36Sopenharmony_ci	struct r1conf *conf = mddev->private;
335562306a36Sopenharmony_ci
335662306a36Sopenharmony_ci	if (quiesce)
335762306a36Sopenharmony_ci		freeze_array(conf, 0);
335862306a36Sopenharmony_ci	else
335962306a36Sopenharmony_ci		unfreeze_array(conf);
336062306a36Sopenharmony_ci}
336162306a36Sopenharmony_ci
336262306a36Sopenharmony_cistatic void *raid1_takeover(struct mddev *mddev)
336362306a36Sopenharmony_ci{
336462306a36Sopenharmony_ci	/* raid1 can take over:
336562306a36Sopenharmony_ci	 *  raid5 with 2 devices, any layout or chunk size
336662306a36Sopenharmony_ci	 */
336762306a36Sopenharmony_ci	if (mddev->level == 5 && mddev->raid_disks == 2) {
336862306a36Sopenharmony_ci		struct r1conf *conf;
336962306a36Sopenharmony_ci		mddev->new_level = 1;
337062306a36Sopenharmony_ci		mddev->new_layout = 0;
337162306a36Sopenharmony_ci		mddev->new_chunk_sectors = 0;
337262306a36Sopenharmony_ci		conf = setup_conf(mddev);
337362306a36Sopenharmony_ci		if (!IS_ERR(conf)) {
337462306a36Sopenharmony_ci			/* Array must appear to be quiesced */
337562306a36Sopenharmony_ci			conf->array_frozen = 1;
337662306a36Sopenharmony_ci			mddev_clear_unsupported_flags(mddev,
337762306a36Sopenharmony_ci				UNSUPPORTED_MDDEV_FLAGS);
337862306a36Sopenharmony_ci		}
337962306a36Sopenharmony_ci		return conf;
338062306a36Sopenharmony_ci	}
338162306a36Sopenharmony_ci	return ERR_PTR(-EINVAL);
338262306a36Sopenharmony_ci}
338362306a36Sopenharmony_ci
338462306a36Sopenharmony_cistatic struct md_personality raid1_personality =
338562306a36Sopenharmony_ci{
338662306a36Sopenharmony_ci	.name		= "raid1",
338762306a36Sopenharmony_ci	.level		= 1,
338862306a36Sopenharmony_ci	.owner		= THIS_MODULE,
338962306a36Sopenharmony_ci	.make_request	= raid1_make_request,
339062306a36Sopenharmony_ci	.run		= raid1_run,
339162306a36Sopenharmony_ci	.free		= raid1_free,
339262306a36Sopenharmony_ci	.status		= raid1_status,
339362306a36Sopenharmony_ci	.error_handler	= raid1_error,
339462306a36Sopenharmony_ci	.hot_add_disk	= raid1_add_disk,
339562306a36Sopenharmony_ci	.hot_remove_disk= raid1_remove_disk,
339662306a36Sopenharmony_ci	.spare_active	= raid1_spare_active,
339762306a36Sopenharmony_ci	.sync_request	= raid1_sync_request,
339862306a36Sopenharmony_ci	.resize		= raid1_resize,
339962306a36Sopenharmony_ci	.size		= raid1_size,
340062306a36Sopenharmony_ci	.check_reshape	= raid1_reshape,
340162306a36Sopenharmony_ci	.quiesce	= raid1_quiesce,
340262306a36Sopenharmony_ci	.takeover	= raid1_takeover,
340362306a36Sopenharmony_ci};
340462306a36Sopenharmony_ci
340562306a36Sopenharmony_cistatic int __init raid_init(void)
340662306a36Sopenharmony_ci{
340762306a36Sopenharmony_ci	return register_md_personality(&raid1_personality);
340862306a36Sopenharmony_ci}
340962306a36Sopenharmony_ci
341062306a36Sopenharmony_cistatic void raid_exit(void)
341162306a36Sopenharmony_ci{
341262306a36Sopenharmony_ci	unregister_md_personality(&raid1_personality);
341362306a36Sopenharmony_ci}
341462306a36Sopenharmony_ci
341562306a36Sopenharmony_cimodule_init(raid_init);
341662306a36Sopenharmony_cimodule_exit(raid_exit);
341762306a36Sopenharmony_ciMODULE_LICENSE("GPL");
341862306a36Sopenharmony_ciMODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
341962306a36Sopenharmony_ciMODULE_ALIAS("md-personality-3"); /* RAID1 */
342062306a36Sopenharmony_ciMODULE_ALIAS("md-raid1");
342162306a36Sopenharmony_ciMODULE_ALIAS("md-level-1");
3422