// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 * bitmapped intelligence in resync:
 *
 * - bitmap marked during normal i/o
 * - bitmap used to skip nondirty blocks during sync
 *
 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 * - persistent bitmap code
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/interval_tree_generic.h>

#include <trace/events/block.h>

#include "md.h"
#include "raid1.h"
3862306a36Sopenharmony_ci#include "md-bitmap.h" 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_ci#define UNSUPPORTED_MDDEV_FLAGS \ 4162306a36Sopenharmony_ci ((1L << MD_HAS_JOURNAL) | \ 4262306a36Sopenharmony_ci (1L << MD_JOURNAL_CLEAN) | \ 4362306a36Sopenharmony_ci (1L << MD_HAS_PPL) | \ 4462306a36Sopenharmony_ci (1L << MD_HAS_MULTIPLE_PPLS)) 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_cistatic void allow_barrier(struct r1conf *conf, sector_t sector_nr); 4762306a36Sopenharmony_cistatic void lower_barrier(struct r1conf *conf, sector_t sector_nr); 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_ci#define raid1_log(md, fmt, args...) \ 5062306a36Sopenharmony_ci do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ci#include "raid1-10.c" 5362306a36Sopenharmony_ci 5462306a36Sopenharmony_ci#define START(node) ((node)->start) 5562306a36Sopenharmony_ci#define LAST(node) ((node)->last) 5662306a36Sopenharmony_ciINTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, 5762306a36Sopenharmony_ci START, LAST, static inline, raid1_rb); 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_cistatic int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, 6062306a36Sopenharmony_ci struct serial_info *si, int idx) 6162306a36Sopenharmony_ci{ 6262306a36Sopenharmony_ci unsigned long flags; 6362306a36Sopenharmony_ci int ret = 0; 6462306a36Sopenharmony_ci sector_t lo = r1_bio->sector; 6562306a36Sopenharmony_ci sector_t hi = lo + r1_bio->sectors; 6662306a36Sopenharmony_ci struct serial_in_rdev *serial = &rdev->serial[idx]; 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_ci spin_lock_irqsave(&serial->serial_lock, flags); 6962306a36Sopenharmony_ci /* collision happened */ 7062306a36Sopenharmony_ci if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) 7162306a36Sopenharmony_ci ret = -EBUSY; 7262306a36Sopenharmony_ci else { 7362306a36Sopenharmony_ci si->start = lo; 
7462306a36Sopenharmony_ci si->last = hi; 7562306a36Sopenharmony_ci raid1_rb_insert(si, &serial->serial_rb); 7662306a36Sopenharmony_ci } 7762306a36Sopenharmony_ci spin_unlock_irqrestore(&serial->serial_lock, flags); 7862306a36Sopenharmony_ci 7962306a36Sopenharmony_ci return ret; 8062306a36Sopenharmony_ci} 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_cistatic void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) 8362306a36Sopenharmony_ci{ 8462306a36Sopenharmony_ci struct mddev *mddev = rdev->mddev; 8562306a36Sopenharmony_ci struct serial_info *si; 8662306a36Sopenharmony_ci int idx = sector_to_idx(r1_bio->sector); 8762306a36Sopenharmony_ci struct serial_in_rdev *serial = &rdev->serial[idx]; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci if (WARN_ON(!mddev->serial_info_pool)) 9062306a36Sopenharmony_ci return; 9162306a36Sopenharmony_ci si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); 9262306a36Sopenharmony_ci wait_event(serial->serial_io_wait, 9362306a36Sopenharmony_ci check_and_add_serial(rdev, r1_bio, si, idx) == 0); 9462306a36Sopenharmony_ci} 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_cistatic void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) 9762306a36Sopenharmony_ci{ 9862306a36Sopenharmony_ci struct serial_info *si; 9962306a36Sopenharmony_ci unsigned long flags; 10062306a36Sopenharmony_ci int found = 0; 10162306a36Sopenharmony_ci struct mddev *mddev = rdev->mddev; 10262306a36Sopenharmony_ci int idx = sector_to_idx(lo); 10362306a36Sopenharmony_ci struct serial_in_rdev *serial = &rdev->serial[idx]; 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_ci spin_lock_irqsave(&serial->serial_lock, flags); 10662306a36Sopenharmony_ci for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); 10762306a36Sopenharmony_ci si; si = raid1_rb_iter_next(si, lo, hi)) { 10862306a36Sopenharmony_ci if (si->start == lo && si->last == hi) { 10962306a36Sopenharmony_ci raid1_rb_remove(si, &serial->serial_rb); 11062306a36Sopenharmony_ci 
mempool_free(si, mddev->serial_info_pool); 11162306a36Sopenharmony_ci found = 1; 11262306a36Sopenharmony_ci break; 11362306a36Sopenharmony_ci } 11462306a36Sopenharmony_ci } 11562306a36Sopenharmony_ci if (!found) 11662306a36Sopenharmony_ci WARN(1, "The write IO is not recorded for serialization\n"); 11762306a36Sopenharmony_ci spin_unlock_irqrestore(&serial->serial_lock, flags); 11862306a36Sopenharmony_ci wake_up(&serial->serial_io_wait); 11962306a36Sopenharmony_ci} 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci/* 12262306a36Sopenharmony_ci * for resync bio, r1bio pointer can be retrieved from the per-bio 12362306a36Sopenharmony_ci * 'struct resync_pages'. 12462306a36Sopenharmony_ci */ 12562306a36Sopenharmony_cistatic inline struct r1bio *get_resync_r1bio(struct bio *bio) 12662306a36Sopenharmony_ci{ 12762306a36Sopenharmony_ci return get_resync_pages(bio)->raid_bio; 12862306a36Sopenharmony_ci} 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_cistatic void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 13162306a36Sopenharmony_ci{ 13262306a36Sopenharmony_ci struct pool_info *pi = data; 13362306a36Sopenharmony_ci int size = offsetof(struct r1bio, bios[pi->raid_disks]); 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci /* allocate a r1bio with room for raid_disks entries in the bios array */ 13662306a36Sopenharmony_ci return kzalloc(size, gfp_flags); 13762306a36Sopenharmony_ci} 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci#define RESYNC_DEPTH 32 14062306a36Sopenharmony_ci#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 14162306a36Sopenharmony_ci#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) 14262306a36Sopenharmony_ci#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) 14362306a36Sopenharmony_ci#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) 14462306a36Sopenharmony_ci#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_cistatic void * r1buf_pool_alloc(gfp_t gfp_flags, void 
*data) 14762306a36Sopenharmony_ci{ 14862306a36Sopenharmony_ci struct pool_info *pi = data; 14962306a36Sopenharmony_ci struct r1bio *r1_bio; 15062306a36Sopenharmony_ci struct bio *bio; 15162306a36Sopenharmony_ci int need_pages; 15262306a36Sopenharmony_ci int j; 15362306a36Sopenharmony_ci struct resync_pages *rps; 15462306a36Sopenharmony_ci 15562306a36Sopenharmony_ci r1_bio = r1bio_pool_alloc(gfp_flags, pi); 15662306a36Sopenharmony_ci if (!r1_bio) 15762306a36Sopenharmony_ci return NULL; 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_ci rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), 16062306a36Sopenharmony_ci gfp_flags); 16162306a36Sopenharmony_ci if (!rps) 16262306a36Sopenharmony_ci goto out_free_r1bio; 16362306a36Sopenharmony_ci 16462306a36Sopenharmony_ci /* 16562306a36Sopenharmony_ci * Allocate bios : 1 for reading, n-1 for writing 16662306a36Sopenharmony_ci */ 16762306a36Sopenharmony_ci for (j = pi->raid_disks ; j-- ; ) { 16862306a36Sopenharmony_ci bio = bio_kmalloc(RESYNC_PAGES, gfp_flags); 16962306a36Sopenharmony_ci if (!bio) 17062306a36Sopenharmony_ci goto out_free_bio; 17162306a36Sopenharmony_ci bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0); 17262306a36Sopenharmony_ci r1_bio->bios[j] = bio; 17362306a36Sopenharmony_ci } 17462306a36Sopenharmony_ci /* 17562306a36Sopenharmony_ci * Allocate RESYNC_PAGES data pages and attach them to 17662306a36Sopenharmony_ci * the first bio. 17762306a36Sopenharmony_ci * If this is a user-requested check/repair, allocate 17862306a36Sopenharmony_ci * RESYNC_PAGES for each bio. 
17962306a36Sopenharmony_ci */ 18062306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) 18162306a36Sopenharmony_ci need_pages = pi->raid_disks; 18262306a36Sopenharmony_ci else 18362306a36Sopenharmony_ci need_pages = 1; 18462306a36Sopenharmony_ci for (j = 0; j < pi->raid_disks; j++) { 18562306a36Sopenharmony_ci struct resync_pages *rp = &rps[j]; 18662306a36Sopenharmony_ci 18762306a36Sopenharmony_ci bio = r1_bio->bios[j]; 18862306a36Sopenharmony_ci 18962306a36Sopenharmony_ci if (j < need_pages) { 19062306a36Sopenharmony_ci if (resync_alloc_pages(rp, gfp_flags)) 19162306a36Sopenharmony_ci goto out_free_pages; 19262306a36Sopenharmony_ci } else { 19362306a36Sopenharmony_ci memcpy(rp, &rps[0], sizeof(*rp)); 19462306a36Sopenharmony_ci resync_get_all_pages(rp); 19562306a36Sopenharmony_ci } 19662306a36Sopenharmony_ci 19762306a36Sopenharmony_ci rp->raid_bio = r1_bio; 19862306a36Sopenharmony_ci bio->bi_private = rp; 19962306a36Sopenharmony_ci } 20062306a36Sopenharmony_ci 20162306a36Sopenharmony_ci r1_bio->master_bio = NULL; 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci return r1_bio; 20462306a36Sopenharmony_ci 20562306a36Sopenharmony_ciout_free_pages: 20662306a36Sopenharmony_ci while (--j >= 0) 20762306a36Sopenharmony_ci resync_free_pages(&rps[j]); 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ciout_free_bio: 21062306a36Sopenharmony_ci while (++j < pi->raid_disks) { 21162306a36Sopenharmony_ci bio_uninit(r1_bio->bios[j]); 21262306a36Sopenharmony_ci kfree(r1_bio->bios[j]); 21362306a36Sopenharmony_ci } 21462306a36Sopenharmony_ci kfree(rps); 21562306a36Sopenharmony_ci 21662306a36Sopenharmony_ciout_free_r1bio: 21762306a36Sopenharmony_ci rbio_pool_free(r1_bio, data); 21862306a36Sopenharmony_ci return NULL; 21962306a36Sopenharmony_ci} 22062306a36Sopenharmony_ci 22162306a36Sopenharmony_cistatic void r1buf_pool_free(void *__r1_bio, void *data) 22262306a36Sopenharmony_ci{ 22362306a36Sopenharmony_ci struct pool_info *pi = data; 
22462306a36Sopenharmony_ci int i; 22562306a36Sopenharmony_ci struct r1bio *r1bio = __r1_bio; 22662306a36Sopenharmony_ci struct resync_pages *rp = NULL; 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_ci for (i = pi->raid_disks; i--; ) { 22962306a36Sopenharmony_ci rp = get_resync_pages(r1bio->bios[i]); 23062306a36Sopenharmony_ci resync_free_pages(rp); 23162306a36Sopenharmony_ci bio_uninit(r1bio->bios[i]); 23262306a36Sopenharmony_ci kfree(r1bio->bios[i]); 23362306a36Sopenharmony_ci } 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci /* resync pages array stored in the 1st bio's .bi_private */ 23662306a36Sopenharmony_ci kfree(rp); 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci rbio_pool_free(r1bio, data); 23962306a36Sopenharmony_ci} 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_cistatic void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) 24262306a36Sopenharmony_ci{ 24362306a36Sopenharmony_ci int i; 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 24662306a36Sopenharmony_ci struct bio **bio = r1_bio->bios + i; 24762306a36Sopenharmony_ci if (!BIO_SPECIAL(*bio)) 24862306a36Sopenharmony_ci bio_put(*bio); 24962306a36Sopenharmony_ci *bio = NULL; 25062306a36Sopenharmony_ci } 25162306a36Sopenharmony_ci} 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_cistatic void free_r1bio(struct r1bio *r1_bio) 25462306a36Sopenharmony_ci{ 25562306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci put_all_bios(conf, r1_bio); 25862306a36Sopenharmony_ci mempool_free(r1_bio, &conf->r1bio_pool); 25962306a36Sopenharmony_ci} 26062306a36Sopenharmony_ci 26162306a36Sopenharmony_cistatic void put_buf(struct r1bio *r1_bio) 26262306a36Sopenharmony_ci{ 26362306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 26462306a36Sopenharmony_ci sector_t sect = r1_bio->sector; 26562306a36Sopenharmony_ci int i; 26662306a36Sopenharmony_ci 
26762306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 26862306a36Sopenharmony_ci struct bio *bio = r1_bio->bios[i]; 26962306a36Sopenharmony_ci if (bio->bi_end_io) 27062306a36Sopenharmony_ci rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); 27162306a36Sopenharmony_ci } 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci mempool_free(r1_bio, &conf->r1buf_pool); 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_ci lower_barrier(conf, sect); 27662306a36Sopenharmony_ci} 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_cistatic void reschedule_retry(struct r1bio *r1_bio) 27962306a36Sopenharmony_ci{ 28062306a36Sopenharmony_ci unsigned long flags; 28162306a36Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 28262306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 28362306a36Sopenharmony_ci int idx; 28462306a36Sopenharmony_ci 28562306a36Sopenharmony_ci idx = sector_to_idx(r1_bio->sector); 28662306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 28762306a36Sopenharmony_ci list_add(&r1_bio->retry_list, &conf->retry_list); 28862306a36Sopenharmony_ci atomic_inc(&conf->nr_queued[idx]); 28962306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_ci wake_up(&conf->wait_barrier); 29262306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 29362306a36Sopenharmony_ci} 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci/* 29662306a36Sopenharmony_ci * raid_end_bio_io() is called when we have finished servicing a mirrored 29762306a36Sopenharmony_ci * operation and are ready to return a success/failure code to the buffer 29862306a36Sopenharmony_ci * cache layer. 
29962306a36Sopenharmony_ci */ 30062306a36Sopenharmony_cistatic void call_bio_endio(struct r1bio *r1_bio) 30162306a36Sopenharmony_ci{ 30262306a36Sopenharmony_ci struct bio *bio = r1_bio->master_bio; 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_ci if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) 30562306a36Sopenharmony_ci bio->bi_status = BLK_STS_IOERR; 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_ci bio_endio(bio); 30862306a36Sopenharmony_ci} 30962306a36Sopenharmony_ci 31062306a36Sopenharmony_cistatic void raid_end_bio_io(struct r1bio *r1_bio) 31162306a36Sopenharmony_ci{ 31262306a36Sopenharmony_ci struct bio *bio = r1_bio->master_bio; 31362306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 31462306a36Sopenharmony_ci sector_t sector = r1_bio->sector; 31562306a36Sopenharmony_ci 31662306a36Sopenharmony_ci /* if nobody has done the final endio yet, do it now */ 31762306a36Sopenharmony_ci if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 31862306a36Sopenharmony_ci pr_debug("raid1: sync end %s on sectors %llu-%llu\n", 31962306a36Sopenharmony_ci (bio_data_dir(bio) == WRITE) ? "write" : "read", 32062306a36Sopenharmony_ci (unsigned long long) bio->bi_iter.bi_sector, 32162306a36Sopenharmony_ci (unsigned long long) bio_end_sector(bio) - 1); 32262306a36Sopenharmony_ci 32362306a36Sopenharmony_ci call_bio_endio(r1_bio); 32462306a36Sopenharmony_ci } 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci free_r1bio(r1_bio); 32762306a36Sopenharmony_ci /* 32862306a36Sopenharmony_ci * Wake up any possible resync thread that waits for the device 32962306a36Sopenharmony_ci * to go idle. All I/Os, even write-behind writes, are done. 33062306a36Sopenharmony_ci */ 33162306a36Sopenharmony_ci allow_barrier(conf, sector); 33262306a36Sopenharmony_ci} 33362306a36Sopenharmony_ci 33462306a36Sopenharmony_ci/* 33562306a36Sopenharmony_ci * Update disk head position estimator based on IRQ completion info. 
33662306a36Sopenharmony_ci */ 33762306a36Sopenharmony_cistatic inline void update_head_pos(int disk, struct r1bio *r1_bio) 33862306a36Sopenharmony_ci{ 33962306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci conf->mirrors[disk].head_position = 34262306a36Sopenharmony_ci r1_bio->sector + (r1_bio->sectors); 34362306a36Sopenharmony_ci} 34462306a36Sopenharmony_ci 34562306a36Sopenharmony_ci/* 34662306a36Sopenharmony_ci * Find the disk number which triggered given bio 34762306a36Sopenharmony_ci */ 34862306a36Sopenharmony_cistatic int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) 34962306a36Sopenharmony_ci{ 35062306a36Sopenharmony_ci int mirror; 35162306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 35262306a36Sopenharmony_ci int raid_disks = conf->raid_disks; 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_ci for (mirror = 0; mirror < raid_disks * 2; mirror++) 35562306a36Sopenharmony_ci if (r1_bio->bios[mirror] == bio) 35662306a36Sopenharmony_ci break; 35762306a36Sopenharmony_ci 35862306a36Sopenharmony_ci BUG_ON(mirror == raid_disks * 2); 35962306a36Sopenharmony_ci update_head_pos(mirror, r1_bio); 36062306a36Sopenharmony_ci 36162306a36Sopenharmony_ci return mirror; 36262306a36Sopenharmony_ci} 36362306a36Sopenharmony_ci 36462306a36Sopenharmony_cistatic void raid1_end_read_request(struct bio *bio) 36562306a36Sopenharmony_ci{ 36662306a36Sopenharmony_ci int uptodate = !bio->bi_status; 36762306a36Sopenharmony_ci struct r1bio *r1_bio = bio->bi_private; 36862306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 36962306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; 37062306a36Sopenharmony_ci 37162306a36Sopenharmony_ci /* 37262306a36Sopenharmony_ci * this branch is our 'one mirror IO has finished' event handler: 37362306a36Sopenharmony_ci */ 37462306a36Sopenharmony_ci update_head_pos(r1_bio->read_disk, r1_bio); 
37562306a36Sopenharmony_ci 37662306a36Sopenharmony_ci if (uptodate) 37762306a36Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 37862306a36Sopenharmony_ci else if (test_bit(FailFast, &rdev->flags) && 37962306a36Sopenharmony_ci test_bit(R1BIO_FailFast, &r1_bio->state)) 38062306a36Sopenharmony_ci /* This was a fail-fast read so we definitely 38162306a36Sopenharmony_ci * want to retry */ 38262306a36Sopenharmony_ci ; 38362306a36Sopenharmony_ci else { 38462306a36Sopenharmony_ci /* If all other devices have failed, we want to return 38562306a36Sopenharmony_ci * the error upwards rather than fail the last device. 38662306a36Sopenharmony_ci * Here we redefine "uptodate" to mean "Don't want to retry" 38762306a36Sopenharmony_ci */ 38862306a36Sopenharmony_ci unsigned long flags; 38962306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 39062306a36Sopenharmony_ci if (r1_bio->mddev->degraded == conf->raid_disks || 39162306a36Sopenharmony_ci (r1_bio->mddev->degraded == conf->raid_disks-1 && 39262306a36Sopenharmony_ci test_bit(In_sync, &rdev->flags))) 39362306a36Sopenharmony_ci uptodate = 1; 39462306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 39562306a36Sopenharmony_ci } 39662306a36Sopenharmony_ci 39762306a36Sopenharmony_ci if (uptodate) { 39862306a36Sopenharmony_ci raid_end_bio_io(r1_bio); 39962306a36Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 40062306a36Sopenharmony_ci } else { 40162306a36Sopenharmony_ci /* 40262306a36Sopenharmony_ci * oops, read error: 40362306a36Sopenharmony_ci */ 40462306a36Sopenharmony_ci pr_err_ratelimited("md/raid1:%s: %pg: rescheduling sector %llu\n", 40562306a36Sopenharmony_ci mdname(conf->mddev), 40662306a36Sopenharmony_ci rdev->bdev, 40762306a36Sopenharmony_ci (unsigned long long)r1_bio->sector); 40862306a36Sopenharmony_ci set_bit(R1BIO_ReadError, &r1_bio->state); 40962306a36Sopenharmony_ci reschedule_retry(r1_bio); 41062306a36Sopenharmony_ci /* don't drop the reference on read_disk yet */ 
41162306a36Sopenharmony_ci } 41262306a36Sopenharmony_ci} 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_cistatic void close_write(struct r1bio *r1_bio) 41562306a36Sopenharmony_ci{ 41662306a36Sopenharmony_ci /* it really is the end of this request */ 41762306a36Sopenharmony_ci if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 41862306a36Sopenharmony_ci bio_free_pages(r1_bio->behind_master_bio); 41962306a36Sopenharmony_ci bio_put(r1_bio->behind_master_bio); 42062306a36Sopenharmony_ci r1_bio->behind_master_bio = NULL; 42162306a36Sopenharmony_ci } 42262306a36Sopenharmony_ci /* clear the bitmap if all writes complete successfully */ 42362306a36Sopenharmony_ci md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 42462306a36Sopenharmony_ci r1_bio->sectors, 42562306a36Sopenharmony_ci !test_bit(R1BIO_Degraded, &r1_bio->state), 42662306a36Sopenharmony_ci test_bit(R1BIO_BehindIO, &r1_bio->state)); 42762306a36Sopenharmony_ci md_write_end(r1_bio->mddev); 42862306a36Sopenharmony_ci} 42962306a36Sopenharmony_ci 43062306a36Sopenharmony_cistatic void r1_bio_write_done(struct r1bio *r1_bio) 43162306a36Sopenharmony_ci{ 43262306a36Sopenharmony_ci if (!atomic_dec_and_test(&r1_bio->remaining)) 43362306a36Sopenharmony_ci return; 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci if (test_bit(R1BIO_WriteError, &r1_bio->state)) 43662306a36Sopenharmony_ci reschedule_retry(r1_bio); 43762306a36Sopenharmony_ci else { 43862306a36Sopenharmony_ci close_write(r1_bio); 43962306a36Sopenharmony_ci if (test_bit(R1BIO_MadeGood, &r1_bio->state)) 44062306a36Sopenharmony_ci reschedule_retry(r1_bio); 44162306a36Sopenharmony_ci else 44262306a36Sopenharmony_ci raid_end_bio_io(r1_bio); 44362306a36Sopenharmony_ci } 44462306a36Sopenharmony_ci} 44562306a36Sopenharmony_ci 44662306a36Sopenharmony_cistatic void raid1_end_write_request(struct bio *bio) 44762306a36Sopenharmony_ci{ 44862306a36Sopenharmony_ci struct r1bio *r1_bio = bio->bi_private; 44962306a36Sopenharmony_ci int behind = 
test_bit(R1BIO_BehindIO, &r1_bio->state); 45062306a36Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 45162306a36Sopenharmony_ci struct bio *to_put = NULL; 45262306a36Sopenharmony_ci int mirror = find_bio_disk(r1_bio, bio); 45362306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[mirror].rdev; 45462306a36Sopenharmony_ci bool discard_error; 45562306a36Sopenharmony_ci sector_t lo = r1_bio->sector; 45662306a36Sopenharmony_ci sector_t hi = r1_bio->sector + r1_bio->sectors; 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; 45962306a36Sopenharmony_ci 46062306a36Sopenharmony_ci /* 46162306a36Sopenharmony_ci * 'one mirror IO has finished' event handler: 46262306a36Sopenharmony_ci */ 46362306a36Sopenharmony_ci if (bio->bi_status && !discard_error) { 46462306a36Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 46562306a36Sopenharmony_ci if (!test_and_set_bit(WantReplacement, &rdev->flags)) 46662306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, & 46762306a36Sopenharmony_ci conf->mddev->recovery); 46862306a36Sopenharmony_ci 46962306a36Sopenharmony_ci if (test_bit(FailFast, &rdev->flags) && 47062306a36Sopenharmony_ci (bio->bi_opf & MD_FAILFAST) && 47162306a36Sopenharmony_ci /* We never try FailFast to WriteMostly devices */ 47262306a36Sopenharmony_ci !test_bit(WriteMostly, &rdev->flags)) { 47362306a36Sopenharmony_ci md_error(r1_bio->mddev, rdev); 47462306a36Sopenharmony_ci } 47562306a36Sopenharmony_ci 47662306a36Sopenharmony_ci /* 47762306a36Sopenharmony_ci * When the device is faulty, it is not necessary to 47862306a36Sopenharmony_ci * handle write error. 
47962306a36Sopenharmony_ci */ 48062306a36Sopenharmony_ci if (!test_bit(Faulty, &rdev->flags)) 48162306a36Sopenharmony_ci set_bit(R1BIO_WriteError, &r1_bio->state); 48262306a36Sopenharmony_ci else { 48362306a36Sopenharmony_ci /* Fail the request */ 48462306a36Sopenharmony_ci set_bit(R1BIO_Degraded, &r1_bio->state); 48562306a36Sopenharmony_ci /* Finished with this branch */ 48662306a36Sopenharmony_ci r1_bio->bios[mirror] = NULL; 48762306a36Sopenharmony_ci to_put = bio; 48862306a36Sopenharmony_ci } 48962306a36Sopenharmony_ci } else { 49062306a36Sopenharmony_ci /* 49162306a36Sopenharmony_ci * Set R1BIO_Uptodate in our master bio, so that we 49262306a36Sopenharmony_ci * will return a good error code for to the higher 49362306a36Sopenharmony_ci * levels even if IO on some other mirrored buffer 49462306a36Sopenharmony_ci * fails. 49562306a36Sopenharmony_ci * 49662306a36Sopenharmony_ci * The 'master' represents the composite IO operation 49762306a36Sopenharmony_ci * to user-side. So if something waits for IO, then it 49862306a36Sopenharmony_ci * will wait for the 'master' bio. 49962306a36Sopenharmony_ci */ 50062306a36Sopenharmony_ci sector_t first_bad; 50162306a36Sopenharmony_ci int bad_sectors; 50262306a36Sopenharmony_ci 50362306a36Sopenharmony_ci r1_bio->bios[mirror] = NULL; 50462306a36Sopenharmony_ci to_put = bio; 50562306a36Sopenharmony_ci /* 50662306a36Sopenharmony_ci * Do not set R1BIO_Uptodate if the current device is 50762306a36Sopenharmony_ci * rebuilding or Faulty. This is because we cannot use 50862306a36Sopenharmony_ci * such device for properly reading the data back (we could 50962306a36Sopenharmony_ci * potentially use it, if the current write would have felt 51062306a36Sopenharmony_ci * before rdev->recovery_offset, but for simplicity we don't 51162306a36Sopenharmony_ci * check this here. 
51262306a36Sopenharmony_ci */ 51362306a36Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) && 51462306a36Sopenharmony_ci !test_bit(Faulty, &rdev->flags)) 51562306a36Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci /* Maybe we can clear some bad blocks. */ 51862306a36Sopenharmony_ci if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, 51962306a36Sopenharmony_ci &first_bad, &bad_sectors) && !discard_error) { 52062306a36Sopenharmony_ci r1_bio->bios[mirror] = IO_MADE_GOOD; 52162306a36Sopenharmony_ci set_bit(R1BIO_MadeGood, &r1_bio->state); 52262306a36Sopenharmony_ci } 52362306a36Sopenharmony_ci } 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci if (behind) { 52662306a36Sopenharmony_ci if (test_bit(CollisionCheck, &rdev->flags)) 52762306a36Sopenharmony_ci remove_serial(rdev, lo, hi); 52862306a36Sopenharmony_ci if (test_bit(WriteMostly, &rdev->flags)) 52962306a36Sopenharmony_ci atomic_dec(&r1_bio->behind_remaining); 53062306a36Sopenharmony_ci 53162306a36Sopenharmony_ci /* 53262306a36Sopenharmony_ci * In behind mode, we ACK the master bio once the I/O 53362306a36Sopenharmony_ci * has safely reached all non-writemostly 53462306a36Sopenharmony_ci * disks. 
Setting the Returned bit ensures that this 53562306a36Sopenharmony_ci * gets done only once -- we don't ever want to return 53662306a36Sopenharmony_ci * -EIO here, instead we'll wait 53762306a36Sopenharmony_ci */ 53862306a36Sopenharmony_ci if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 53962306a36Sopenharmony_ci test_bit(R1BIO_Uptodate, &r1_bio->state)) { 54062306a36Sopenharmony_ci /* Maybe we can return now */ 54162306a36Sopenharmony_ci if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 54262306a36Sopenharmony_ci struct bio *mbio = r1_bio->master_bio; 54362306a36Sopenharmony_ci pr_debug("raid1: behind end write sectors" 54462306a36Sopenharmony_ci " %llu-%llu\n", 54562306a36Sopenharmony_ci (unsigned long long) mbio->bi_iter.bi_sector, 54662306a36Sopenharmony_ci (unsigned long long) bio_end_sector(mbio) - 1); 54762306a36Sopenharmony_ci call_bio_endio(r1_bio); 54862306a36Sopenharmony_ci } 54962306a36Sopenharmony_ci } 55062306a36Sopenharmony_ci } else if (rdev->mddev->serialize_policy) 55162306a36Sopenharmony_ci remove_serial(rdev, lo, hi); 55262306a36Sopenharmony_ci if (r1_bio->bios[mirror] == NULL) 55362306a36Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 55462306a36Sopenharmony_ci 55562306a36Sopenharmony_ci /* 55662306a36Sopenharmony_ci * Let's see if all mirrored write operations have finished 55762306a36Sopenharmony_ci * already. 
55862306a36Sopenharmony_ci */ 55962306a36Sopenharmony_ci r1_bio_write_done(r1_bio); 56062306a36Sopenharmony_ci 56162306a36Sopenharmony_ci if (to_put) 56262306a36Sopenharmony_ci bio_put(to_put); 56362306a36Sopenharmony_ci} 56462306a36Sopenharmony_ci 56562306a36Sopenharmony_cistatic sector_t align_to_barrier_unit_end(sector_t start_sector, 56662306a36Sopenharmony_ci sector_t sectors) 56762306a36Sopenharmony_ci{ 56862306a36Sopenharmony_ci sector_t len; 56962306a36Sopenharmony_ci 57062306a36Sopenharmony_ci WARN_ON(sectors == 0); 57162306a36Sopenharmony_ci /* 57262306a36Sopenharmony_ci * len is the number of sectors from start_sector to end of the 57362306a36Sopenharmony_ci * barrier unit which start_sector belongs to. 57462306a36Sopenharmony_ci */ 57562306a36Sopenharmony_ci len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - 57662306a36Sopenharmony_ci start_sector; 57762306a36Sopenharmony_ci 57862306a36Sopenharmony_ci if (len > sectors) 57962306a36Sopenharmony_ci len = sectors; 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_ci return len; 58262306a36Sopenharmony_ci} 58362306a36Sopenharmony_ci 58462306a36Sopenharmony_ci/* 58562306a36Sopenharmony_ci * This routine returns the disk from which the requested read should 58662306a36Sopenharmony_ci * be done. There is a per-array 'next expected sequential IO' sector 58762306a36Sopenharmony_ci * number - if this matches on the next IO then we use the last disk. 58862306a36Sopenharmony_ci * There is also a per-disk 'last know head position' sector that is 58962306a36Sopenharmony_ci * maintained from IRQ contexts, both the normal and the resync IO 59062306a36Sopenharmony_ci * completion handlers update this position correctly. If there is no 59162306a36Sopenharmony_ci * perfect sequential match then we pick the disk whose head is closest. 
59262306a36Sopenharmony_ci *
59362306a36Sopenharmony_ci * If there are 2 mirrors in the same 2 devices, performance degrades
59462306a36Sopenharmony_ci * because position is mirror, not device based.
59562306a36Sopenharmony_ci *
59662306a36Sopenharmony_ci * The rdev for the device selected will have nr_pending incremented.
59762306a36Sopenharmony_ci */
59862306a36Sopenharmony_cistatic int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
59962306a36Sopenharmony_ci{
60062306a36Sopenharmony_ci const sector_t this_sector = r1_bio->sector;
60162306a36Sopenharmony_ci int sectors;
60262306a36Sopenharmony_ci int best_good_sectors;
60362306a36Sopenharmony_ci int best_disk, best_dist_disk, best_pending_disk;
60462306a36Sopenharmony_ci int has_nonrot_disk;
60562306a36Sopenharmony_ci int disk;
60662306a36Sopenharmony_ci sector_t best_dist;
60762306a36Sopenharmony_ci unsigned int min_pending;
60862306a36Sopenharmony_ci struct md_rdev *rdev;
60962306a36Sopenharmony_ci int choose_first;
61062306a36Sopenharmony_ci int choose_next_idle;
61162306a36Sopenharmony_ci
61262306a36Sopenharmony_ci rcu_read_lock();
61362306a36Sopenharmony_ci /*
61462306a36Sopenharmony_ci * Check if we can balance. We can balance on the whole
61562306a36Sopenharmony_ci * device if no resync is going on, or below the resync window.
61662306a36Sopenharmony_ci * We take the first readable disk when above the resync window.
61762306a36Sopenharmony_ci */
61862306a36Sopenharmony_ci retry:
61962306a36Sopenharmony_ci sectors = r1_bio->sectors;
62062306a36Sopenharmony_ci best_disk = -1;
62162306a36Sopenharmony_ci best_dist_disk = -1;
62262306a36Sopenharmony_ci best_dist = MaxSector;
62362306a36Sopenharmony_ci best_pending_disk = -1;
62462306a36Sopenharmony_ci min_pending = UINT_MAX;
62562306a36Sopenharmony_ci best_good_sectors = 0;
62662306a36Sopenharmony_ci has_nonrot_disk = 0;
62762306a36Sopenharmony_ci choose_next_idle = 0;
62862306a36Sopenharmony_ci clear_bit(R1BIO_FailFast, &r1_bio->state);
62962306a36Sopenharmony_ci
63062306a36Sopenharmony_ci if ((conf->mddev->recovery_cp < this_sector + sectors) ||
63162306a36Sopenharmony_ci (mddev_is_clustered(conf->mddev) &&
63262306a36Sopenharmony_ci md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
63362306a36Sopenharmony_ci this_sector + sectors)))
63462306a36Sopenharmony_ci choose_first = 1;
63562306a36Sopenharmony_ci else
63662306a36Sopenharmony_ci choose_first = 0;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
63962306a36Sopenharmony_ci sector_t dist;
64062306a36Sopenharmony_ci sector_t first_bad;
64162306a36Sopenharmony_ci int bad_sectors;
64262306a36Sopenharmony_ci unsigned int pending;
64362306a36Sopenharmony_ci bool nonrot;
64462306a36Sopenharmony_ci
64562306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[disk].rdev);
64662306a36Sopenharmony_ci if (r1_bio->bios[disk] == IO_BLOCKED
64762306a36Sopenharmony_ci || rdev == NULL
64862306a36Sopenharmony_ci || test_bit(Faulty, &rdev->flags))
64962306a36Sopenharmony_ci continue;
65062306a36Sopenharmony_ci if (!test_bit(In_sync, &rdev->flags) &&
65162306a36Sopenharmony_ci rdev->recovery_offset < this_sector + sectors)
65262306a36Sopenharmony_ci continue;
65362306a36Sopenharmony_ci if (test_bit(WriteMostly, &rdev->flags)) {
65462306a36Sopenharmony_ci /* Don't balance among write-mostly, just
65562306a36Sopenharmony_ci * use the first as a last resort */
65662306a36Sopenharmony_ci if (best_dist_disk < 0) {
65762306a36Sopenharmony_ci if (is_badblock(rdev, this_sector, sectors,
65862306a36Sopenharmony_ci &first_bad, &bad_sectors)) {
65962306a36Sopenharmony_ci if (first_bad <= this_sector)
66062306a36Sopenharmony_ci /* Cannot use this */
66162306a36Sopenharmony_ci continue;
66262306a36Sopenharmony_ci best_good_sectors = first_bad - this_sector;
66362306a36Sopenharmony_ci } else
66462306a36Sopenharmony_ci best_good_sectors = sectors;
66562306a36Sopenharmony_ci best_dist_disk = disk;
66662306a36Sopenharmony_ci best_pending_disk = disk;
66762306a36Sopenharmony_ci }
66862306a36Sopenharmony_ci continue;
66962306a36Sopenharmony_ci }
67062306a36Sopenharmony_ci /* This is a reasonable device to use. It might
67162306a36Sopenharmony_ci * even be best.
67262306a36Sopenharmony_ci */
67362306a36Sopenharmony_ci if (is_badblock(rdev, this_sector, sectors,
67462306a36Sopenharmony_ci &first_bad, &bad_sectors)) {
67562306a36Sopenharmony_ci if (best_dist < MaxSector)
67662306a36Sopenharmony_ci /* already have a better device */
67762306a36Sopenharmony_ci continue;
67862306a36Sopenharmony_ci if (first_bad <= this_sector) {
67962306a36Sopenharmony_ci /* cannot read here. If this is the 'primary'
68062306a36Sopenharmony_ci * device, then we must not read beyond
68162306a36Sopenharmony_ci * bad_sectors from another device..
68262306a36Sopenharmony_ci */
68362306a36Sopenharmony_ci bad_sectors -= (this_sector - first_bad);
68462306a36Sopenharmony_ci if (choose_first && sectors > bad_sectors)
68562306a36Sopenharmony_ci sectors = bad_sectors;
68662306a36Sopenharmony_ci if (best_good_sectors > sectors)
68762306a36Sopenharmony_ci best_good_sectors = sectors;
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci } else {
69062306a36Sopenharmony_ci sector_t good_sectors = first_bad - this_sector;
69162306a36Sopenharmony_ci if (good_sectors > best_good_sectors) {
69262306a36Sopenharmony_ci best_good_sectors = good_sectors;
69362306a36Sopenharmony_ci best_disk = disk;
69462306a36Sopenharmony_ci }
69562306a36Sopenharmony_ci if (choose_first)
69662306a36Sopenharmony_ci break;
69762306a36Sopenharmony_ci }
69862306a36Sopenharmony_ci continue;
69962306a36Sopenharmony_ci } else {
70062306a36Sopenharmony_ci if ((sectors > best_good_sectors) && (best_disk >= 0))
70162306a36Sopenharmony_ci best_disk = -1;
70262306a36Sopenharmony_ci best_good_sectors = sectors;
70362306a36Sopenharmony_ci }
70462306a36Sopenharmony_ci
70562306a36Sopenharmony_ci if (best_disk >= 0)
70662306a36Sopenharmony_ci /* At least two disks to choose from so failfast is OK */
70762306a36Sopenharmony_ci set_bit(R1BIO_FailFast, &r1_bio->state);
70862306a36Sopenharmony_ci
70962306a36Sopenharmony_ci nonrot = bdev_nonrot(rdev->bdev);
71062306a36Sopenharmony_ci has_nonrot_disk |= nonrot;
71162306a36Sopenharmony_ci pending = atomic_read(&rdev->nr_pending);
71262306a36Sopenharmony_ci dist = abs(this_sector - conf->mirrors[disk].head_position);
71362306a36Sopenharmony_ci if (choose_first) {
71462306a36Sopenharmony_ci best_disk = disk;
71562306a36Sopenharmony_ci break;
71662306a36Sopenharmony_ci }
71762306a36Sopenharmony_ci /* Don't change to another disk for sequential reads */
71862306a36Sopenharmony_ci if (conf->mirrors[disk].next_seq_sect == this_sector
71962306a36Sopenharmony_ci || dist == 0) {
72062306a36Sopenharmony_ci int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
72162306a36Sopenharmony_ci struct raid1_info *mirror = &conf->mirrors[disk];
72262306a36Sopenharmony_ci
72362306a36Sopenharmony_ci best_disk = disk;
72462306a36Sopenharmony_ci /*
72562306a36Sopenharmony_ci * If buffered sequential IO size exceeds optimal
72662306a36Sopenharmony_ci * iosize, check if there is idle disk. If yes, choose
72762306a36Sopenharmony_ci * the idle disk. read_balance could already choose an
72862306a36Sopenharmony_ci * idle disk before noticing it's a sequential IO in
72962306a36Sopenharmony_ci * this disk. This doesn't matter because this disk
73062306a36Sopenharmony_ci * will idle, next time it will be utilized after the
73162306a36Sopenharmony_ci * first disk has IO size exceeds optimal iosize. In
73262306a36Sopenharmony_ci * this way, iosize of the first disk will be optimal
73362306a36Sopenharmony_ci * iosize at least. iosize of the second disk might be
73462306a36Sopenharmony_ci * small, but not a big deal since when the second disk
73562306a36Sopenharmony_ci * starts IO, the first disk is likely still busy.
73662306a36Sopenharmony_ci */
73762306a36Sopenharmony_ci if (nonrot && opt_iosize > 0 &&
73862306a36Sopenharmony_ci mirror->seq_start != MaxSector &&
73962306a36Sopenharmony_ci mirror->next_seq_sect > opt_iosize &&
74062306a36Sopenharmony_ci mirror->next_seq_sect - opt_iosize >=
74162306a36Sopenharmony_ci mirror->seq_start) {
74262306a36Sopenharmony_ci choose_next_idle = 1;
74362306a36Sopenharmony_ci continue;
74462306a36Sopenharmony_ci }
74562306a36Sopenharmony_ci break;
74662306a36Sopenharmony_ci }
74762306a36Sopenharmony_ci
74862306a36Sopenharmony_ci if (choose_next_idle)
74962306a36Sopenharmony_ci continue;
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci if (min_pending > pending) {
75262306a36Sopenharmony_ci min_pending = pending;
75362306a36Sopenharmony_ci best_pending_disk = disk;
75462306a36Sopenharmony_ci }
75562306a36Sopenharmony_ci
75662306a36Sopenharmony_ci if (dist < best_dist) {
75762306a36Sopenharmony_ci best_dist = dist;
75862306a36Sopenharmony_ci best_dist_disk = disk;
75962306a36Sopenharmony_ci }
76062306a36Sopenharmony_ci }
76162306a36Sopenharmony_ci
76262306a36Sopenharmony_ci /*
76362306a36Sopenharmony_ci * If all disks are rotational, choose the closest disk. If any disk is
76462306a36Sopenharmony_ci * non-rotational, choose the disk with less pending request even the
76562306a36Sopenharmony_ci * disk is rotational, which might/might not be optimal for raids with
76662306a36Sopenharmony_ci * mixed rotational/non-rotational disks depending on workload.
76762306a36Sopenharmony_ci */
76862306a36Sopenharmony_ci if (best_disk == -1) {
76962306a36Sopenharmony_ci if (has_nonrot_disk || min_pending == 0)
77062306a36Sopenharmony_ci best_disk = best_pending_disk;
77162306a36Sopenharmony_ci else
77262306a36Sopenharmony_ci best_disk = best_dist_disk;
77362306a36Sopenharmony_ci }
77462306a36Sopenharmony_ci
77562306a36Sopenharmony_ci if (best_disk >= 0) {
77662306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
77762306a36Sopenharmony_ci if (!rdev)
77862306a36Sopenharmony_ci goto retry;
77962306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending);
78062306a36Sopenharmony_ci sectors = best_good_sectors;
78162306a36Sopenharmony_ci
78262306a36Sopenharmony_ci if (conf->mirrors[best_disk].next_seq_sect != this_sector)
78362306a36Sopenharmony_ci conf->mirrors[best_disk].seq_start = this_sector;
78462306a36Sopenharmony_ci
78562306a36Sopenharmony_ci conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
78662306a36Sopenharmony_ci }
78762306a36Sopenharmony_ci rcu_read_unlock();
78862306a36Sopenharmony_ci *max_sectors = sectors;
78962306a36Sopenharmony_ci
79062306a36Sopenharmony_ci return best_disk;
79162306a36Sopenharmony_ci}
79262306a36Sopenharmony_ci
/* Wake sleepers on conf->wait_barrier, skipping the wake_up if nobody waits. */
79362306a36Sopenharmony_cistatic void wake_up_barrier(struct r1conf *conf)
79462306a36Sopenharmony_ci{
79562306a36Sopenharmony_ci if (wq_has_sleeper(&conf->wait_barrier))
79662306a36Sopenharmony_ci wake_up(&conf->wait_barrier);
79762306a36Sopenharmony_ci}
79862306a36Sopenharmony_ci
/*
 * Submit a singly-linked (bi_next) chain of queued write bios, after first
 * flushing any pending bitmap updates so the bitmap is persisted before data.
 */
79962306a36Sopenharmony_cistatic void flush_bio_list(struct r1conf *conf, struct bio *bio)
80062306a36Sopenharmony_ci{
80162306a36Sopenharmony_ci /* flush any pending bitmap writes to disk before proceeding w/ I/O */
80262306a36Sopenharmony_ci raid1_prepare_flush_writes(conf->mddev->bitmap);
80362306a36Sopenharmony_ci wake_up_barrier(conf);
80462306a36Sopenharmony_ci
80562306a36Sopenharmony_ci while (bio) { /* submit pending writes */
80662306a36Sopenharmony_ci struct bio *next = 
bio->bi_next;
80762306a36Sopenharmony_ci
80862306a36Sopenharmony_ci raid1_submit_write(bio);
80962306a36Sopenharmony_ci bio = next;
81062306a36Sopenharmony_ci cond_resched();
81162306a36Sopenharmony_ci }
81262306a36Sopenharmony_ci}
81362306a36Sopenharmony_ci
81462306a36Sopenharmony_cistatic void flush_pending_writes(struct r1conf *conf)
81562306a36Sopenharmony_ci{
81662306a36Sopenharmony_ci /* Any writes that have been queued but are awaiting
81762306a36Sopenharmony_ci * bitmap updates get flushed here.
81862306a36Sopenharmony_ci */
81962306a36Sopenharmony_ci spin_lock_irq(&conf->device_lock);
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_ci if (conf->pending_bio_list.head) {
82262306a36Sopenharmony_ci struct blk_plug plug;
82362306a36Sopenharmony_ci struct bio *bio;
82462306a36Sopenharmony_ci
82562306a36Sopenharmony_ci bio = bio_list_get(&conf->pending_bio_list);
82662306a36Sopenharmony_ci spin_unlock_irq(&conf->device_lock);
82762306a36Sopenharmony_ci
82862306a36Sopenharmony_ci /*
82962306a36Sopenharmony_ci * As this is called in a wait_event() loop (see freeze_array),
83062306a36Sopenharmony_ci * current->state might be TASK_UNINTERRUPTIBLE which will
83162306a36Sopenharmony_ci * cause a warning when we prepare to wait again. As it is
83262306a36Sopenharmony_ci * rare that this path is taken, it is perfectly safe to force
83362306a36Sopenharmony_ci * us to go around the wait_event() loop again, so the warning
83462306a36Sopenharmony_ci * is a false-positive. Silence the warning by resetting
83562306a36Sopenharmony_ci * thread state
83662306a36Sopenharmony_ci */
83762306a36Sopenharmony_ci __set_current_state(TASK_RUNNING);
83862306a36Sopenharmony_ci blk_start_plug(&plug);
83962306a36Sopenharmony_ci flush_bio_list(conf, bio);
84062306a36Sopenharmony_ci blk_finish_plug(&plug);
84162306a36Sopenharmony_ci } else
84262306a36Sopenharmony_ci spin_unlock_irq(&conf->device_lock);
84362306a36Sopenharmony_ci}
84462306a36Sopenharmony_ci
84562306a36Sopenharmony_ci/* Barriers....
84662306a36Sopenharmony_ci * Sometimes we need to suspend IO while we do something else,
84762306a36Sopenharmony_ci * either some resync/recovery, or reconfigure the array.
84862306a36Sopenharmony_ci * To do this we raise a 'barrier'.
84962306a36Sopenharmony_ci * The 'barrier' is a counter that can be raised multiple times
85062306a36Sopenharmony_ci * to count how many activities are happening which preclude
85162306a36Sopenharmony_ci * normal IO.
85262306a36Sopenharmony_ci * We can only raise the barrier if there is no pending IO.
85362306a36Sopenharmony_ci * i.e. if nr_pending == 0.
85462306a36Sopenharmony_ci * We choose only to raise the barrier if no-one is waiting for the
85562306a36Sopenharmony_ci * barrier to go down. This means that as soon as an IO request
85662306a36Sopenharmony_ci * is ready, no other operations which require a barrier will start
85762306a36Sopenharmony_ci * until the IO request has had a chance.
85862306a36Sopenharmony_ci *
85962306a36Sopenharmony_ci * So: regular IO calls 'wait_barrier'. When that returns there
86062306a36Sopenharmony_ci * is no background IO happening, It must arrange to call
86162306a36Sopenharmony_ci * allow_barrier when it has finished its IO.
86262306a36Sopenharmony_ci * background IO calls must call raise_barrier. Once that returns
86362306a36Sopenharmony_ci * there is no normal IO happening. It must arrange to call
86462306a36Sopenharmony_ci * lower_barrier when the particular background IO completes.
86562306a36Sopenharmony_ci *
86662306a36Sopenharmony_ci * If resync/recovery is interrupted, returns -EINTR;
86762306a36Sopenharmony_ci * Otherwise, returns 0.
86862306a36Sopenharmony_ci */
86962306a36Sopenharmony_cistatic int raise_barrier(struct r1conf *conf, sector_t sector_nr)
87062306a36Sopenharmony_ci{
87162306a36Sopenharmony_ci int idx = sector_to_idx(sector_nr);
87262306a36Sopenharmony_ci
87362306a36Sopenharmony_ci spin_lock_irq(&conf->resync_lock);
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci /* Wait until no block IO is waiting */
87662306a36Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier,
87762306a36Sopenharmony_ci !atomic_read(&conf->nr_waiting[idx]),
87862306a36Sopenharmony_ci conf->resync_lock);
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci /* block any new IO from starting */
88162306a36Sopenharmony_ci atomic_inc(&conf->barrier[idx]);
88262306a36Sopenharmony_ci /*
88362306a36Sopenharmony_ci * In raise_barrier() we firstly increase conf->barrier[idx] then
88462306a36Sopenharmony_ci * check conf->nr_pending[idx]. In _wait_barrier() we firstly
88562306a36Sopenharmony_ci * increase conf->nr_pending[idx] then check conf->barrier[idx].
88662306a36Sopenharmony_ci * A memory barrier here to make sure conf->nr_pending[idx] won't
88762306a36Sopenharmony_ci * be fetched before conf->barrier[idx] is increased. Otherwise
88862306a36Sopenharmony_ci * there will be a race between raise_barrier() and _wait_barrier().
88962306a36Sopenharmony_ci */
89062306a36Sopenharmony_ci smp_mb__after_atomic();
89162306a36Sopenharmony_ci
89262306a36Sopenharmony_ci /* For these conditions we must wait:
89362306a36Sopenharmony_ci * A: while the array is in frozen state
89462306a36Sopenharmony_ci * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
89562306a36Sopenharmony_ci * existing in corresponding I/O barrier bucket.
89662306a36Sopenharmony_ci * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
89762306a36Sopenharmony_ci * max resync count which allowed on current I/O barrier bucket.
89862306a36Sopenharmony_ci */
89962306a36Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier,
90062306a36Sopenharmony_ci (!conf->array_frozen &&
90162306a36Sopenharmony_ci !atomic_read(&conf->nr_pending[idx]) &&
90262306a36Sopenharmony_ci atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) ||
90362306a36Sopenharmony_ci test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
90462306a36Sopenharmony_ci conf->resync_lock);
90562306a36Sopenharmony_ci
90662306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
90762306a36Sopenharmony_ci atomic_dec(&conf->barrier[idx]);
90862306a36Sopenharmony_ci spin_unlock_irq(&conf->resync_lock);
90962306a36Sopenharmony_ci wake_up(&conf->wait_barrier);
91062306a36Sopenharmony_ci return -EINTR;
91162306a36Sopenharmony_ci }
91262306a36Sopenharmony_ci
91362306a36Sopenharmony_ci atomic_inc(&conf->nr_sync_pending);
91462306a36Sopenharmony_ci spin_unlock_irq(&conf->resync_lock);
91562306a36Sopenharmony_ci
91662306a36Sopenharmony_ci return 0;
91762306a36Sopenharmony_ci}
91862306a36Sopenharmony_ci
/*
 * Drop one barrier reference on sector_nr's bucket (pairs with a successful
 * raise_barrier()) and wake anything waiting on the barrier.
 */
91962306a36Sopenharmony_cistatic void lower_barrier(struct r1conf *conf, sector_t sector_nr)
92062306a36Sopenharmony_ci{
92162306a36Sopenharmony_ci int idx = sector_to_idx(sector_nr);
92262306a36Sopenharmony_ci
92362306a36Sopenharmony_ci BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
92462306a36Sopenharmony_ci
92562306a36Sopenharmony_ci atomic_dec(&conf->barrier[idx]);
92662306a36Sopenharmony_ci atomic_dec(&conf->nr_sync_pending);
92762306a36Sopenharmony_ci wake_up(&conf->wait_barrier);
92862306a36Sopenharmony_ci}
92962306a36Sopenharmony_ci
93062306a36Sopenharmony_cistatic bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
93162306a36Sopenharmony_ci{
93262306a36Sopenharmony_ci bool ret = true;
93362306a36Sopenharmony_ci
93462306a36Sopenharmony_ci /*
93562306a36Sopenharmony_ci * We need to increase conf->nr_pending[idx] very early here,
93662306a36Sopenharmony_ci * then raise_barrier() can be blocked when it waits for
93762306a36Sopenharmony_ci * conf->nr_pending[idx] to be 0. Then we can avoid holding
93862306a36Sopenharmony_ci * conf->resync_lock when there is no barrier raised in same
93962306a36Sopenharmony_ci * barrier unit bucket. Also if the array is frozen, I/O
94062306a36Sopenharmony_ci * should be blocked until array is unfrozen.
94162306a36Sopenharmony_ci */
94262306a36Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]);
94362306a36Sopenharmony_ci /*
94462306a36Sopenharmony_ci * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
94562306a36Sopenharmony_ci * check conf->barrier[idx]. In raise_barrier() we firstly increase
94662306a36Sopenharmony_ci * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
94762306a36Sopenharmony_ci * barrier is necessary here to make sure conf->barrier[idx] won't be
94862306a36Sopenharmony_ci * fetched before conf->nr_pending[idx] is increased. Otherwise there
94962306a36Sopenharmony_ci * will be a race between _wait_barrier() and raise_barrier().
95062306a36Sopenharmony_ci */
95162306a36Sopenharmony_ci smp_mb__after_atomic();
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_ci /*
95462306a36Sopenharmony_ci * Don't worry about checking two atomic_t variables at same time
95562306a36Sopenharmony_ci * here. If during we check conf->barrier[idx], the array is
95662306a36Sopenharmony_ci * frozen (conf->array_frozen is 1), and conf->barrier[idx] is
95762306a36Sopenharmony_ci * 0, it is safe to return and make the I/O continue. Because the
95862306a36Sopenharmony_ci * array is frozen, all I/O returned here will eventually complete
95962306a36Sopenharmony_ci * or be queued, no race will happen. See code comment in
96062306a36Sopenharmony_ci * freeze_array().
96162306a36Sopenharmony_ci */
96262306a36Sopenharmony_ci if (!READ_ONCE(conf->array_frozen) &&
96362306a36Sopenharmony_ci !atomic_read(&conf->barrier[idx]))
96462306a36Sopenharmony_ci return ret;
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci /*
96762306a36Sopenharmony_ci * After holding conf->resync_lock, conf->nr_pending[idx]
96862306a36Sopenharmony_ci * should be decreased before waiting for barrier to drop.
96962306a36Sopenharmony_ci * Otherwise, we may encounter a race condition because
97062306a36Sopenharmony_ci * raise_barrier() might be waiting for conf->nr_pending[idx]
97162306a36Sopenharmony_ci * to be 0 at same time.
97262306a36Sopenharmony_ci */
97362306a36Sopenharmony_ci spin_lock_irq(&conf->resync_lock);
97462306a36Sopenharmony_ci atomic_inc(&conf->nr_waiting[idx]);
97562306a36Sopenharmony_ci atomic_dec(&conf->nr_pending[idx]);
97662306a36Sopenharmony_ci /*
97762306a36Sopenharmony_ci * In case freeze_array() is waiting for
97862306a36Sopenharmony_ci * get_unqueued_pending() == extra
97962306a36Sopenharmony_ci */
98062306a36Sopenharmony_ci wake_up_barrier(conf);
98162306a36Sopenharmony_ci /* Wait for the barrier in same barrier unit bucket to drop. */
98262306a36Sopenharmony_ci
98362306a36Sopenharmony_ci /* Return false when nowait flag is set */
98462306a36Sopenharmony_ci if (nowait) {
98562306a36Sopenharmony_ci ret = false;
98662306a36Sopenharmony_ci } else {
98762306a36Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier,
98862306a36Sopenharmony_ci !conf->array_frozen &&
98962306a36Sopenharmony_ci !atomic_read(&conf->barrier[idx]),
99062306a36Sopenharmony_ci conf->resync_lock);
99162306a36Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]);
99262306a36Sopenharmony_ci }
99362306a36Sopenharmony_ci
99462306a36Sopenharmony_ci atomic_dec(&conf->nr_waiting[idx]);
99562306a36Sopenharmony_ci spin_unlock_irq(&conf->resync_lock);
99662306a36Sopenharmony_ci return ret;
99762306a36Sopenharmony_ci}
99862306a36Sopenharmony_ci
99962306a36Sopenharmony_cistatic bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
100062306a36Sopenharmony_ci{
100162306a36Sopenharmony_ci int idx = sector_to_idx(sector_nr);
100262306a36Sopenharmony_ci bool ret = true;
100362306a36Sopenharmony_ci
100462306a36Sopenharmony_ci /*
100562306a36Sopenharmony_ci * Very similar to _wait_barrier(). The difference is, for read
100662306a36Sopenharmony_ci * I/O we don't need wait for sync I/O, but if the whole array
100762306a36Sopenharmony_ci * is frozen, the read I/O still has to wait until the array is
100862306a36Sopenharmony_ci * unfrozen. Since there is no ordering requirement with
100962306a36Sopenharmony_ci * conf->barrier[idx] here, memory barrier is unnecessary as well.
101062306a36Sopenharmony_ci */
101162306a36Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]);
101262306a36Sopenharmony_ci
101362306a36Sopenharmony_ci if (!READ_ONCE(conf->array_frozen))
101462306a36Sopenharmony_ci return ret;
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci spin_lock_irq(&conf->resync_lock);
101762306a36Sopenharmony_ci atomic_inc(&conf->nr_waiting[idx]);
101862306a36Sopenharmony_ci atomic_dec(&conf->nr_pending[idx]);
101962306a36Sopenharmony_ci /*
102062306a36Sopenharmony_ci * In case freeze_array() is waiting for
102162306a36Sopenharmony_ci * get_unqueued_pending() == extra
102262306a36Sopenharmony_ci */
102362306a36Sopenharmony_ci wake_up_barrier(conf);
102462306a36Sopenharmony_ci /* Wait for array to be unfrozen */
102562306a36Sopenharmony_ci
102662306a36Sopenharmony_ci /* Return false when nowait flag is set */
102762306a36Sopenharmony_ci if (nowait) {
102862306a36Sopenharmony_ci
102962306a36Sopenharmony_ci ret = false;
103062306a36Sopenharmony_ci } else {
103162306a36Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier,
103262306a36Sopenharmony_ci !conf->array_frozen,
103362306a36Sopenharmony_ci conf->resync_lock);
103462306a36Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]);
103562306a36Sopenharmony_ci }
103662306a36Sopenharmony_ci
103762306a36Sopenharmony_ci atomic_dec(&conf->nr_waiting[idx]);
103862306a36Sopenharmony_ci spin_unlock_irq(&conf->resync_lock);
103962306a36Sopenharmony_ci return ret;
104062306a36Sopenharmony_ci}
104162306a36Sopenharmony_ci
/* Block regular I/O on sector_nr's barrier bucket until no barrier is raised. */
104262306a36Sopenharmony_cistatic bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
104362306a36Sopenharmony_ci{
104462306a36Sopenharmony_ci int idx = sector_to_idx(sector_nr);
104562306a36Sopenharmony_ci
104662306a36Sopenharmony_ci return _wait_barrier(conf, idx, nowait);
104762306a36Sopenharmony_ci}
104862306a36Sopenharmony_ci
/* Release the regular-I/O hold taken by _wait_barrier()/wait_read_barrier(). */
104962306a36Sopenharmony_cistatic void _allow_barrier(struct r1conf *conf, int idx)
105062306a36Sopenharmony_ci{
105162306a36Sopenharmony_ci atomic_dec(&conf->nr_pending[idx]);
105262306a36Sopenharmony_ci wake_up_barrier(conf);
105362306a36Sopenharmony_ci}
105462306a36Sopenharmony_ci
105562306a36Sopenharmony_cistatic void allow_barrier(struct r1conf *conf, sector_t sector_nr)
105662306a36Sopenharmony_ci{
105762306a36Sopenharmony_ci int idx = sector_to_idx(sector_nr);
105862306a36Sopenharmony_ci
105962306a36Sopenharmony_ci _allow_barrier(conf, idx);
106062306a36Sopenharmony_ci}
106162306a36Sopenharmony_ci
106262306a36Sopenharmony_ci/* conf->resync_lock should be held */
106362306a36Sopenharmony_cistatic int get_unqueued_pending(struct r1conf *conf)
106462306a36Sopenharmony_ci{
106562306a36Sopenharmony_ci int idx, ret;
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci ret = atomic_read(&conf->nr_sync_pending);
106862306a36Sopenharmony_ci for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
106962306a36Sopenharmony_ci ret += atomic_read(&conf->nr_pending[idx]) -
107062306a36Sopenharmony_ci atomic_read(&conf->nr_queued[idx]);
107162306a36Sopenharmony_ci
107262306a36Sopenharmony_ci return ret;
107362306a36Sopenharmony_ci}
107462306a36Sopenharmony_ci
107562306a36Sopenharmony_cistatic void freeze_array(struct r1conf *conf, int extra)
107662306a36Sopenharmony_ci{
107762306a36Sopenharmony_ci /* Stop sync I/O and normal I/O and wait for everything to
107862306a36Sopenharmony_ci * go quiet.
107962306a36Sopenharmony_ci * This is called in two situations:
108062306a36Sopenharmony_ci * 1) management command handlers (reshape, remove disk, quiesce).
108162306a36Sopenharmony_ci * 2) one normal I/O request failed.
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci * After array_frozen is set to 1, new sync IO will be blocked at
108462306a36Sopenharmony_ci * raise_barrier(), and new normal I/O will blocked at _wait_barrier()
108562306a36Sopenharmony_ci * or wait_read_barrier(). The flying I/Os will either complete or be
108662306a36Sopenharmony_ci * queued.
When everything goes quiet, there are only queued I/Os left.
108762306a36Sopenharmony_ci
108862306a36Sopenharmony_ci * Every flying I/O contributes to a conf->nr_pending[idx], idx is the
108962306a36Sopenharmony_ci * barrier bucket index which this I/O request hits. When all sync and
109062306a36Sopenharmony_ci * normal I/O are queued, sum of all conf->nr_pending[] will match sum
109162306a36Sopenharmony_ci * of all conf->nr_queued[]. But normal I/O failure is an exception,
109262306a36Sopenharmony_ci * in handle_read_error(), we may call freeze_array() before trying to
109362306a36Sopenharmony_ci * fix the read error. In this case, the error read I/O is not queued,
109462306a36Sopenharmony_ci * so get_unqueued_pending() == 1.
109562306a36Sopenharmony_ci *
109662306a36Sopenharmony_ci * Therefore before this function returns, we need to wait until
109762306a36Sopenharmony_ci * get_unqueued_pending(conf) gets equal to extra. For
109862306a36Sopenharmony_ci * normal I/O context, extra is 1, in all other situations extra is 0.
109962306a36Sopenharmony_ci */
110062306a36Sopenharmony_ci spin_lock_irq(&conf->resync_lock);
110162306a36Sopenharmony_ci conf->array_frozen = 1;
110262306a36Sopenharmony_ci raid1_log(conf->mddev, "wait freeze");
110362306a36Sopenharmony_ci wait_event_lock_irq_cmd(
110462306a36Sopenharmony_ci conf->wait_barrier,
110562306a36Sopenharmony_ci get_unqueued_pending(conf) == extra,
110662306a36Sopenharmony_ci conf->resync_lock,
110762306a36Sopenharmony_ci flush_pending_writes(conf));
110862306a36Sopenharmony_ci spin_unlock_irq(&conf->resync_lock);
110962306a36Sopenharmony_ci}
111062306a36Sopenharmony_cistatic void unfreeze_array(struct r1conf *conf)
111162306a36Sopenharmony_ci{
111262306a36Sopenharmony_ci /* reverse the effect of the freeze */
111362306a36Sopenharmony_ci spin_lock_irq(&conf->resync_lock);
111462306a36Sopenharmony_ci conf->array_frozen = 0;
111562306a36Sopenharmony_ci spin_unlock_irq(&conf->resync_lock);
111662306a36Sopenharmony_ci wake_up(&conf->wait_barrier);
111762306a36Sopenharmony_ci}
111862306a36Sopenharmony_ci
/*
 * Allocate a private copy of the write bio's payload for write-behind.
 * On success r1_bio->behind_master_bio is set and R1BIO_BehindIO is flagged;
 * on allocation failure r1_bio is left unchanged so the write proceeds as
 * ordinary (synchronous) I/O.
 */
111962306a36Sopenharmony_cistatic void alloc_behind_master_bio(struct r1bio *r1_bio,
112062306a36Sopenharmony_ci struct bio *bio)
112162306a36Sopenharmony_ci{
112262306a36Sopenharmony_ci int size = bio->bi_iter.bi_size;
112362306a36Sopenharmony_ci unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
112462306a36Sopenharmony_ci int i = 0;
112562306a36Sopenharmony_ci struct bio *behind_bio = NULL;
112662306a36Sopenharmony_ci
112762306a36Sopenharmony_ci behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
112862306a36Sopenharmony_ci &r1_bio->mddev->bio_set);
112962306a36Sopenharmony_ci if (!behind_bio)
113062306a36Sopenharmony_ci return;
113162306a36Sopenharmony_ci
113262306a36Sopenharmony_ci /* discard op, we don't support writezero/writesame yet */
113362306a36Sopenharmony_ci if (!bio_has_data(bio)) {
113462306a36Sopenharmony_ci behind_bio->bi_iter.bi_size = size;
113562306a36Sopenharmony_ci goto skip_copy;
113662306a36Sopenharmony_ci }
113762306a36Sopenharmony_ci
113862306a36Sopenharmony_ci while (i < vcnt && size) {
113962306a36Sopenharmony_ci struct page *page;
114062306a36Sopenharmony_ci int len = min_t(int, PAGE_SIZE, size);
114162306a36Sopenharmony_ci
114262306a36Sopenharmony_ci page = alloc_page(GFP_NOIO);
114362306a36Sopenharmony_ci if (unlikely(!page))
114462306a36Sopenharmony_ci goto free_pages;
114562306a36Sopenharmony_ci
114662306a36Sopenharmony_ci if (!bio_add_page(behind_bio, page, len, 0)) {
114762306a36Sopenharmony_ci put_page(page);
114862306a36Sopenharmony_ci goto free_pages;
114962306a36Sopenharmony_ci }
115062306a36Sopenharmony_ci
115162306a36Sopenharmony_ci size -= len;
115262306a36Sopenharmony_ci i++;
115362306a36Sopenharmony_ci }
115462306a36Sopenharmony_ci
115562306a36Sopenharmony_ci bio_copy_data(behind_bio, bio);
115662306a36Sopenharmony_ciskip_copy:
115762306a36Sopenharmony_ci r1_bio->behind_master_bio = behind_bio;
115862306a36Sopenharmony_ci set_bit(R1BIO_BehindIO, &r1_bio->state);
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci return;
116162306a36Sopenharmony_ci
116262306a36Sopenharmony_cifree_pages:
116362306a36Sopenharmony_ci pr_debug("%dB behind alloc failed, doing sync I/O\n",
116462306a36Sopenharmony_ci bio->bi_iter.bi_size);
116562306a36Sopenharmony_ci bio_free_pages(behind_bio);
116662306a36Sopenharmony_ci bio_put(behind_bio);
116762306a36Sopenharmony_ci}
116862306a36Sopenharmony_ci
/*
 * blk plug callback: when called from scheduler context, hand the plugged
 * writes to the raid1d thread via conf->pending_bio_list; otherwise submit
 * them directly from this context.
 */
116962306a36Sopenharmony_cistatic void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
117062306a36Sopenharmony_ci{
117162306a36Sopenharmony_ci struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
117262306a36Sopenharmony_ci cb);
117362306a36Sopenharmony_ci struct mddev *mddev = plug->cb.data;
117462306a36Sopenharmony_ci struct r1conf *conf = mddev->private;
117562306a36Sopenharmony_ci struct bio *bio;
117662306a36Sopenharmony_ci
117762306a36Sopenharmony_ci if (from_schedule) {
117862306a36Sopenharmony_ci spin_lock_irq(&conf->device_lock);
117962306a36Sopenharmony_ci bio_list_merge(&conf->pending_bio_list, &plug->pending);
118062306a36Sopenharmony_ci spin_unlock_irq(&conf->device_lock);
118162306a36Sopenharmony_ci wake_up_barrier(conf);
118262306a36Sopenharmony_ci md_wakeup_thread(mddev->thread);
118362306a36Sopenharmony_ci kfree(plug);
118462306a36Sopenharmony_ci return;
118562306a36Sopenharmony_ci }
118662306a36Sopenharmony_ci
118762306a36Sopenharmony_ci /* we aren't scheduling, so we can do the write-out directly. */
118862306a36Sopenharmony_ci bio = bio_list_get(&plug->pending);
118962306a36Sopenharmony_ci flush_bio_list(conf, bio);
119062306a36Sopenharmony_ci kfree(plug);
119162306a36Sopenharmony_ci}
119262306a36Sopenharmony_ci
/* Initialise the r1bio fields that mirror the incoming master bio. */
119362306a36Sopenharmony_cistatic void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio)
119462306a36Sopenharmony_ci{
119562306a36Sopenharmony_ci r1_bio->master_bio = bio;
119662306a36Sopenharmony_ci r1_bio->sectors = bio_sectors(bio);
119762306a36Sopenharmony_ci r1_bio->state = 0;
119862306a36Sopenharmony_ci r1_bio->mddev = mddev;
119962306a36Sopenharmony_ci r1_bio->sector = bio->bi_iter.bi_sector;
120062306a36Sopenharmony_ci}
120162306a36Sopenharmony_ci
/* Allocate an r1bio from the mempool and initialise it from 'bio'. */
120262306a36Sopenharmony_cistatic inline struct r1bio *
120362306a36Sopenharmony_cialloc_r1bio(struct mddev *mddev, struct bio *bio)
120462306a36Sopenharmony_ci{
120562306a36Sopenharmony_ci struct r1conf *conf = mddev->private;
120662306a36Sopenharmony_ci struct r1bio *r1_bio;
120762306a36Sopenharmony_ci
120862306a36Sopenharmony_ci r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO);
120962306a36Sopenharmony_ci /* Ensure no bio records IO_BLOCKED */
121062306a36Sopenharmony_ci memset(r1_bio->bios, 0, conf->raid_disks * sizeof(r1_bio->bios[0]));
121162306a36Sopenharmony_ci init_r1bio(r1_bio, mddev, bio);
121262306a36Sopenharmony_ci return r1_bio;
121362306a36Sopenharmony_ci}
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_cistatic void raid1_read_request(struct mddev *mddev, struct bio *bio,
121662306a36Sopenharmony_ci int max_read_sectors, struct r1bio *r1_bio) 121762306a36Sopenharmony_ci{ 121862306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 121962306a36Sopenharmony_ci struct raid1_info *mirror; 122062306a36Sopenharmony_ci struct bio *read_bio; 122162306a36Sopenharmony_ci struct bitmap *bitmap = mddev->bitmap; 122262306a36Sopenharmony_ci const enum req_op op = bio_op(bio); 122362306a36Sopenharmony_ci const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 122462306a36Sopenharmony_ci int max_sectors; 122562306a36Sopenharmony_ci int rdisk; 122662306a36Sopenharmony_ci bool r1bio_existed = !!r1_bio; 122762306a36Sopenharmony_ci char b[BDEVNAME_SIZE]; 122862306a36Sopenharmony_ci 122962306a36Sopenharmony_ci /* 123062306a36Sopenharmony_ci * If r1_bio is set, we are blocking the raid1d thread 123162306a36Sopenharmony_ci * so there is a tiny risk of deadlock. So ask for 123262306a36Sopenharmony_ci * emergency memory if needed. 123362306a36Sopenharmony_ci */ 123462306a36Sopenharmony_ci gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO; 123562306a36Sopenharmony_ci 123662306a36Sopenharmony_ci if (r1bio_existed) { 123762306a36Sopenharmony_ci /* Need to get the block device name carefully */ 123862306a36Sopenharmony_ci struct md_rdev *rdev; 123962306a36Sopenharmony_ci rcu_read_lock(); 124062306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); 124162306a36Sopenharmony_ci if (rdev) 124262306a36Sopenharmony_ci snprintf(b, sizeof(b), "%pg", rdev->bdev); 124362306a36Sopenharmony_ci else 124462306a36Sopenharmony_ci strcpy(b, "???"); 124562306a36Sopenharmony_ci rcu_read_unlock(); 124662306a36Sopenharmony_ci } 124762306a36Sopenharmony_ci 124862306a36Sopenharmony_ci /* 124962306a36Sopenharmony_ci * Still need barrier for READ in case that whole 125062306a36Sopenharmony_ci * array is frozen. 
125162306a36Sopenharmony_ci */ 125262306a36Sopenharmony_ci if (!wait_read_barrier(conf, bio->bi_iter.bi_sector, 125362306a36Sopenharmony_ci bio->bi_opf & REQ_NOWAIT)) { 125462306a36Sopenharmony_ci bio_wouldblock_error(bio); 125562306a36Sopenharmony_ci return; 125662306a36Sopenharmony_ci } 125762306a36Sopenharmony_ci 125862306a36Sopenharmony_ci if (!r1_bio) 125962306a36Sopenharmony_ci r1_bio = alloc_r1bio(mddev, bio); 126062306a36Sopenharmony_ci else 126162306a36Sopenharmony_ci init_r1bio(r1_bio, mddev, bio); 126262306a36Sopenharmony_ci r1_bio->sectors = max_read_sectors; 126362306a36Sopenharmony_ci 126462306a36Sopenharmony_ci /* 126562306a36Sopenharmony_ci * make_request() can abort the operation when read-ahead is being 126662306a36Sopenharmony_ci * used and no empty request is available. 126762306a36Sopenharmony_ci */ 126862306a36Sopenharmony_ci rdisk = read_balance(conf, r1_bio, &max_sectors); 126962306a36Sopenharmony_ci 127062306a36Sopenharmony_ci if (rdisk < 0) { 127162306a36Sopenharmony_ci /* couldn't find anywhere to read from */ 127262306a36Sopenharmony_ci if (r1bio_existed) { 127362306a36Sopenharmony_ci pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", 127462306a36Sopenharmony_ci mdname(mddev), 127562306a36Sopenharmony_ci b, 127662306a36Sopenharmony_ci (unsigned long long)r1_bio->sector); 127762306a36Sopenharmony_ci } 127862306a36Sopenharmony_ci raid_end_bio_io(r1_bio); 127962306a36Sopenharmony_ci return; 128062306a36Sopenharmony_ci } 128162306a36Sopenharmony_ci mirror = conf->mirrors + rdisk; 128262306a36Sopenharmony_ci 128362306a36Sopenharmony_ci if (r1bio_existed) 128462306a36Sopenharmony_ci pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %pg\n", 128562306a36Sopenharmony_ci mdname(mddev), 128662306a36Sopenharmony_ci (unsigned long long)r1_bio->sector, 128762306a36Sopenharmony_ci mirror->rdev->bdev); 128862306a36Sopenharmony_ci 128962306a36Sopenharmony_ci if (test_bit(WriteMostly, 
&mirror->rdev->flags) && 129062306a36Sopenharmony_ci bitmap) { 129162306a36Sopenharmony_ci /* 129262306a36Sopenharmony_ci * Reading from a write-mostly device must take care not to 129362306a36Sopenharmony_ci * over-take any writes that are 'behind' 129462306a36Sopenharmony_ci */ 129562306a36Sopenharmony_ci raid1_log(mddev, "wait behind writes"); 129662306a36Sopenharmony_ci wait_event(bitmap->behind_wait, 129762306a36Sopenharmony_ci atomic_read(&bitmap->behind_writes) == 0); 129862306a36Sopenharmony_ci } 129962306a36Sopenharmony_ci 130062306a36Sopenharmony_ci if (max_sectors < bio_sectors(bio)) { 130162306a36Sopenharmony_ci struct bio *split = bio_split(bio, max_sectors, 130262306a36Sopenharmony_ci gfp, &conf->bio_split); 130362306a36Sopenharmony_ci bio_chain(split, bio); 130462306a36Sopenharmony_ci submit_bio_noacct(bio); 130562306a36Sopenharmony_ci bio = split; 130662306a36Sopenharmony_ci r1_bio->master_bio = bio; 130762306a36Sopenharmony_ci r1_bio->sectors = max_sectors; 130862306a36Sopenharmony_ci } 130962306a36Sopenharmony_ci 131062306a36Sopenharmony_ci r1_bio->read_disk = rdisk; 131162306a36Sopenharmony_ci if (!r1bio_existed) { 131262306a36Sopenharmony_ci md_account_bio(mddev, &bio); 131362306a36Sopenharmony_ci r1_bio->master_bio = bio; 131462306a36Sopenharmony_ci } 131562306a36Sopenharmony_ci read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, 131662306a36Sopenharmony_ci &mddev->bio_set); 131762306a36Sopenharmony_ci 131862306a36Sopenharmony_ci r1_bio->bios[rdisk] = read_bio; 131962306a36Sopenharmony_ci 132062306a36Sopenharmony_ci read_bio->bi_iter.bi_sector = r1_bio->sector + 132162306a36Sopenharmony_ci mirror->rdev->data_offset; 132262306a36Sopenharmony_ci read_bio->bi_end_io = raid1_end_read_request; 132362306a36Sopenharmony_ci read_bio->bi_opf = op | do_sync; 132462306a36Sopenharmony_ci if (test_bit(FailFast, &mirror->rdev->flags) && 132562306a36Sopenharmony_ci test_bit(R1BIO_FailFast, &r1_bio->state)) 132662306a36Sopenharmony_ci read_bio->bi_opf |= 
MD_FAILFAST; 132762306a36Sopenharmony_ci read_bio->bi_private = r1_bio; 132862306a36Sopenharmony_ci 132962306a36Sopenharmony_ci if (mddev->gendisk) 133062306a36Sopenharmony_ci trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 133162306a36Sopenharmony_ci r1_bio->sector); 133262306a36Sopenharmony_ci 133362306a36Sopenharmony_ci submit_bio_noacct(read_bio); 133462306a36Sopenharmony_ci} 133562306a36Sopenharmony_ci 133662306a36Sopenharmony_cistatic void raid1_write_request(struct mddev *mddev, struct bio *bio, 133762306a36Sopenharmony_ci int max_write_sectors) 133862306a36Sopenharmony_ci{ 133962306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 134062306a36Sopenharmony_ci struct r1bio *r1_bio; 134162306a36Sopenharmony_ci int i, disks; 134262306a36Sopenharmony_ci struct bitmap *bitmap = mddev->bitmap; 134362306a36Sopenharmony_ci unsigned long flags; 134462306a36Sopenharmony_ci struct md_rdev *blocked_rdev; 134562306a36Sopenharmony_ci int first_clone; 134662306a36Sopenharmony_ci int max_sectors; 134762306a36Sopenharmony_ci bool write_behind = false; 134862306a36Sopenharmony_ci 134962306a36Sopenharmony_ci if (mddev_is_clustered(mddev) && 135062306a36Sopenharmony_ci md_cluster_ops->area_resyncing(mddev, WRITE, 135162306a36Sopenharmony_ci bio->bi_iter.bi_sector, bio_end_sector(bio))) { 135262306a36Sopenharmony_ci 135362306a36Sopenharmony_ci DEFINE_WAIT(w); 135462306a36Sopenharmony_ci if (bio->bi_opf & REQ_NOWAIT) { 135562306a36Sopenharmony_ci bio_wouldblock_error(bio); 135662306a36Sopenharmony_ci return; 135762306a36Sopenharmony_ci } 135862306a36Sopenharmony_ci for (;;) { 135962306a36Sopenharmony_ci prepare_to_wait(&conf->wait_barrier, 136062306a36Sopenharmony_ci &w, TASK_IDLE); 136162306a36Sopenharmony_ci if (!md_cluster_ops->area_resyncing(mddev, WRITE, 136262306a36Sopenharmony_ci bio->bi_iter.bi_sector, 136362306a36Sopenharmony_ci bio_end_sector(bio))) 136462306a36Sopenharmony_ci break; 136562306a36Sopenharmony_ci schedule(); 136662306a36Sopenharmony_ci } 
136762306a36Sopenharmony_ci finish_wait(&conf->wait_barrier, &w); 136862306a36Sopenharmony_ci } 136962306a36Sopenharmony_ci 137062306a36Sopenharmony_ci /* 137162306a36Sopenharmony_ci * Register the new request and wait if the reconstruction 137262306a36Sopenharmony_ci * thread has put up a bar for new requests. 137362306a36Sopenharmony_ci * Continue immediately if no resync is active currently. 137462306a36Sopenharmony_ci */ 137562306a36Sopenharmony_ci if (!wait_barrier(conf, bio->bi_iter.bi_sector, 137662306a36Sopenharmony_ci bio->bi_opf & REQ_NOWAIT)) { 137762306a36Sopenharmony_ci bio_wouldblock_error(bio); 137862306a36Sopenharmony_ci return; 137962306a36Sopenharmony_ci } 138062306a36Sopenharmony_ci 138162306a36Sopenharmony_ci retry_write: 138262306a36Sopenharmony_ci r1_bio = alloc_r1bio(mddev, bio); 138362306a36Sopenharmony_ci r1_bio->sectors = max_write_sectors; 138462306a36Sopenharmony_ci 138562306a36Sopenharmony_ci /* first select target devices under rcu_lock and 138662306a36Sopenharmony_ci * inc refcount on their rdev. Record them by setting 138762306a36Sopenharmony_ci * bios[x] to bio 138862306a36Sopenharmony_ci * If there are known/acknowledged bad blocks on any device on 138962306a36Sopenharmony_ci * which we have seen a write error, we want to avoid writing those 139062306a36Sopenharmony_ci * blocks. 139162306a36Sopenharmony_ci * This potentially requires several writes to write around 139262306a36Sopenharmony_ci * the bad blocks. Each set of writes gets it's own r1bio 139362306a36Sopenharmony_ci * with a set of bios attached. 
139462306a36Sopenharmony_ci */ 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_ci disks = conf->raid_disks * 2; 139762306a36Sopenharmony_ci blocked_rdev = NULL; 139862306a36Sopenharmony_ci rcu_read_lock(); 139962306a36Sopenharmony_ci max_sectors = r1_bio->sectors; 140062306a36Sopenharmony_ci for (i = 0; i < disks; i++) { 140162306a36Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 140262306a36Sopenharmony_ci 140362306a36Sopenharmony_ci /* 140462306a36Sopenharmony_ci * The write-behind io is only attempted on drives marked as 140562306a36Sopenharmony_ci * write-mostly, which means we could allocate write behind 140662306a36Sopenharmony_ci * bio later. 140762306a36Sopenharmony_ci */ 140862306a36Sopenharmony_ci if (rdev && test_bit(WriteMostly, &rdev->flags)) 140962306a36Sopenharmony_ci write_behind = true; 141062306a36Sopenharmony_ci 141162306a36Sopenharmony_ci if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 141262306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 141362306a36Sopenharmony_ci blocked_rdev = rdev; 141462306a36Sopenharmony_ci break; 141562306a36Sopenharmony_ci } 141662306a36Sopenharmony_ci r1_bio->bios[i] = NULL; 141762306a36Sopenharmony_ci if (!rdev || test_bit(Faulty, &rdev->flags)) { 141862306a36Sopenharmony_ci if (i < conf->raid_disks) 141962306a36Sopenharmony_ci set_bit(R1BIO_Degraded, &r1_bio->state); 142062306a36Sopenharmony_ci continue; 142162306a36Sopenharmony_ci } 142262306a36Sopenharmony_ci 142362306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 142462306a36Sopenharmony_ci if (test_bit(WriteErrorSeen, &rdev->flags)) { 142562306a36Sopenharmony_ci sector_t first_bad; 142662306a36Sopenharmony_ci int bad_sectors; 142762306a36Sopenharmony_ci int is_bad; 142862306a36Sopenharmony_ci 142962306a36Sopenharmony_ci is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, 143062306a36Sopenharmony_ci &first_bad, &bad_sectors); 143162306a36Sopenharmony_ci if (is_bad < 0) { 143262306a36Sopenharmony_ci /* 
mustn't write here until the bad block is 143362306a36Sopenharmony_ci * acknowledged*/ 143462306a36Sopenharmony_ci set_bit(BlockedBadBlocks, &rdev->flags); 143562306a36Sopenharmony_ci blocked_rdev = rdev; 143662306a36Sopenharmony_ci break; 143762306a36Sopenharmony_ci } 143862306a36Sopenharmony_ci if (is_bad && first_bad <= r1_bio->sector) { 143962306a36Sopenharmony_ci /* Cannot write here at all */ 144062306a36Sopenharmony_ci bad_sectors -= (r1_bio->sector - first_bad); 144162306a36Sopenharmony_ci if (bad_sectors < max_sectors) 144262306a36Sopenharmony_ci /* mustn't write more than bad_sectors 144362306a36Sopenharmony_ci * to other devices yet 144462306a36Sopenharmony_ci */ 144562306a36Sopenharmony_ci max_sectors = bad_sectors; 144662306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 144762306a36Sopenharmony_ci /* We don't set R1BIO_Degraded as that 144862306a36Sopenharmony_ci * only applies if the disk is 144962306a36Sopenharmony_ci * missing, so it might be re-added, 145062306a36Sopenharmony_ci * and we want to know to recover this 145162306a36Sopenharmony_ci * chunk. 
145262306a36Sopenharmony_ci * In this case the device is here, 145362306a36Sopenharmony_ci * and the fact that this chunk is not 145462306a36Sopenharmony_ci * in-sync is recorded in the bad 145562306a36Sopenharmony_ci * block log 145662306a36Sopenharmony_ci */ 145762306a36Sopenharmony_ci continue; 145862306a36Sopenharmony_ci } 145962306a36Sopenharmony_ci if (is_bad) { 146062306a36Sopenharmony_ci int good_sectors = first_bad - r1_bio->sector; 146162306a36Sopenharmony_ci if (good_sectors < max_sectors) 146262306a36Sopenharmony_ci max_sectors = good_sectors; 146362306a36Sopenharmony_ci } 146462306a36Sopenharmony_ci } 146562306a36Sopenharmony_ci r1_bio->bios[i] = bio; 146662306a36Sopenharmony_ci } 146762306a36Sopenharmony_ci rcu_read_unlock(); 146862306a36Sopenharmony_ci 146962306a36Sopenharmony_ci if (unlikely(blocked_rdev)) { 147062306a36Sopenharmony_ci /* Wait for this device to become unblocked */ 147162306a36Sopenharmony_ci int j; 147262306a36Sopenharmony_ci 147362306a36Sopenharmony_ci for (j = 0; j < i; j++) 147462306a36Sopenharmony_ci if (r1_bio->bios[j]) 147562306a36Sopenharmony_ci rdev_dec_pending(conf->mirrors[j].rdev, mddev); 147662306a36Sopenharmony_ci free_r1bio(r1_bio); 147762306a36Sopenharmony_ci allow_barrier(conf, bio->bi_iter.bi_sector); 147862306a36Sopenharmony_ci 147962306a36Sopenharmony_ci if (bio->bi_opf & REQ_NOWAIT) { 148062306a36Sopenharmony_ci bio_wouldblock_error(bio); 148162306a36Sopenharmony_ci return; 148262306a36Sopenharmony_ci } 148362306a36Sopenharmony_ci raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); 148462306a36Sopenharmony_ci md_wait_for_blocked_rdev(blocked_rdev, mddev); 148562306a36Sopenharmony_ci wait_barrier(conf, bio->bi_iter.bi_sector, false); 148662306a36Sopenharmony_ci goto retry_write; 148762306a36Sopenharmony_ci } 148862306a36Sopenharmony_ci 148962306a36Sopenharmony_ci /* 149062306a36Sopenharmony_ci * When using a bitmap, we may call alloc_behind_master_bio below. 
149162306a36Sopenharmony_ci * alloc_behind_master_bio allocates a copy of the data payload a page 149262306a36Sopenharmony_ci * at a time and thus needs a new bio that can fit the whole payload 149362306a36Sopenharmony_ci * this bio in page sized chunks. 149462306a36Sopenharmony_ci */ 149562306a36Sopenharmony_ci if (write_behind && bitmap) 149662306a36Sopenharmony_ci max_sectors = min_t(int, max_sectors, 149762306a36Sopenharmony_ci BIO_MAX_VECS * (PAGE_SIZE >> 9)); 149862306a36Sopenharmony_ci if (max_sectors < bio_sectors(bio)) { 149962306a36Sopenharmony_ci struct bio *split = bio_split(bio, max_sectors, 150062306a36Sopenharmony_ci GFP_NOIO, &conf->bio_split); 150162306a36Sopenharmony_ci bio_chain(split, bio); 150262306a36Sopenharmony_ci submit_bio_noacct(bio); 150362306a36Sopenharmony_ci bio = split; 150462306a36Sopenharmony_ci r1_bio->master_bio = bio; 150562306a36Sopenharmony_ci r1_bio->sectors = max_sectors; 150662306a36Sopenharmony_ci } 150762306a36Sopenharmony_ci 150862306a36Sopenharmony_ci md_account_bio(mddev, &bio); 150962306a36Sopenharmony_ci r1_bio->master_bio = bio; 151062306a36Sopenharmony_ci atomic_set(&r1_bio->remaining, 1); 151162306a36Sopenharmony_ci atomic_set(&r1_bio->behind_remaining, 0); 151262306a36Sopenharmony_ci 151362306a36Sopenharmony_ci first_clone = 1; 151462306a36Sopenharmony_ci 151562306a36Sopenharmony_ci for (i = 0; i < disks; i++) { 151662306a36Sopenharmony_ci struct bio *mbio = NULL; 151762306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[i].rdev; 151862306a36Sopenharmony_ci if (!r1_bio->bios[i]) 151962306a36Sopenharmony_ci continue; 152062306a36Sopenharmony_ci 152162306a36Sopenharmony_ci if (first_clone) { 152262306a36Sopenharmony_ci /* do behind I/O ? 
152362306a36Sopenharmony_ci * Not if there are too many, or cannot 152462306a36Sopenharmony_ci * allocate memory, or a reader on WriteMostly 152562306a36Sopenharmony_ci * is waiting for behind writes to flush */ 152662306a36Sopenharmony_ci if (bitmap && write_behind && 152762306a36Sopenharmony_ci (atomic_read(&bitmap->behind_writes) 152862306a36Sopenharmony_ci < mddev->bitmap_info.max_write_behind) && 152962306a36Sopenharmony_ci !waitqueue_active(&bitmap->behind_wait)) { 153062306a36Sopenharmony_ci alloc_behind_master_bio(r1_bio, bio); 153162306a36Sopenharmony_ci } 153262306a36Sopenharmony_ci 153362306a36Sopenharmony_ci md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, 153462306a36Sopenharmony_ci test_bit(R1BIO_BehindIO, &r1_bio->state)); 153562306a36Sopenharmony_ci first_clone = 0; 153662306a36Sopenharmony_ci } 153762306a36Sopenharmony_ci 153862306a36Sopenharmony_ci if (r1_bio->behind_master_bio) { 153962306a36Sopenharmony_ci mbio = bio_alloc_clone(rdev->bdev, 154062306a36Sopenharmony_ci r1_bio->behind_master_bio, 154162306a36Sopenharmony_ci GFP_NOIO, &mddev->bio_set); 154262306a36Sopenharmony_ci if (test_bit(CollisionCheck, &rdev->flags)) 154362306a36Sopenharmony_ci wait_for_serialization(rdev, r1_bio); 154462306a36Sopenharmony_ci if (test_bit(WriteMostly, &rdev->flags)) 154562306a36Sopenharmony_ci atomic_inc(&r1_bio->behind_remaining); 154662306a36Sopenharmony_ci } else { 154762306a36Sopenharmony_ci mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, 154862306a36Sopenharmony_ci &mddev->bio_set); 154962306a36Sopenharmony_ci 155062306a36Sopenharmony_ci if (mddev->serialize_policy) 155162306a36Sopenharmony_ci wait_for_serialization(rdev, r1_bio); 155262306a36Sopenharmony_ci } 155362306a36Sopenharmony_ci 155462306a36Sopenharmony_ci r1_bio->bios[i] = mbio; 155562306a36Sopenharmony_ci 155662306a36Sopenharmony_ci mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); 155762306a36Sopenharmony_ci mbio->bi_end_io = raid1_end_write_request; 
155862306a36Sopenharmony_ci mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); 155962306a36Sopenharmony_ci if (test_bit(FailFast, &rdev->flags) && 156062306a36Sopenharmony_ci !test_bit(WriteMostly, &rdev->flags) && 156162306a36Sopenharmony_ci conf->raid_disks - mddev->degraded > 1) 156262306a36Sopenharmony_ci mbio->bi_opf |= MD_FAILFAST; 156362306a36Sopenharmony_ci mbio->bi_private = r1_bio; 156462306a36Sopenharmony_ci 156562306a36Sopenharmony_ci atomic_inc(&r1_bio->remaining); 156662306a36Sopenharmony_ci 156762306a36Sopenharmony_ci if (mddev->gendisk) 156862306a36Sopenharmony_ci trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), 156962306a36Sopenharmony_ci r1_bio->sector); 157062306a36Sopenharmony_ci /* flush_pending_writes() needs access to the rdev so...*/ 157162306a36Sopenharmony_ci mbio->bi_bdev = (void *)rdev; 157262306a36Sopenharmony_ci if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { 157362306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 157462306a36Sopenharmony_ci bio_list_add(&conf->pending_bio_list, mbio); 157562306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 157662306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 157762306a36Sopenharmony_ci } 157862306a36Sopenharmony_ci } 157962306a36Sopenharmony_ci 158062306a36Sopenharmony_ci r1_bio_write_done(r1_bio); 158162306a36Sopenharmony_ci 158262306a36Sopenharmony_ci /* In case raid1d snuck in to freeze_array */ 158362306a36Sopenharmony_ci wake_up_barrier(conf); 158462306a36Sopenharmony_ci} 158562306a36Sopenharmony_ci 158662306a36Sopenharmony_cistatic bool raid1_make_request(struct mddev *mddev, struct bio *bio) 158762306a36Sopenharmony_ci{ 158862306a36Sopenharmony_ci sector_t sectors; 158962306a36Sopenharmony_ci 159062306a36Sopenharmony_ci if (unlikely(bio->bi_opf & REQ_PREFLUSH) 159162306a36Sopenharmony_ci && md_flush_request(mddev, bio)) 159262306a36Sopenharmony_ci return true; 159362306a36Sopenharmony_ci 
159462306a36Sopenharmony_ci /* 159562306a36Sopenharmony_ci * There is a limit to the maximum size, but 159662306a36Sopenharmony_ci * the read/write handler might find a lower limit 159762306a36Sopenharmony_ci * due to bad blocks. To avoid multiple splits, 159862306a36Sopenharmony_ci * we pass the maximum number of sectors down 159962306a36Sopenharmony_ci * and let the lower level perform the split. 160062306a36Sopenharmony_ci */ 160162306a36Sopenharmony_ci sectors = align_to_barrier_unit_end( 160262306a36Sopenharmony_ci bio->bi_iter.bi_sector, bio_sectors(bio)); 160362306a36Sopenharmony_ci 160462306a36Sopenharmony_ci if (bio_data_dir(bio) == READ) 160562306a36Sopenharmony_ci raid1_read_request(mddev, bio, sectors, NULL); 160662306a36Sopenharmony_ci else { 160762306a36Sopenharmony_ci if (!md_write_start(mddev,bio)) 160862306a36Sopenharmony_ci return false; 160962306a36Sopenharmony_ci raid1_write_request(mddev, bio, sectors); 161062306a36Sopenharmony_ci } 161162306a36Sopenharmony_ci return true; 161262306a36Sopenharmony_ci} 161362306a36Sopenharmony_ci 161462306a36Sopenharmony_cistatic void raid1_status(struct seq_file *seq, struct mddev *mddev) 161562306a36Sopenharmony_ci{ 161662306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 161762306a36Sopenharmony_ci int i; 161862306a36Sopenharmony_ci 161962306a36Sopenharmony_ci seq_printf(seq, " [%d/%d] [", conf->raid_disks, 162062306a36Sopenharmony_ci conf->raid_disks - mddev->degraded); 162162306a36Sopenharmony_ci rcu_read_lock(); 162262306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) { 162362306a36Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 162462306a36Sopenharmony_ci seq_printf(seq, "%s", 162562306a36Sopenharmony_ci rdev && test_bit(In_sync, &rdev->flags) ? 
"U" : "_"); 162662306a36Sopenharmony_ci } 162762306a36Sopenharmony_ci rcu_read_unlock(); 162862306a36Sopenharmony_ci seq_printf(seq, "]"); 162962306a36Sopenharmony_ci} 163062306a36Sopenharmony_ci 163162306a36Sopenharmony_ci/** 163262306a36Sopenharmony_ci * raid1_error() - RAID1 error handler. 163362306a36Sopenharmony_ci * @mddev: affected md device. 163462306a36Sopenharmony_ci * @rdev: member device to fail. 163562306a36Sopenharmony_ci * 163662306a36Sopenharmony_ci * The routine acknowledges &rdev failure and determines new @mddev state. 163762306a36Sopenharmony_ci * If it failed, then: 163862306a36Sopenharmony_ci * - &MD_BROKEN flag is set in &mddev->flags. 163962306a36Sopenharmony_ci * - recovery is disabled. 164062306a36Sopenharmony_ci * Otherwise, it must be degraded: 164162306a36Sopenharmony_ci * - recovery is interrupted. 164262306a36Sopenharmony_ci * - &mddev->degraded is bumped. 164362306a36Sopenharmony_ci * 164462306a36Sopenharmony_ci * @rdev is marked as &Faulty excluding case when array is failed and 164562306a36Sopenharmony_ci * &mddev->fail_last_dev is off. 
164662306a36Sopenharmony_ci */ 164762306a36Sopenharmony_cistatic void raid1_error(struct mddev *mddev, struct md_rdev *rdev) 164862306a36Sopenharmony_ci{ 164962306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 165062306a36Sopenharmony_ci unsigned long flags; 165162306a36Sopenharmony_ci 165262306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 165362306a36Sopenharmony_ci 165462306a36Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) && 165562306a36Sopenharmony_ci (conf->raid_disks - mddev->degraded) == 1) { 165662306a36Sopenharmony_ci set_bit(MD_BROKEN, &mddev->flags); 165762306a36Sopenharmony_ci 165862306a36Sopenharmony_ci if (!mddev->fail_last_dev) { 165962306a36Sopenharmony_ci conf->recovery_disabled = mddev->recovery_disabled; 166062306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 166162306a36Sopenharmony_ci return; 166262306a36Sopenharmony_ci } 166362306a36Sopenharmony_ci } 166462306a36Sopenharmony_ci set_bit(Blocked, &rdev->flags); 166562306a36Sopenharmony_ci if (test_and_clear_bit(In_sync, &rdev->flags)) 166662306a36Sopenharmony_ci mddev->degraded++; 166762306a36Sopenharmony_ci set_bit(Faulty, &rdev->flags); 166862306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 166962306a36Sopenharmony_ci /* 167062306a36Sopenharmony_ci * if recovery is running, make sure it aborts. 
167162306a36Sopenharmony_ci */ 167262306a36Sopenharmony_ci set_bit(MD_RECOVERY_INTR, &mddev->recovery); 167362306a36Sopenharmony_ci set_mask_bits(&mddev->sb_flags, 0, 167462306a36Sopenharmony_ci BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 167562306a36Sopenharmony_ci pr_crit("md/raid1:%s: Disk failure on %pg, disabling device.\n" 167662306a36Sopenharmony_ci "md/raid1:%s: Operation continuing on %d devices.\n", 167762306a36Sopenharmony_ci mdname(mddev), rdev->bdev, 167862306a36Sopenharmony_ci mdname(mddev), conf->raid_disks - mddev->degraded); 167962306a36Sopenharmony_ci} 168062306a36Sopenharmony_ci 168162306a36Sopenharmony_cistatic void print_conf(struct r1conf *conf) 168262306a36Sopenharmony_ci{ 168362306a36Sopenharmony_ci int i; 168462306a36Sopenharmony_ci 168562306a36Sopenharmony_ci pr_debug("RAID1 conf printout:\n"); 168662306a36Sopenharmony_ci if (!conf) { 168762306a36Sopenharmony_ci pr_debug("(!conf)\n"); 168862306a36Sopenharmony_ci return; 168962306a36Sopenharmony_ci } 169062306a36Sopenharmony_ci pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 169162306a36Sopenharmony_ci conf->raid_disks); 169262306a36Sopenharmony_ci 169362306a36Sopenharmony_ci rcu_read_lock(); 169462306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) { 169562306a36Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 169662306a36Sopenharmony_ci if (rdev) 169762306a36Sopenharmony_ci pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", 169862306a36Sopenharmony_ci i, !test_bit(In_sync, &rdev->flags), 169962306a36Sopenharmony_ci !test_bit(Faulty, &rdev->flags), 170062306a36Sopenharmony_ci rdev->bdev); 170162306a36Sopenharmony_ci } 170262306a36Sopenharmony_ci rcu_read_unlock(); 170362306a36Sopenharmony_ci} 170462306a36Sopenharmony_ci 170562306a36Sopenharmony_cistatic void close_sync(struct r1conf *conf) 170662306a36Sopenharmony_ci{ 170762306a36Sopenharmony_ci int idx; 170862306a36Sopenharmony_ci 170962306a36Sopenharmony_ci for (idx = 0; 
idx < BARRIER_BUCKETS_NR; idx++) { 171062306a36Sopenharmony_ci _wait_barrier(conf, idx, false); 171162306a36Sopenharmony_ci _allow_barrier(conf, idx); 171262306a36Sopenharmony_ci } 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci mempool_exit(&conf->r1buf_pool); 171562306a36Sopenharmony_ci} 171662306a36Sopenharmony_ci 171762306a36Sopenharmony_cistatic int raid1_spare_active(struct mddev *mddev) 171862306a36Sopenharmony_ci{ 171962306a36Sopenharmony_ci int i; 172062306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 172162306a36Sopenharmony_ci int count = 0; 172262306a36Sopenharmony_ci unsigned long flags; 172362306a36Sopenharmony_ci 172462306a36Sopenharmony_ci /* 172562306a36Sopenharmony_ci * Find all failed disks within the RAID1 configuration 172662306a36Sopenharmony_ci * and mark them readable. 172762306a36Sopenharmony_ci * Called under mddev lock, so rcu protection not needed. 172862306a36Sopenharmony_ci * device_lock used to avoid races with raid1_end_read_request 172962306a36Sopenharmony_ci * which expects 'In_sync' flags and ->degraded to be consistent. 
173062306a36Sopenharmony_ci */ 173162306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 173262306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) { 173362306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[i].rdev; 173462306a36Sopenharmony_ci struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; 173562306a36Sopenharmony_ci if (repl 173662306a36Sopenharmony_ci && !test_bit(Candidate, &repl->flags) 173762306a36Sopenharmony_ci && repl->recovery_offset == MaxSector 173862306a36Sopenharmony_ci && !test_bit(Faulty, &repl->flags) 173962306a36Sopenharmony_ci && !test_and_set_bit(In_sync, &repl->flags)) { 174062306a36Sopenharmony_ci /* replacement has just become active */ 174162306a36Sopenharmony_ci if (!rdev || 174262306a36Sopenharmony_ci !test_and_clear_bit(In_sync, &rdev->flags)) 174362306a36Sopenharmony_ci count++; 174462306a36Sopenharmony_ci if (rdev) { 174562306a36Sopenharmony_ci /* Replaced device not technically 174662306a36Sopenharmony_ci * faulty, but we need to be sure 174762306a36Sopenharmony_ci * it gets removed and never re-added 174862306a36Sopenharmony_ci */ 174962306a36Sopenharmony_ci set_bit(Faulty, &rdev->flags); 175062306a36Sopenharmony_ci sysfs_notify_dirent_safe( 175162306a36Sopenharmony_ci rdev->sysfs_state); 175262306a36Sopenharmony_ci } 175362306a36Sopenharmony_ci } 175462306a36Sopenharmony_ci if (rdev 175562306a36Sopenharmony_ci && rdev->recovery_offset == MaxSector 175662306a36Sopenharmony_ci && !test_bit(Faulty, &rdev->flags) 175762306a36Sopenharmony_ci && !test_and_set_bit(In_sync, &rdev->flags)) { 175862306a36Sopenharmony_ci count++; 175962306a36Sopenharmony_ci sysfs_notify_dirent_safe(rdev->sysfs_state); 176062306a36Sopenharmony_ci } 176162306a36Sopenharmony_ci } 176262306a36Sopenharmony_ci mddev->degraded -= count; 176362306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 176462306a36Sopenharmony_ci 176562306a36Sopenharmony_ci print_conf(conf); 176662306a36Sopenharmony_ci 
	return count;
}

/*
 * raid1_add_disk - try to add @rdev to the array as a member or replacement.
 *
 * Preference order: an explicitly requested slot (rdev->raid_disk), then the
 * slot the device previously occupied (rdev->saved_raid_disk) so that bitmap
 * based partial recovery can be used, then the first free slot.  If no slot
 * is free but some member has WantReplacement set, the device is installed
 * as that member's replacement instead.
 *
 * Returns 0 on success, -EBUSY if recovery is disabled for this array,
 * -ENXIO on integrity-profile mismatch, or -EEXIST if no slot was found.
 */
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r1conf *conf = mddev->private;
	int err = -EEXIST;
	int mirror = 0, repl_slot = -1;
	struct raid1_info *p;
	int first = 0;
	int last = conf->raid_disks - 1;

	/* Recovery was disabled after repeated failures - refuse new members */
	if (mddev->recovery_disabled == conf->recovery_disabled)
		return -EBUSY;

	if (md_integrity_add_rdev(rdev, mddev))
		return -ENXIO;

	/* Caller pinned a specific slot - restrict the search to it */
	if (rdev->raid_disk >= 0)
		first = last = rdev->raid_disk;

	/*
	 * find the disk ... but prefer rdev->saved_raid_disk
	 * if possible.
	 */
	if (rdev->saved_raid_disk >= 0 &&
	    rdev->saved_raid_disk >= first &&
	    rdev->saved_raid_disk < conf->raid_disks &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		first = last = rdev->saved_raid_disk;

	for (mirror = first; mirror <= last; mirror++) {
		p = conf->mirrors + mirror;
		if (!p->rdev) {
			if (mddev->gendisk)
				disk_stack_limits(mddev->gendisk, rdev->bdev,
						  rdev->data_offset << 9);

			p->head_position = 0;
			rdev->raid_disk = mirror;
			err = 0;
			/* As all devices are equivalent, we don't need a full recovery
			 * if this was recently any drive of the array
			 */
			if (rdev->saved_raid_disk < 0)
				conf->fullsync = 1;
			/* Publish the new member; readers use RCU */
			rcu_assign_pointer(p->rdev, rdev);
			break;
		}
		/*
		 * Slot occupied: remember the first member that wants a
		 * replacement (replacement slots live at raid_disks offset).
		 */
		if (test_bit(WantReplacement, &p->rdev->flags) &&
		    p[conf->raid_disks].rdev == NULL && repl_slot < 0)
			repl_slot = mirror;
	}

	if (err && repl_slot >= 0) {
		/* Add this device as a replacement */
		p = conf->mirrors + repl_slot;
		clear_bit(In_sync, &rdev->flags);
		set_bit(Replacement, &rdev->flags);
		rdev->raid_disk = repl_slot;
		err = 0;
		conf->fullsync = 1;
		rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
	}

	print_conf(conf);
	return err;
}

/*
 * raid1_remove_disk - detach @rdev (member or replacement) from the array.
 *
 * A non-faulty member may only be removed when recovery is impossible or the
 * array is already fully degraded.  If the removed member had a replacement,
 * the replacement is promoted into the vacated slot under a frozen array so
 * no I/O observes the swap.  Returns 0 on success or -EBUSY if the device is
 * still in use.
 */
static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct r1conf *conf = mddev->private;
	int err = 0;
	int number = rdev->raid_disk;
	struct raid1_info *p = conf->mirrors + number;

	if (unlikely(number >= conf->raid_disks))
		goto abort;

	/* Not the member at this slot - must be its replacement */
	if (rdev != p->rdev)
		p = conf->mirrors + conf->raid_disks + number;

	print_conf(conf);
	if (rdev == p->rdev) {
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
			err = -EBUSY;
			goto abort;
		}
		/* Only remove non-faulty devices if recovery
		 * is not possible.
		 */
		if (!test_bit(Faulty, &rdev->flags) &&
		    mddev->recovery_disabled != conf->recovery_disabled &&
		    mddev->degraded < conf->raid_disks) {
			err = -EBUSY;
			goto abort;
		}
		p->rdev = NULL;
		if (!test_bit(RemoveSynchronized, &rdev->flags)) {
			/* Wait for RCU readers that may still see the old
			 * pointer, then re-check for late references.
			 */
			synchronize_rcu();
			if (atomic_read(&rdev->nr_pending)) {
				/* lost the race, try later */
				err = -EBUSY;
				p->rdev = rdev;
				goto abort;
			}
		}
		if (conf->mirrors[conf->raid_disks + number].rdev) {
			/* We just removed a device that is being replaced.
			 * Move down the replacement.  We drain all IO before
			 * doing this to avoid confusion.
			 */
			struct md_rdev *repl =
				conf->mirrors[conf->raid_disks + number].rdev;
			freeze_array(conf, 0);
			if (atomic_read(&repl->nr_pending)) {
				/* It means that some queued IO of retry_list
				 * hold repl. Thus, we cannot set replacement
				 * as NULL, avoiding rdev NULL pointer
				 * dereference in sync_request_write and
				 * handle_write_finished.
				 */
				err = -EBUSY;
				unfreeze_array(conf);
				goto abort;
			}
			clear_bit(Replacement, &repl->flags);
			p->rdev = repl;
			conf->mirrors[conf->raid_disks + number].rdev = NULL;
			unfreeze_array(conf);
		}

		clear_bit(WantReplacement, &rdev->flags);
		err = md_integrity_register(mddev);
	}
abort:

	print_conf(conf);
	return err;
}

/*
 * end_sync_read - bio completion for a resync read.
 *
 * Marks the r1_bio up-to-date on success and, once all outstanding reads
 * for it have finished, hands it to raid1d for the write-back (or re-read)
 * phase.
 */
static void end_sync_read(struct bio *bio)
{
	struct r1bio *r1_bio = get_resync_r1bio(bio);

	update_head_pos(r1_bio->read_disk, r1_bio);

	/*
	 * we have read a block, now it needs to be re-written,
	 * or re-read if the read failed.
	 * We don't do much here, just schedule handling by raid1d
	 */
	if (!bio->bi_status)
		set_bit(R1BIO_Uptodate, &r1_bio->state);

	if (atomic_dec_and_test(&r1_bio->remaining))
		reschedule_retry(r1_bio);
}

/*
 * abort_sync_write - a resync write failed; walk the affected range and
 * tell the bitmap the sync did not complete so the dirty bits survive.
 */
static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
{
	sector_t sync_blocks = 0;
	sector_t s = r1_bio->sector;
	long sectors_to_go = r1_bio->sectors;

	/* make sure these bits don't get cleared. */
	do {
		/* md_bitmap_end_sync reports the chunk size via sync_blocks */
		md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
		s += sync_blocks;
		sectors_to_go -= sync_blocks;
	} while (sectors_to_go > 0);
}

/*
 * put_sync_write_buf - drop one reference on a resync r1_bio.
 *
 * When the last reference goes away, either requeue the r1_bio for raid1d
 * (badblock bookkeeping still pending) or free it and report the sync'ed
 * range to md via md_done_sync().
 */
static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
{
	if (atomic_dec_and_test(&r1_bio->remaining)) {
		struct mddev *mddev = r1_bio->mddev;
		int s = r1_bio->sectors;

		if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
		    test_bit(R1BIO_WriteError, &r1_bio->state))
			reschedule_retry(r1_bio);
		else {
			put_buf(r1_bio);
			md_done_sync(mddev, s, uptodate);
		}
	}
}

195662306a36Sopenharmony_cistatic void end_sync_write(struct bio *bio) 195762306a36Sopenharmony_ci{ 195862306a36Sopenharmony_ci int uptodate = !bio->bi_status; 195962306a36Sopenharmony_ci struct r1bio *r1_bio = get_resync_r1bio(bio); 196062306a36Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 196162306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 196262306a36Sopenharmony_ci sector_t first_bad; 196362306a36Sopenharmony_ci int bad_sectors; 196462306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; 196562306a36Sopenharmony_ci 196662306a36Sopenharmony_ci if (!uptodate) { 196762306a36Sopenharmony_ci abort_sync_write(mddev, r1_bio); 196862306a36Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 196962306a36Sopenharmony_ci if (!test_and_set_bit(WantReplacement, &rdev->flags)) 197062306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, & 197162306a36Sopenharmony_ci mddev->recovery); 197262306a36Sopenharmony_ci set_bit(R1BIO_WriteError, &r1_bio->state); 197362306a36Sopenharmony_ci } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, 197462306a36Sopenharmony_ci &first_bad, &bad_sectors) && 197562306a36Sopenharmony_ci !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, 197662306a36Sopenharmony_ci r1_bio->sector, 197762306a36Sopenharmony_ci r1_bio->sectors, 197862306a36Sopenharmony_ci &first_bad, &bad_sectors) 197962306a36Sopenharmony_ci ) 198062306a36Sopenharmony_ci set_bit(R1BIO_MadeGood, &r1_bio->state); 198162306a36Sopenharmony_ci 198262306a36Sopenharmony_ci put_sync_write_buf(r1_bio, uptodate); 198362306a36Sopenharmony_ci} 198462306a36Sopenharmony_ci 198562306a36Sopenharmony_cistatic int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, 198662306a36Sopenharmony_ci int sectors, struct page *page, blk_opf_t rw) 198762306a36Sopenharmony_ci{ 198862306a36Sopenharmony_ci if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 198962306a36Sopenharmony_ci /* success */ 199062306a36Sopenharmony_ci return 
1; 199162306a36Sopenharmony_ci if (rw == REQ_OP_WRITE) { 199262306a36Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 199362306a36Sopenharmony_ci if (!test_and_set_bit(WantReplacement, 199462306a36Sopenharmony_ci &rdev->flags)) 199562306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, & 199662306a36Sopenharmony_ci rdev->mddev->recovery); 199762306a36Sopenharmony_ci } 199862306a36Sopenharmony_ci /* need to record an error - either for the block or the device */ 199962306a36Sopenharmony_ci if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 200062306a36Sopenharmony_ci md_error(rdev->mddev, rdev); 200162306a36Sopenharmony_ci return 0; 200262306a36Sopenharmony_ci} 200362306a36Sopenharmony_ci 200462306a36Sopenharmony_cistatic int fix_sync_read_error(struct r1bio *r1_bio) 200562306a36Sopenharmony_ci{ 200662306a36Sopenharmony_ci /* Try some synchronous reads of other devices to get 200762306a36Sopenharmony_ci * good data, much like with normal read errors. Only 200862306a36Sopenharmony_ci * read into the pages we already have so we don't 200962306a36Sopenharmony_ci * need to re-issue the read request. 201062306a36Sopenharmony_ci * We don't need to freeze the array, because being in an 201162306a36Sopenharmony_ci * active sync request, there is no normal IO, and 201262306a36Sopenharmony_ci * no overlapping syncs. 201362306a36Sopenharmony_ci * We don't need to check is_badblock() again as we 201462306a36Sopenharmony_ci * made sure that anything with a bad block in range 201562306a36Sopenharmony_ci * will have bi_end_io clear. 
201662306a36Sopenharmony_ci */ 201762306a36Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 201862306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 201962306a36Sopenharmony_ci struct bio *bio = r1_bio->bios[r1_bio->read_disk]; 202062306a36Sopenharmony_ci struct page **pages = get_resync_pages(bio)->pages; 202162306a36Sopenharmony_ci sector_t sect = r1_bio->sector; 202262306a36Sopenharmony_ci int sectors = r1_bio->sectors; 202362306a36Sopenharmony_ci int idx = 0; 202462306a36Sopenharmony_ci struct md_rdev *rdev; 202562306a36Sopenharmony_ci 202662306a36Sopenharmony_ci rdev = conf->mirrors[r1_bio->read_disk].rdev; 202762306a36Sopenharmony_ci if (test_bit(FailFast, &rdev->flags)) { 202862306a36Sopenharmony_ci /* Don't try recovering from here - just fail it 202962306a36Sopenharmony_ci * ... unless it is the last working device of course */ 203062306a36Sopenharmony_ci md_error(mddev, rdev); 203162306a36Sopenharmony_ci if (test_bit(Faulty, &rdev->flags)) 203262306a36Sopenharmony_ci /* Don't try to read from here, but make sure 203362306a36Sopenharmony_ci * put_buf does it's thing 203462306a36Sopenharmony_ci */ 203562306a36Sopenharmony_ci bio->bi_end_io = end_sync_write; 203662306a36Sopenharmony_ci } 203762306a36Sopenharmony_ci 203862306a36Sopenharmony_ci while(sectors) { 203962306a36Sopenharmony_ci int s = sectors; 204062306a36Sopenharmony_ci int d = r1_bio->read_disk; 204162306a36Sopenharmony_ci int success = 0; 204262306a36Sopenharmony_ci int start; 204362306a36Sopenharmony_ci 204462306a36Sopenharmony_ci if (s > (PAGE_SIZE>>9)) 204562306a36Sopenharmony_ci s = PAGE_SIZE >> 9; 204662306a36Sopenharmony_ci do { 204762306a36Sopenharmony_ci if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 204862306a36Sopenharmony_ci /* No rcu protection needed here devices 204962306a36Sopenharmony_ci * can only be removed when no resync is 205062306a36Sopenharmony_ci * active, and resync is currently active 205162306a36Sopenharmony_ci */ 205262306a36Sopenharmony_ci rdev = 
conf->mirrors[d].rdev; 205362306a36Sopenharmony_ci if (sync_page_io(rdev, sect, s<<9, 205462306a36Sopenharmony_ci pages[idx], 205562306a36Sopenharmony_ci REQ_OP_READ, false)) { 205662306a36Sopenharmony_ci success = 1; 205762306a36Sopenharmony_ci break; 205862306a36Sopenharmony_ci } 205962306a36Sopenharmony_ci } 206062306a36Sopenharmony_ci d++; 206162306a36Sopenharmony_ci if (d == conf->raid_disks * 2) 206262306a36Sopenharmony_ci d = 0; 206362306a36Sopenharmony_ci } while (!success && d != r1_bio->read_disk); 206462306a36Sopenharmony_ci 206562306a36Sopenharmony_ci if (!success) { 206662306a36Sopenharmony_ci int abort = 0; 206762306a36Sopenharmony_ci /* Cannot read from anywhere, this block is lost. 206862306a36Sopenharmony_ci * Record a bad block on each device. If that doesn't 206962306a36Sopenharmony_ci * work just disable and interrupt the recovery. 207062306a36Sopenharmony_ci * Don't fail devices as that won't really help. 207162306a36Sopenharmony_ci */ 207262306a36Sopenharmony_ci pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", 207362306a36Sopenharmony_ci mdname(mddev), bio->bi_bdev, 207462306a36Sopenharmony_ci (unsigned long long)r1_bio->sector); 207562306a36Sopenharmony_ci for (d = 0; d < conf->raid_disks * 2; d++) { 207662306a36Sopenharmony_ci rdev = conf->mirrors[d].rdev; 207762306a36Sopenharmony_ci if (!rdev || test_bit(Faulty, &rdev->flags)) 207862306a36Sopenharmony_ci continue; 207962306a36Sopenharmony_ci if (!rdev_set_badblocks(rdev, sect, s, 0)) 208062306a36Sopenharmony_ci abort = 1; 208162306a36Sopenharmony_ci } 208262306a36Sopenharmony_ci if (abort) { 208362306a36Sopenharmony_ci conf->recovery_disabled = 208462306a36Sopenharmony_ci mddev->recovery_disabled; 208562306a36Sopenharmony_ci set_bit(MD_RECOVERY_INTR, &mddev->recovery); 208662306a36Sopenharmony_ci md_done_sync(mddev, r1_bio->sectors, 0); 208762306a36Sopenharmony_ci put_buf(r1_bio); 208862306a36Sopenharmony_ci return 0; 208962306a36Sopenharmony_ci } 
209062306a36Sopenharmony_ci /* Try next page */ 209162306a36Sopenharmony_ci sectors -= s; 209262306a36Sopenharmony_ci sect += s; 209362306a36Sopenharmony_ci idx++; 209462306a36Sopenharmony_ci continue; 209562306a36Sopenharmony_ci } 209662306a36Sopenharmony_ci 209762306a36Sopenharmony_ci start = d; 209862306a36Sopenharmony_ci /* write it back and re-read */ 209962306a36Sopenharmony_ci while (d != r1_bio->read_disk) { 210062306a36Sopenharmony_ci if (d == 0) 210162306a36Sopenharmony_ci d = conf->raid_disks * 2; 210262306a36Sopenharmony_ci d--; 210362306a36Sopenharmony_ci if (r1_bio->bios[d]->bi_end_io != end_sync_read) 210462306a36Sopenharmony_ci continue; 210562306a36Sopenharmony_ci rdev = conf->mirrors[d].rdev; 210662306a36Sopenharmony_ci if (r1_sync_page_io(rdev, sect, s, 210762306a36Sopenharmony_ci pages[idx], 210862306a36Sopenharmony_ci REQ_OP_WRITE) == 0) { 210962306a36Sopenharmony_ci r1_bio->bios[d]->bi_end_io = NULL; 211062306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 211162306a36Sopenharmony_ci } 211262306a36Sopenharmony_ci } 211362306a36Sopenharmony_ci d = start; 211462306a36Sopenharmony_ci while (d != r1_bio->read_disk) { 211562306a36Sopenharmony_ci if (d == 0) 211662306a36Sopenharmony_ci d = conf->raid_disks * 2; 211762306a36Sopenharmony_ci d--; 211862306a36Sopenharmony_ci if (r1_bio->bios[d]->bi_end_io != end_sync_read) 211962306a36Sopenharmony_ci continue; 212062306a36Sopenharmony_ci rdev = conf->mirrors[d].rdev; 212162306a36Sopenharmony_ci if (r1_sync_page_io(rdev, sect, s, 212262306a36Sopenharmony_ci pages[idx], 212362306a36Sopenharmony_ci REQ_OP_READ) != 0) 212462306a36Sopenharmony_ci atomic_add(s, &rdev->corrected_errors); 212562306a36Sopenharmony_ci } 212662306a36Sopenharmony_ci sectors -= s; 212762306a36Sopenharmony_ci sect += s; 212862306a36Sopenharmony_ci idx ++; 212962306a36Sopenharmony_ci } 213062306a36Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 213162306a36Sopenharmony_ci bio->bi_status = 0; 213262306a36Sopenharmony_ci 
return 1; 213362306a36Sopenharmony_ci} 213462306a36Sopenharmony_ci 213562306a36Sopenharmony_cistatic void process_checks(struct r1bio *r1_bio) 213662306a36Sopenharmony_ci{ 213762306a36Sopenharmony_ci /* We have read all readable devices. If we haven't 213862306a36Sopenharmony_ci * got the block, then there is no hope left. 213962306a36Sopenharmony_ci * If we have, then we want to do a comparison 214062306a36Sopenharmony_ci * and skip the write if everything is the same. 214162306a36Sopenharmony_ci * If any blocks failed to read, then we need to 214262306a36Sopenharmony_ci * attempt an over-write 214362306a36Sopenharmony_ci */ 214462306a36Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 214562306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 214662306a36Sopenharmony_ci int primary; 214762306a36Sopenharmony_ci int i; 214862306a36Sopenharmony_ci int vcnt; 214962306a36Sopenharmony_ci 215062306a36Sopenharmony_ci /* Fix variable parts of all bios */ 215162306a36Sopenharmony_ci vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); 215262306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 215362306a36Sopenharmony_ci blk_status_t status; 215462306a36Sopenharmony_ci struct bio *b = r1_bio->bios[i]; 215562306a36Sopenharmony_ci struct resync_pages *rp = get_resync_pages(b); 215662306a36Sopenharmony_ci if (b->bi_end_io != end_sync_read) 215762306a36Sopenharmony_ci continue; 215862306a36Sopenharmony_ci /* fixup the bio for reuse, but preserve errno */ 215962306a36Sopenharmony_ci status = b->bi_status; 216062306a36Sopenharmony_ci bio_reset(b, conf->mirrors[i].rdev->bdev, REQ_OP_READ); 216162306a36Sopenharmony_ci b->bi_status = status; 216262306a36Sopenharmony_ci b->bi_iter.bi_sector = r1_bio->sector + 216362306a36Sopenharmony_ci conf->mirrors[i].rdev->data_offset; 216462306a36Sopenharmony_ci b->bi_end_io = end_sync_read; 216562306a36Sopenharmony_ci rp->raid_bio = r1_bio; 216662306a36Sopenharmony_ci b->bi_private = rp; 
216762306a36Sopenharmony_ci 216862306a36Sopenharmony_ci /* initialize bvec table again */ 216962306a36Sopenharmony_ci md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9); 217062306a36Sopenharmony_ci } 217162306a36Sopenharmony_ci for (primary = 0; primary < conf->raid_disks * 2; primary++) 217262306a36Sopenharmony_ci if (r1_bio->bios[primary]->bi_end_io == end_sync_read && 217362306a36Sopenharmony_ci !r1_bio->bios[primary]->bi_status) { 217462306a36Sopenharmony_ci r1_bio->bios[primary]->bi_end_io = NULL; 217562306a36Sopenharmony_ci rdev_dec_pending(conf->mirrors[primary].rdev, mddev); 217662306a36Sopenharmony_ci break; 217762306a36Sopenharmony_ci } 217862306a36Sopenharmony_ci r1_bio->read_disk = primary; 217962306a36Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 218062306a36Sopenharmony_ci int j = 0; 218162306a36Sopenharmony_ci struct bio *pbio = r1_bio->bios[primary]; 218262306a36Sopenharmony_ci struct bio *sbio = r1_bio->bios[i]; 218362306a36Sopenharmony_ci blk_status_t status = sbio->bi_status; 218462306a36Sopenharmony_ci struct page **ppages = get_resync_pages(pbio)->pages; 218562306a36Sopenharmony_ci struct page **spages = get_resync_pages(sbio)->pages; 218662306a36Sopenharmony_ci struct bio_vec *bi; 218762306a36Sopenharmony_ci int page_len[RESYNC_PAGES] = { 0 }; 218862306a36Sopenharmony_ci struct bvec_iter_all iter_all; 218962306a36Sopenharmony_ci 219062306a36Sopenharmony_ci if (sbio->bi_end_io != end_sync_read) 219162306a36Sopenharmony_ci continue; 219262306a36Sopenharmony_ci /* Now we can 'fixup' the error value */ 219362306a36Sopenharmony_ci sbio->bi_status = 0; 219462306a36Sopenharmony_ci 219562306a36Sopenharmony_ci bio_for_each_segment_all(bi, sbio, iter_all) 219662306a36Sopenharmony_ci page_len[j++] = bi->bv_len; 219762306a36Sopenharmony_ci 219862306a36Sopenharmony_ci if (!status) { 219962306a36Sopenharmony_ci for (j = vcnt; j-- ; ) { 220062306a36Sopenharmony_ci if (memcmp(page_address(ppages[j]), 220162306a36Sopenharmony_ci 
page_address(spages[j]), 220262306a36Sopenharmony_ci page_len[j])) 220362306a36Sopenharmony_ci break; 220462306a36Sopenharmony_ci } 220562306a36Sopenharmony_ci } else 220662306a36Sopenharmony_ci j = 0; 220762306a36Sopenharmony_ci if (j >= 0) 220862306a36Sopenharmony_ci atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 220962306a36Sopenharmony_ci if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 221062306a36Sopenharmony_ci && !status)) { 221162306a36Sopenharmony_ci /* No need to write to this device. */ 221262306a36Sopenharmony_ci sbio->bi_end_io = NULL; 221362306a36Sopenharmony_ci rdev_dec_pending(conf->mirrors[i].rdev, mddev); 221462306a36Sopenharmony_ci continue; 221562306a36Sopenharmony_ci } 221662306a36Sopenharmony_ci 221762306a36Sopenharmony_ci bio_copy_data(sbio, pbio); 221862306a36Sopenharmony_ci } 221962306a36Sopenharmony_ci} 222062306a36Sopenharmony_ci 222162306a36Sopenharmony_cistatic void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) 222262306a36Sopenharmony_ci{ 222362306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 222462306a36Sopenharmony_ci int i; 222562306a36Sopenharmony_ci int disks = conf->raid_disks * 2; 222662306a36Sopenharmony_ci struct bio *wbio; 222762306a36Sopenharmony_ci 222862306a36Sopenharmony_ci if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) 222962306a36Sopenharmony_ci /* ouch - failed to read all of that. 
*/ 223062306a36Sopenharmony_ci if (!fix_sync_read_error(r1_bio)) 223162306a36Sopenharmony_ci return; 223262306a36Sopenharmony_ci 223362306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 223462306a36Sopenharmony_ci process_checks(r1_bio); 223562306a36Sopenharmony_ci 223662306a36Sopenharmony_ci /* 223762306a36Sopenharmony_ci * schedule writes 223862306a36Sopenharmony_ci */ 223962306a36Sopenharmony_ci atomic_set(&r1_bio->remaining, 1); 224062306a36Sopenharmony_ci for (i = 0; i < disks ; i++) { 224162306a36Sopenharmony_ci wbio = r1_bio->bios[i]; 224262306a36Sopenharmony_ci if (wbio->bi_end_io == NULL || 224362306a36Sopenharmony_ci (wbio->bi_end_io == end_sync_read && 224462306a36Sopenharmony_ci (i == r1_bio->read_disk || 224562306a36Sopenharmony_ci !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) 224662306a36Sopenharmony_ci continue; 224762306a36Sopenharmony_ci if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { 224862306a36Sopenharmony_ci abort_sync_write(mddev, r1_bio); 224962306a36Sopenharmony_ci continue; 225062306a36Sopenharmony_ci } 225162306a36Sopenharmony_ci 225262306a36Sopenharmony_ci wbio->bi_opf = REQ_OP_WRITE; 225362306a36Sopenharmony_ci if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) 225462306a36Sopenharmony_ci wbio->bi_opf |= MD_FAILFAST; 225562306a36Sopenharmony_ci 225662306a36Sopenharmony_ci wbio->bi_end_io = end_sync_write; 225762306a36Sopenharmony_ci atomic_inc(&r1_bio->remaining); 225862306a36Sopenharmony_ci md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); 225962306a36Sopenharmony_ci 226062306a36Sopenharmony_ci submit_bio_noacct(wbio); 226162306a36Sopenharmony_ci } 226262306a36Sopenharmony_ci 226362306a36Sopenharmony_ci put_sync_write_buf(r1_bio, 1); 226462306a36Sopenharmony_ci} 226562306a36Sopenharmony_ci 226662306a36Sopenharmony_ci/* 226762306a36Sopenharmony_ci * This is a kernel thread which: 226862306a36Sopenharmony_ci * 226962306a36Sopenharmony_ci * 1. 
Retries failed read operations on working mirrors. 227062306a36Sopenharmony_ci * 2. Updates the raid superblock when problems encounter. 227162306a36Sopenharmony_ci * 3. Performs writes following reads for array synchronising. 227262306a36Sopenharmony_ci */ 227362306a36Sopenharmony_ci 227462306a36Sopenharmony_cistatic void fix_read_error(struct r1conf *conf, int read_disk, 227562306a36Sopenharmony_ci sector_t sect, int sectors) 227662306a36Sopenharmony_ci{ 227762306a36Sopenharmony_ci struct mddev *mddev = conf->mddev; 227862306a36Sopenharmony_ci while(sectors) { 227962306a36Sopenharmony_ci int s = sectors; 228062306a36Sopenharmony_ci int d = read_disk; 228162306a36Sopenharmony_ci int success = 0; 228262306a36Sopenharmony_ci int start; 228362306a36Sopenharmony_ci struct md_rdev *rdev; 228462306a36Sopenharmony_ci 228562306a36Sopenharmony_ci if (s > (PAGE_SIZE>>9)) 228662306a36Sopenharmony_ci s = PAGE_SIZE >> 9; 228762306a36Sopenharmony_ci 228862306a36Sopenharmony_ci do { 228962306a36Sopenharmony_ci sector_t first_bad; 229062306a36Sopenharmony_ci int bad_sectors; 229162306a36Sopenharmony_ci 229262306a36Sopenharmony_ci rcu_read_lock(); 229362306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 229462306a36Sopenharmony_ci if (rdev && 229562306a36Sopenharmony_ci (test_bit(In_sync, &rdev->flags) || 229662306a36Sopenharmony_ci (!test_bit(Faulty, &rdev->flags) && 229762306a36Sopenharmony_ci rdev->recovery_offset >= sect + s)) && 229862306a36Sopenharmony_ci is_badblock(rdev, sect, s, 229962306a36Sopenharmony_ci &first_bad, &bad_sectors) == 0) { 230062306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 230162306a36Sopenharmony_ci rcu_read_unlock(); 230262306a36Sopenharmony_ci if (sync_page_io(rdev, sect, s<<9, 230362306a36Sopenharmony_ci conf->tmppage, REQ_OP_READ, false)) 230462306a36Sopenharmony_ci success = 1; 230562306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 230662306a36Sopenharmony_ci if (success) 230762306a36Sopenharmony_ci break; 
230862306a36Sopenharmony_ci } else 230962306a36Sopenharmony_ci rcu_read_unlock(); 231062306a36Sopenharmony_ci d++; 231162306a36Sopenharmony_ci if (d == conf->raid_disks * 2) 231262306a36Sopenharmony_ci d = 0; 231362306a36Sopenharmony_ci } while (d != read_disk); 231462306a36Sopenharmony_ci 231562306a36Sopenharmony_ci if (!success) { 231662306a36Sopenharmony_ci /* Cannot read from anywhere - mark it bad */ 231762306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[read_disk].rdev; 231862306a36Sopenharmony_ci if (!rdev_set_badblocks(rdev, sect, s, 0)) 231962306a36Sopenharmony_ci md_error(mddev, rdev); 232062306a36Sopenharmony_ci break; 232162306a36Sopenharmony_ci } 232262306a36Sopenharmony_ci /* write it back and re-read */ 232362306a36Sopenharmony_ci start = d; 232462306a36Sopenharmony_ci while (d != read_disk) { 232562306a36Sopenharmony_ci if (d==0) 232662306a36Sopenharmony_ci d = conf->raid_disks * 2; 232762306a36Sopenharmony_ci d--; 232862306a36Sopenharmony_ci rcu_read_lock(); 232962306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 233062306a36Sopenharmony_ci if (rdev && 233162306a36Sopenharmony_ci !test_bit(Faulty, &rdev->flags)) { 233262306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 233362306a36Sopenharmony_ci rcu_read_unlock(); 233462306a36Sopenharmony_ci r1_sync_page_io(rdev, sect, s, 233562306a36Sopenharmony_ci conf->tmppage, REQ_OP_WRITE); 233662306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 233762306a36Sopenharmony_ci } else 233862306a36Sopenharmony_ci rcu_read_unlock(); 233962306a36Sopenharmony_ci } 234062306a36Sopenharmony_ci d = start; 234162306a36Sopenharmony_ci while (d != read_disk) { 234262306a36Sopenharmony_ci if (d==0) 234362306a36Sopenharmony_ci d = conf->raid_disks * 2; 234462306a36Sopenharmony_ci d--; 234562306a36Sopenharmony_ci rcu_read_lock(); 234662306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 234762306a36Sopenharmony_ci if (rdev && 234862306a36Sopenharmony_ci !test_bit(Faulty, 
&rdev->flags)) { 234962306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 235062306a36Sopenharmony_ci rcu_read_unlock(); 235162306a36Sopenharmony_ci if (r1_sync_page_io(rdev, sect, s, 235262306a36Sopenharmony_ci conf->tmppage, REQ_OP_READ)) { 235362306a36Sopenharmony_ci atomic_add(s, &rdev->corrected_errors); 235462306a36Sopenharmony_ci pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n", 235562306a36Sopenharmony_ci mdname(mddev), s, 235662306a36Sopenharmony_ci (unsigned long long)(sect + 235762306a36Sopenharmony_ci rdev->data_offset), 235862306a36Sopenharmony_ci rdev->bdev); 235962306a36Sopenharmony_ci } 236062306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 236162306a36Sopenharmony_ci } else 236262306a36Sopenharmony_ci rcu_read_unlock(); 236362306a36Sopenharmony_ci } 236462306a36Sopenharmony_ci sectors -= s; 236562306a36Sopenharmony_ci sect += s; 236662306a36Sopenharmony_ci } 236762306a36Sopenharmony_ci} 236862306a36Sopenharmony_ci 236962306a36Sopenharmony_cistatic int narrow_write_error(struct r1bio *r1_bio, int i) 237062306a36Sopenharmony_ci{ 237162306a36Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 237262306a36Sopenharmony_ci struct r1conf *conf = mddev->private; 237362306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[i].rdev; 237462306a36Sopenharmony_ci 237562306a36Sopenharmony_ci /* bio has the data to be written to device 'i' where 237662306a36Sopenharmony_ci * we just recently had a write error. 237762306a36Sopenharmony_ci * We repeatedly clone the bio and trim down to one block, 237862306a36Sopenharmony_ci * then try the write. Where the write fails we record 237962306a36Sopenharmony_ci * a bad block. 238062306a36Sopenharmony_ci * It is conceivable that the bio doesn't exactly align with 238162306a36Sopenharmony_ci * blocks. We must handle this somehow. 238262306a36Sopenharmony_ci * 238362306a36Sopenharmony_ci * We currently own a reference on the rdev. 
238462306a36Sopenharmony_ci */ 238562306a36Sopenharmony_ci 238662306a36Sopenharmony_ci int block_sectors; 238762306a36Sopenharmony_ci sector_t sector; 238862306a36Sopenharmony_ci int sectors; 238962306a36Sopenharmony_ci int sect_to_write = r1_bio->sectors; 239062306a36Sopenharmony_ci int ok = 1; 239162306a36Sopenharmony_ci 239262306a36Sopenharmony_ci if (rdev->badblocks.shift < 0) 239362306a36Sopenharmony_ci return 0; 239462306a36Sopenharmony_ci 239562306a36Sopenharmony_ci block_sectors = roundup(1 << rdev->badblocks.shift, 239662306a36Sopenharmony_ci bdev_logical_block_size(rdev->bdev) >> 9); 239762306a36Sopenharmony_ci sector = r1_bio->sector; 239862306a36Sopenharmony_ci sectors = ((sector + block_sectors) 239962306a36Sopenharmony_ci & ~(sector_t)(block_sectors - 1)) 240062306a36Sopenharmony_ci - sector; 240162306a36Sopenharmony_ci 240262306a36Sopenharmony_ci while (sect_to_write) { 240362306a36Sopenharmony_ci struct bio *wbio; 240462306a36Sopenharmony_ci if (sectors > sect_to_write) 240562306a36Sopenharmony_ci sectors = sect_to_write; 240662306a36Sopenharmony_ci /* Write at 'sector' for 'sectors'*/ 240762306a36Sopenharmony_ci 240862306a36Sopenharmony_ci if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 240962306a36Sopenharmony_ci wbio = bio_alloc_clone(rdev->bdev, 241062306a36Sopenharmony_ci r1_bio->behind_master_bio, 241162306a36Sopenharmony_ci GFP_NOIO, &mddev->bio_set); 241262306a36Sopenharmony_ci } else { 241362306a36Sopenharmony_ci wbio = bio_alloc_clone(rdev->bdev, r1_bio->master_bio, 241462306a36Sopenharmony_ci GFP_NOIO, &mddev->bio_set); 241562306a36Sopenharmony_ci } 241662306a36Sopenharmony_ci 241762306a36Sopenharmony_ci wbio->bi_opf = REQ_OP_WRITE; 241862306a36Sopenharmony_ci wbio->bi_iter.bi_sector = r1_bio->sector; 241962306a36Sopenharmony_ci wbio->bi_iter.bi_size = r1_bio->sectors << 9; 242062306a36Sopenharmony_ci 242162306a36Sopenharmony_ci bio_trim(wbio, sector - r1_bio->sector, sectors); 242262306a36Sopenharmony_ci wbio->bi_iter.bi_sector += 
rdev->data_offset; 242362306a36Sopenharmony_ci 242462306a36Sopenharmony_ci if (submit_bio_wait(wbio) < 0) 242562306a36Sopenharmony_ci /* failure! */ 242662306a36Sopenharmony_ci ok = rdev_set_badblocks(rdev, sector, 242762306a36Sopenharmony_ci sectors, 0) 242862306a36Sopenharmony_ci && ok; 242962306a36Sopenharmony_ci 243062306a36Sopenharmony_ci bio_put(wbio); 243162306a36Sopenharmony_ci sect_to_write -= sectors; 243262306a36Sopenharmony_ci sector += sectors; 243362306a36Sopenharmony_ci sectors = block_sectors; 243462306a36Sopenharmony_ci } 243562306a36Sopenharmony_ci return ok; 243662306a36Sopenharmony_ci} 243762306a36Sopenharmony_ci 243862306a36Sopenharmony_cistatic void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 243962306a36Sopenharmony_ci{ 244062306a36Sopenharmony_ci int m; 244162306a36Sopenharmony_ci int s = r1_bio->sectors; 244262306a36Sopenharmony_ci for (m = 0; m < conf->raid_disks * 2 ; m++) { 244362306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[m].rdev; 244462306a36Sopenharmony_ci struct bio *bio = r1_bio->bios[m]; 244562306a36Sopenharmony_ci if (bio->bi_end_io == NULL) 244662306a36Sopenharmony_ci continue; 244762306a36Sopenharmony_ci if (!bio->bi_status && 244862306a36Sopenharmony_ci test_bit(R1BIO_MadeGood, &r1_bio->state)) { 244962306a36Sopenharmony_ci rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); 245062306a36Sopenharmony_ci } 245162306a36Sopenharmony_ci if (bio->bi_status && 245262306a36Sopenharmony_ci test_bit(R1BIO_WriteError, &r1_bio->state)) { 245362306a36Sopenharmony_ci if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) 245462306a36Sopenharmony_ci md_error(conf->mddev, rdev); 245562306a36Sopenharmony_ci } 245662306a36Sopenharmony_ci } 245762306a36Sopenharmony_ci put_buf(r1_bio); 245862306a36Sopenharmony_ci md_done_sync(conf->mddev, s, 1); 245962306a36Sopenharmony_ci} 246062306a36Sopenharmony_ci 246162306a36Sopenharmony_cistatic void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 
{
	int m, idx;
	bool fail = false;

	for (m = 0; m < conf->raid_disks * 2 ; m++)
		if (r1_bio->bios[m] == IO_MADE_GOOD) {
			struct md_rdev *rdev = conf->mirrors[m].rdev;
			/* Write succeeded over a recorded bad range: clear it. */
			rdev_clear_badblocks(rdev,
					     r1_bio->sector,
					     r1_bio->sectors, 0);
			rdev_dec_pending(rdev, conf->mddev);
		} else if (r1_bio->bios[m] != NULL) {
			/* This drive got a write error. We need to
			 * narrow down and record precise write
			 * errors.
			 */
			fail = true;
			if (!narrow_write_error(r1_bio, m)) {
				md_error(conf->mddev,
					 conf->mirrors[m].rdev);
				/* an I/O failed, we can't clear the bitmap */
				set_bit(R1BIO_Degraded, &r1_bio->state);
			}
			rdev_dec_pending(conf->mirrors[m].rdev,
					 conf->mddev);
		}
	if (fail) {
		/*
		 * Defer completion to raid1d's bio_end_io_list processing,
		 * which only runs once no superblock update is pending.
		 */
		spin_lock_irq(&conf->device_lock);
		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
		idx = sector_to_idx(r1_bio->sector);
		atomic_inc(&conf->nr_queued[idx]);
		spin_unlock_irq(&conf->device_lock);
		/*
		 * In case freeze_array() is waiting for condition
		 * get_unqueued_pending() == extra to be true.
		 */
		wake_up(&conf->wait_barrier);
		md_wakeup_thread(conf->mddev->thread);
	} else {
		if (test_bit(R1BIO_WriteError, &r1_bio->state))
			close_write(r1_bio);
		raid_end_bio_io(r1_bio);
	}
}

/*
 * handle_read_error - attempt recovery after a failed read.
 *
 * If the array is writable and the device is not marked FailFast, freeze
 * all other I/O and try to repair the block from the other mirrors; for a
 * FailFast device just fail it; on a read-only array mark the disk
 * IO_BLOCKED for this r1_bio.  In all cases the read is then retried via
 * raid1_read_request() reusing the same r1_bio.
 */
static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
{
	struct mddev *mddev = conf->mddev;
	struct bio *bio;
	struct md_rdev *rdev;
	sector_t sector;

	clear_bit(R1BIO_ReadError, &r1_bio->state);
	/* we got a read error. Maybe the drive is bad.  Maybe just
	 * the block and we can fix it.
	 * We freeze all other IO, and try reading the block from
	 * other devices.  When we find one, we re-write
	 * and check it that fixes the read error.
	 * This is all done synchronously while the array is
	 * frozen
	 */

	bio = r1_bio->bios[r1_bio->read_disk];
	bio_put(bio);
	r1_bio->bios[r1_bio->read_disk] = NULL;

	rdev = conf->mirrors[r1_bio->read_disk].rdev;
	if (mddev->ro == 0
	    && !test_bit(FailFast, &rdev->flags)) {
		freeze_array(conf, 1);
		fix_read_error(conf, r1_bio->read_disk,
			       r1_bio->sector, r1_bio->sectors);
		unfreeze_array(conf);
	} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
		/* FailFast devices are not repaired; fail them instead. */
		md_error(mddev, rdev);
	} else {
		/* Read-only array: just avoid this disk for the retry. */
		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
	}

	rdev_dec_pending(rdev, conf->mddev);
	sector = r1_bio->sector;
	bio = r1_bio->master_bio;

	/* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
	r1_bio->state = 0;
	raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
	/* Drop the barrier reference held since the original request. */
	allow_barrier(conf, sector);
}

/*
 * raid1d - per-array service thread.  Completes deferred writes from
 * bio_end_io_list (once the superblock is stable), then drains the
 * retry_list, dispatching each r1_bio to the appropriate sync-write,
 * write-error or read-error handler.
 */
static void raid1d(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct r1bio *r1_bio;
	unsigned long flags;
	struct r1conf *conf = mddev->private;
	struct list_head *head = &conf->retry_list;
	struct blk_plug plug;
	int idx;

	md_check_recovery(mddev);

	/*
	 * Complete writes deferred by handle_write_finished(), but only once
	 * no superblock change is pending (so new bad blocks are on disk
	 * before the writes are reported done).
	 */
	if (!list_empty_careful(&conf->bio_end_io_list) &&
	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		LIST_HEAD(tmp);
		/* Re-check the flag under the lock before splicing away. */
		spin_lock_irqsave(&conf->device_lock, flags);
		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
			list_splice_init(&conf->bio_end_io_list, &tmp);
		spin_unlock_irqrestore(&conf->device_lock, flags);
		while (!list_empty(&tmp)) {
			r1_bio = list_first_entry(&tmp, struct r1bio,
						  retry_list);
			list_del(&r1_bio->retry_list);
			idx = sector_to_idx(r1_bio->sector);
			atomic_dec(&conf->nr_queued[idx]);
			if (mddev->degraded)
				set_bit(R1BIO_Degraded, &r1_bio->state);
			if (test_bit(R1BIO_WriteError, &r1_bio->state))
				close_write(r1_bio);
			raid_end_bio_io(r1_bio);
		}
	}

	/* Main loop: drain conf->retry_list, oldest entry first. */
	blk_start_plug(&plug);
	for (;;) {

		flush_pending_writes(conf);

		spin_lock_irqsave(&conf->device_lock, flags);
		if (list_empty(head)) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			break;
		}
		r1_bio = list_entry(head->prev, struct r1bio, retry_list);
		list_del(head->prev);
		idx = sector_to_idx(r1_bio->sector);
		atomic_dec(&conf->nr_queued[idx]);
		spin_unlock_irqrestore(&conf->device_lock, flags);

		mddev = r1_bio->mddev;
		conf = mddev->private;
		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
			if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
			    test_bit(R1BIO_WriteError, &r1_bio->state))
				handle_sync_write_finished(conf, r1_bio);
			else
				sync_request_write(mddev, r1_bio);
		} else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
			   test_bit(R1BIO_WriteError, &r1_bio->state))
			handle_write_finished(conf, r1_bio);
		else if (test_bit(R1BIO_ReadError, &r1_bio->state))
			handle_read_error(conf, r1_bio);
		else
			WARN_ON_ONCE(1);

		cond_resched();
		/* Pick up any superblock work the handlers generated. */
		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
			md_check_recovery(mddev);
	}
	blk_finish_plug(&plug);
}

/*
 * init_resync - set up the mempool of r1bio resync buffers.
 * Returns 0 on success or a -errno from mempool_init().
 */
static int init_resync(struct r1conf *conf)
{
	int buffs;

	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
	BUG_ON(mempool_initialized(&conf->r1buf_pool));

	return mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc,
			    r1buf_pool_free, conf->poolinfo);
}

/*
 * raid1_alloc_init_r1buf - take a resync r1bio from the pool and reset its
 * bios for reuse, preserving each bio's resync_pages bookkeeping (stored in
 * bi_private) across the bio_reset().
 */
static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
{
	struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO);
	struct resync_pages *rps;
	struct bio *bio;
	int i;

	for (i = conf->poolinfo->raid_disks; i--; ) {
		bio = r1bio->bios[i];
		rps = bio->bi_private;
		/* bio_reset() clears bi_private; restore the page set after. */
		bio_reset(bio, NULL, 0);
		bio->bi_private = rps;
	}
	r1bio->master_bio = NULL;
	return r1bio;
}

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 */

/*
 * Returns the number of sectors handled starting at @sector_nr (either
 * actually submitted for sync, or skipped with *skipped set); returns 0
 * when the sync is finished or cannot currently proceed.
 */
static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
				   int *skipped)
{
	struct r1conf *conf = mddev->private;
	struct r1bio *r1_bio;
	struct bio *bio;
	sector_t max_sector, nr_sectors;
	int disk = -1;
	int i;
	int wonly = -1;	/* fallback read disk: best WriteMostly candidate */
	int write_targets = 0, read_targets = 0;
	sector_t sync_blocks;
	int still_degraded = 0;
	int good_sectors = RESYNC_SECTORS;
	int min_bad = 0; /* number of sectors that are bad in all devices */
	int idx = sector_to_idx(sector_nr);
	int page_idx = 0;

	if (!mempool_initialized(&conf->r1buf_pool))
		if (init_resync(conf))
			return 0;

	max_sector = mddev->dev_sectors;
	if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunk (there will
		 * only be one in raid1 resync.
		 * We can find the current address in mddev->curr_resync
		 */
		if (mddev->curr_resync < max_sector) /* aborted */
			md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
					   &sync_blocks, 1);
		else /* completed sync */
			conf->fullsync = 0;

		md_bitmap_close_sync(mddev->bitmap);
		close_sync(conf);

		if (mddev_is_clustered(mddev)) {
			conf->cluster_sync_low = 0;
			conf->cluster_sync_high = 0;
		}
		return 0;
	}

	/* No bitmap, fully clean array, and not a requested check/repair:
	 * nothing to do, report the whole remainder as skipped.
	 */
	if (mddev->bitmap == NULL &&
	    mddev->recovery_cp == MaxSector &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    conf->fullsync == 0) {
		*skipped = 1;
		return max_sector - sector_nr;
	}
	/* before building a request, check if we can skip these blocks..
	 * This call to md_bitmap_start_sync doesn't actually record anything
	 */
	if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* We can skip this block, and probably several more */
		*skipped = 1;
		return sync_blocks;
	}

	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
	if (atomic_read(&conf->nr_waiting[idx]))
		schedule_timeout_uninterruptible(1);

	/* we are incrementing sector_nr below. To be safe, we check against
	 * sector_nr + two times RESYNC_SECTORS
	 */

	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
		mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));


	if (raise_barrier(conf, sector_nr))
		return 0;

	r1_bio = raid1_alloc_init_r1buf(conf);

	rcu_read_lock();
	/*
	 * If we get a correctably read error during resync or recovery,
	 * we might want to read from a different device.  So we
	 * flag all drives that could conceivably be read from for READ,
	 * and any others (which will be non-In_sync devices) for WRITE.
	 * If a read fails, we try reading from something else for which READ
	 * is OK.
	 */

	r1_bio->mddev = mddev;
	r1_bio->sector = sector_nr;
	r1_bio->state = 0;
	set_bit(R1BIO_IsSync, &r1_bio->state);
	/* make sure good_sectors won't go across barrier unit boundary */
	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);

	for (i = 0; i < conf->raid_disks * 2; i++) {
		struct md_rdev *rdev;
		bio = r1_bio->bios[i];

		rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags)) {
			if (i < conf->raid_disks)
				still_degraded = 1;
		} else if (!test_bit(In_sync, &rdev->flags)) {
			/* Not in sync: a recovery write target. */
			bio->bi_opf = REQ_OP_WRITE;
			bio->bi_end_io = end_sync_write;
			write_targets ++;
		} else {
			/* may need to read from here */
			sector_t first_bad = MaxSector;
			int bad_sectors;

			if (is_badblock(rdev, sector_nr, good_sectors,
					&first_bad, &bad_sectors)) {
				if (first_bad > sector_nr)
					/* Shrink the request to the good prefix. */
					good_sectors = first_bad - sector_nr;
				else {
					/* Bad right at sector_nr: track the
					 * shortest bad run over all devices.
					 */
					bad_sectors -= (sector_nr - first_bad);
					if (min_bad == 0 ||
					    min_bad > bad_sectors)
						min_bad = bad_sectors;
				}
			}
			if (sector_nr < first_bad) {
				if (test_bit(WriteMostly, &rdev->flags)) {
					if (wonly < 0)
						wonly = i;
				} else {
					if (disk < 0)
						disk = i;
				}
				bio->bi_opf = REQ_OP_READ;
				bio->bi_end_io = end_sync_read;
				read_targets++;
			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
				/*
				 * The device is suitable for reading (InSync),
				 * but has bad block(s) here. Let's try to correct them,
				 * if we are doing resync or repair. Otherwise, leave
				 * this device alone for this sync request.
				 */
				bio->bi_opf = REQ_OP_WRITE;
				bio->bi_end_io = end_sync_write;
				write_targets++;
			}
		}
		if (rdev && bio->bi_end_io) {
			atomic_inc(&rdev->nr_pending);
			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
		}
	}
	rcu_read_unlock();
	/* Prefer a non-WriteMostly read source; fall back to WriteMostly. */
	if (disk < 0)
		disk = wonly;
	r1_bio->read_disk = disk;

	if (read_targets == 0 && min_bad > 0) {
		/* These sectors are bad on all InSync devices, so we
		 * need to mark them bad on all write targets
		 */
		int ok = 1;
		for (i = 0 ; i < conf->raid_disks * 2 ; i++)
			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
				struct md_rdev *rdev = conf->mirrors[i].rdev;
				ok = rdev_set_badblocks(rdev, sector_nr,
							min_bad, 0
					) && ok;
			}
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		*skipped = 1;
		put_buf(r1_bio);

		if (!ok) {
			/* Cannot record the badblocks, so need to
			 * abort the resync.
			 * If there are multiple read targets, could just
			 * fail the really bad ones ???
			 */
			conf->recovery_disabled = mddev->recovery_disabled;
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			return 0;
		} else
			return min_bad;

	}
	if (min_bad > 0 && min_bad < good_sectors) {
		/* only resync enough to reach the next bad->good
		 * transition */
		good_sectors = min_bad;
	}

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
		/* extra read targets are also write targets */
		write_targets += read_targets-1;

	if (write_targets == 0 || read_targets == 0) {
		/* There is nowhere to write, so all non-sync
		 * drives must be failed - so we are finished
		 */
		sector_t rv;
		if (min_bad > 0)
			max_sector = sector_nr + min_bad;
		rv = max_sector - sector_nr;
		*skipped = 1;
		put_buf(r1_bio);
		return rv;
	}

	if (max_sector > mddev->resync_max)
		max_sector = mddev->resync_max; /* Don't do IO beyond here */
	if (max_sector > sector_nr + good_sectors)
		max_sector = sector_nr + good_sectors;
	nr_sectors = 0;
	sync_blocks = 0;
	/* Fill each participating bio with up to RESYNC_PAGES pages. */
	do {
		struct page *page;
		int len = PAGE_SIZE;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		if (sync_blocks == 0) {
			if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
						  &sync_blocks, still_degraded) &&
			    !conf->fullsync &&
			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
				break;
			if ((len >> 9) > sync_blocks)
				len = sync_blocks<<9;
		}

		for (i = 0 ; i < conf->raid_disks * 2; i++) {
			struct resync_pages *rp;

			bio = r1_bio->bios[i];
			rp = get_resync_pages(bio);
			if (bio->bi_end_io) {
				page = resync_fetch_page(rp, page_idx);

				/*
				 * won't fail because the vec table is big
				 * enough to hold all these pages
				 */
				__bio_add_page(bio, page, len, 0);
			}
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
		sync_blocks -= (len>>9);
	} while (++page_idx < RESYNC_PAGES);

	r1_bio->sectors = nr_sectors;

	if (mddev_is_clustered(mddev) &&
			conf->cluster_sync_high < sector_nr + nr_sectors) {
		conf->cluster_sync_low = mddev->curr_resync_completed;
		conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
		/* Send resync message */
		md_cluster_ops->resync_info_update(mddev,
				conf->cluster_sync_low,
				conf->cluster_sync_high);
	}

	/* For a user-requested sync, we read all readable devices and do a
	 * compare
	 */
	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		atomic_set(&r1_bio->remaining, read_targets);
		for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
			bio = r1_bio->bios[i];
			if (bio->bi_end_io == end_sync_read) {
				read_targets--;
				md_sync_acct_bio(bio, nr_sectors);
				if (read_targets == 1)
					bio->bi_opf &= ~MD_FAILFAST;
				submit_bio_noacct(bio);
			}
		}
	} else {
		/* Normal resync/recovery: read one device; writes are
		 * issued later by sync_request_write().
		 */
		atomic_set(&r1_bio->remaining, 1);
		bio = r1_bio->bios[r1_bio->read_disk];
		md_sync_acct_bio(bio, nr_sectors);
		if (read_targets == 1)
			bio->bi_opf &= ~MD_FAILFAST;
		submit_bio_noacct(bio);
	}
	return nr_sectors;
}

/*
 * raid1_size - report the array capacity in sectors.
 * A non-zero @sectors is an explicit override from the caller; otherwise
 * the capacity is mddev->dev_sectors.  @raid_disks is ignored: adding
 * mirrors never changes usable capacity.
 */
static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	if (sectors)
		return sectors;

	return mddev->dev_sectors;
}

/*
 * setup_conf - allocate and initialise the per-array r1conf.
 *
 * Builds the barrier-bucket counters, the mirror table (raid_disks slots
 * for originals followed by raid_disks slots for replacements), the r1bio
 * mempool, the bio split set and the raid1d service thread.  On any
 * failure everything allocated so far is freed and an ERR_PTR returned.
 */
static struct r1conf *setup_conf(struct mddev *mddev)
{
	struct r1conf *conf;
	int i;
	struct raid1_info *disk;
	struct md_rdev *rdev;
	int err = -ENOMEM;

	conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
	if (!conf)
		goto abort;

	/* One counter per barrier bucket for pending/waiting/queued/barrier. */
	conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
				   sizeof(atomic_t), GFP_KERNEL);
	if (!conf->nr_pending)
		goto abort;

	conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
				   sizeof(atomic_t), GFP_KERNEL);
	if (!conf->nr_waiting)
		goto abort;

	conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
				  sizeof(atomic_t), GFP_KERNEL);
	if (!conf->nr_queued)
		goto abort;

	conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
				sizeof(atomic_t), GFP_KERNEL);
	if (!conf->barrier)
		goto abort;

	/* Two slots per disk: original plus possible replacement. */
	conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info),
					    mddev->raid_disks, 2),
				GFP_KERNEL);
	if (!conf->mirrors)
		goto abort;

	conf->tmppage = alloc_page(GFP_KERNEL);
	if (!conf->tmppage)
		goto abort;

	conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
	if (!conf->poolinfo)
		goto abort;
	conf->poolinfo->raid_disks = mddev->raid_disks * 2;
	err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc,
			   rbio_pool_free, conf->poolinfo);
	if (err)
		goto abort;

	err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0);
	if (err)
		goto abort;

	conf->poolinfo->mddev = mddev;

	err = -EINVAL;
	spin_lock_init(&conf->device_lock);
	rdev_for_each(rdev, mddev) {
		int disk_idx = rdev->raid_disk;
		if (disk_idx >= mddev->raid_disks
		    || disk_idx < 0)
			continue;
		if (test_bit(Replacement, &rdev->flags))
			disk = conf->mirrors + mddev->raid_disks + disk_idx;
		else
			disk = conf->mirrors + disk_idx;

		/* Two rdevs claiming the same slot is invalid. */
		if (disk->rdev)
			goto abort;
		disk->rdev = rdev;
		disk->head_position = 0;
		disk->seq_start = MaxSector;
	}
	conf->raid_disks = mddev->raid_disks;
	conf->mddev = mddev;
	INIT_LIST_HEAD(&conf->retry_list);
	INIT_LIST_HEAD(&conf->bio_end_io_list);

	spin_lock_init(&conf->resync_lock);
	init_waitqueue_head(&conf->wait_barrier);

	bio_list_init(&conf->pending_bio_list);
	conf->recovery_disabled = mddev->recovery_disabled - 1;

	err = -EIO;
	for (i = 0; i < conf->raid_disks * 2; i++) {

		disk = conf->mirrors + i;

		if (i < conf->raid_disks &&
		    disk[conf->raid_disks].rdev) {
			/* This slot has a replacement. */
			if (!disk->rdev) {
				/* No original, just make the replacement
				 * a recovering spare
				 */
				disk->rdev =
					disk[conf->raid_disks].rdev;
				disk[conf->raid_disks].rdev = NULL;
			} else if (!test_bit(In_sync, &disk->rdev->flags))
				/* Original is not in_sync - bad */
				goto abort;
		}

		if (!disk->rdev ||
		    !test_bit(In_sync, &disk->rdev->flags)) {
			disk->head_position = 0;
			if (disk->rdev &&
			    (disk->rdev->saved_raid_disk < 0))
				conf->fullsync = 1;
		}
	}

	err = -ENOMEM;
	rcu_assign_pointer(conf->thread,
			   md_register_thread(raid1d, mddev, "raid1"));
	if (!conf->thread)
		goto abort;

	return conf;

 abort:
	if (conf) {
		mempool_exit(&conf->r1bio_pool);
		kfree(conf->mirrors);
		safe_put_page(conf->tmppage);
		kfree(conf->poolinfo);
		kfree(conf->nr_pending);
		kfree(conf->nr_waiting);
		kfree(conf->nr_queued);
		kfree(conf->barrier);
		bioset_exit(&conf->bio_split);
		kfree(conf);
	}
	return ERR_PTR(err);
}

/* Forward declaration: raid1_run()'s error path frees conf via raid1_free(). */
static void raid1_free(struct mddev *mddev, void *priv);

/*
 * raid1_run - start a RAID1 array (md personality ->run hook).
 *
 * Validates the requested level/reshape state, builds (or adopts) the
 * per-array r1conf, computes the initial degraded count, and hands the
 * raid1d thread over to the mddev.  Returns 0 on success or a negative
 * errno; on failure everything allocated here is released through
 * raid1_free().
 */
static int raid1_run(struct mddev *mddev)
{
	struct r1conf *conf;
	int i;
	struct md_rdev *rdev;
	int ret;

	/* This personality only implements level 1 (mirroring). */
	if (mddev->level != 1) {
		pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
			mdname(mddev), mddev->level);
		return -EIO;
	}
	/* RAID1 has no reshape_position concept; reject a stale value. */
	if (mddev->reshape_position != MaxSector) {
		pr_warn("md/raid1:%s: reshape_position set but not supported\n",
			mdname(mddev));
		return -EIO;
	}
	if (mddev_init_writes_pending(mddev) < 0)
		return -ENOMEM;
	/*
	 * copy the already verified devices into our private RAID1
	 * bookkeeping area. [whatever we allocate in run(),
	 * should be freed in raid1_free()]
	 */
	if (mddev->private == NULL)
		conf = setup_conf(mddev);
	else
		/* takeover path: setup_conf() already ran (see raid1_takeover) */
		conf = mddev->private;

	if (IS_ERR(conf))
		return PTR_ERR(conf);

	/* RAID1 cannot pass write-zeroes down safely; disable it. */
	if (mddev->queue)
		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);

	/* Stack each member device's queue limits under the array's disk. */
	rdev_for_each(rdev, mddev) {
		if (!mddev->gendisk)
			continue;
		disk_stack_limits(mddev->gendisk, rdev->bdev,
				  rdev->data_offset << 9);
	}

	/* Count slots that are missing, out of sync, or faulty. */
	mddev->degraded = 0;
	for (i = 0; i < conf->raid_disks; i++)
		if (conf->mirrors[i].rdev == NULL ||
		    !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
		    test_bit(Faulty, &conf->mirrors[i].rdev->flags))
			mddev->degraded++;
	/*
	 * RAID1 needs at least one disk in active
	 */
	if (conf->raid_disks - mddev->degraded < 1) {
		md_unregister_thread(mddev, &conf->thread);
		ret = -EINVAL;
		goto abort;
	}

	/* With a single in-sync member there is nothing to resync against. */
	if (conf->raid_disks - mddev->degraded == 1)
		mddev->recovery_cp = MaxSector;

	if (mddev->recovery_cp != MaxSector)
		pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
			mdname(mddev));
	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
		mdname(mddev), mddev->raid_disks - mddev->degraded,
		mddev->raid_disks);

	/*
	 * Ok, everything is just fine now
	 */
	/* Transfer ownership of the raid1d thread from conf to mddev. */
	rcu_assign_pointer(mddev->thread, conf->thread);
	rcu_assign_pointer(conf->thread, NULL);
	mddev->private = conf;
	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));

	ret = md_integrity_register(mddev);
	if (ret) {
		/* thread was already moved to mddev above, unregister there */
		md_unregister_thread(mddev, &mddev->thread);
		goto abort;
	}
	return 0;

abort:
	raid1_free(mddev, conf);
	return ret;
}

/*
 * raid1_free - release everything allocated by setup_conf()/raid1_run()
 * (md personality ->free hook).  @priv is the r1conf.
 */
static void raid1_free(struct mddev *mddev, void *priv)
{
	struct r1conf *conf = priv;

	mempool_exit(&conf->r1bio_pool);
	kfree(conf->mirrors);
	safe_put_page(conf->tmppage);
	kfree(conf->poolinfo);
	/* per-bucket barrier/accounting arrays allocated in setup_conf() */
	kfree(conf->nr_pending);
	kfree(conf->nr_waiting);
	kfree(conf->nr_queued);
	kfree(conf->barrier);
	bioset_exit(&conf->bio_split);
	kfree(conf);
}

/*
 * raid1_resize - change the usable size of the array (md ->resize hook).
 *
 * Grows or shrinks the per-device data area to @sectors, resizing the
 * bitmap first so that resync coverage matches the new size.  Returns 0
 * or a negative errno.
 */
static int raid1_resize(struct mddev *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	sector_t newsize = raid1_size(mddev, sectors, 0);
	/* externally-managed size must never shrink below array_sectors */
	if (mddev->external_size &&
	    mddev->array_sectors > newsize)
		return -EINVAL;
	if (mddev->bitmap) {
		int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
		if (ret)
			return ret;
	}
	md_set_array_sectors(mddev, newsize);
	/* Growing: pull recovery_cp back so resync covers the new space. */
	if (sectors > mddev->dev_sectors &&
	    mddev->recovery_cp > mddev->dev_sectors) {
		mddev->recovery_cp = mddev->dev_sectors;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->dev_sectors = sectors;
	mddev->resync_max_sectors = sectors;
	return 0;
}

/*
 * raid1_reshape - change the number of mirrors (md ->check_reshape hook).
 * Only raid_disks may change; chunk size, layout and level are fixed.
 */
static int raid1_reshape(struct mddev *mddev)
{
	/* We need to:
	 * 1/ resize the r1bio_pool
	 * 2/ resize conf->mirrors
	 *
	 * We allocate a new r1bio_pool if we can.
	 * Then raise a device barrier and wait until all IO stops.
	 * Then resize conf->mirrors and swap in the new r1bio pool.
	 *
	 * At the same time, we "pack" the devices so that all the missing
	 * devices have the higher raid_disk numbers.
	 */
	mempool_t newpool, oldpool;
	struct pool_info *newpoolinfo;
	struct raid1_info *newmirrors;
	struct r1conf *conf = mddev->private;
	int cnt, raid_disks;
	unsigned long flags;
	int d, d2;
	int ret;

	memset(&newpool, 0, sizeof(newpool));
	memset(&oldpool, 0, sizeof(oldpool));

	/* Cannot change chunk_size, layout, or level */
	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
	    mddev->layout != mddev->new_layout ||
	    mddev->level != mddev->new_level) {
		/* roll the requested values back before failing */
		mddev->new_chunk_sectors = mddev->chunk_sectors;
		mddev->new_layout = mddev->layout;
		mddev->new_level = mddev->level;
		return -EINVAL;
	}

	if (!mddev_is_clustered(mddev))
		md_allow_write(mddev);

	raid_disks = mddev->raid_disks + mddev->delta_disks;

	/* Shrinking: refuse if more devices are populated than will fit. */
	if (raid_disks < conf->raid_disks) {
		cnt = 0;
		for (d = 0; d < conf->raid_disks; d++)
			if (conf->mirrors[d].rdev)
				cnt++;
		if (cnt > raid_disks)
			return -EBUSY;
	}

	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
	if (!newpoolinfo)
		return -ENOMEM;
	newpoolinfo->mddev = mddev;
	/* x2: one slot per mirror plus one per potential replacement */
	newpoolinfo->raid_disks = raid_disks * 2;

	ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc,
			   rbio_pool_free, newpoolinfo);
	if (ret) {
		kfree(newpoolinfo);
		return ret;
	}
	newmirrors = kzalloc(array3_size(sizeof(struct raid1_info),
					 raid_disks, 2),
			     GFP_KERNEL);
	if (!newmirrors) {
		kfree(newpoolinfo);
		mempool_exit(&newpool);
		return -ENOMEM;
	}

	/* Block all IO until the pool/mirror swap below is complete. */
	freeze_array(conf, 0);

	/* ok, everything is stopped */
	oldpool = conf->r1bio_pool;
	conf->r1bio_pool = newpool;

	/* Pack live devices towards the low slots, renumbering as we go. */
	for (d = d2 = 0; d < conf->raid_disks; d++) {
		struct md_rdev *rdev = conf->mirrors[d].rdev;

		if (rdev && rdev->raid_disk != d2) {
			sysfs_unlink_rdev(mddev, rdev);
			rdev->raid_disk = d2;
			/* NOTE(review): second unlink appears to clear any
			 * stale link under the new rd%d name — confirm */
			sysfs_unlink_rdev(mddev, rdev);
			if (sysfs_link_rdev(mddev, rdev))
				pr_warn("md/raid1:%s: cannot register rd%d\n",
					mdname(mddev), rdev->raid_disk);
		}
		if (rdev)
			newmirrors[d2++].rdev = rdev;
	}
	kfree(conf->mirrors);
	conf->mirrors = newmirrors;
	kfree(conf->poolinfo);
	conf->poolinfo = newpoolinfo;

	spin_lock_irqsave(&conf->device_lock, flags);
	/* growing adds empty (degraded) slots; shrinking removes them */
	mddev->degraded += (raid_disks - conf->raid_disks);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	conf->raid_disks = mddev->raid_disks = raid_disks;
	mddev->delta_disks = 0;

	unfreeze_array(conf);

	/* Kick recovery so any newly-added slots get rebuilt. */
	set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	mempool_exit(&oldpool);
	return 0;
}

/*
 * raid1_quiesce - md ->quiesce hook: freeze (quiesce != 0) or thaw all
 * array IO via the conf barrier mechanism.
 */
static void raid1_quiesce(struct mddev *mddev, int quiesce)
{
	struct r1conf *conf = mddev->private;

	if (quiesce)
		freeze_array(conf, 0);
	else
		unfreeze_array(conf);
}

/*
 * raid1_takeover - md ->takeover hook: convert another personality's
 * array to RAID1 in place.  Returns the new r1conf or an ERR_PTR.
 */
static void *raid1_takeover(struct mddev *mddev)
{
	/* raid1 can take over:
	 *  raid5 with 2 devices, any layout or chunk size
	 */
	if (mddev->level == 5 && mddev->raid_disks == 2) {
		struct r1conf *conf;
		mddev->new_level = 1;
		mddev->new_layout = 0;
		mddev->new_chunk_sectors = 0;
		conf = setup_conf(mddev);
		if (!IS_ERR(conf)) {
			/* Array must appear to be quiesced */
			conf->array_frozen = 1;
			/* drop journal/PPL flags raid1 cannot honour */
			mddev_clear_unsupported_flags(mddev,
				UNSUPPORTED_MDDEV_FLAGS);
		}
		return conf;
	}
	return ERR_PTR(-EINVAL);
}

/* md personality operations table for level 1 (mirroring). */
static struct md_personality raid1_personality =
{
	.name		= "raid1",
	.level		= 1,
	.owner		= THIS_MODULE,
	.make_request	= raid1_make_request,
	.run		= raid1_run,
	.free		= raid1_free,
	.status		= raid1_status,
	.error_handler	= raid1_error,
	.hot_add_disk	= raid1_add_disk,
	.hot_remove_disk= raid1_remove_disk,
	.spare_active	= raid1_spare_active,
	.sync_request	= raid1_sync_request,
	.resize		= raid1_resize,
	.size		= raid1_size,
	.check_reshape	= raid1_reshape,
	.quiesce	= raid1_quiesce,
	.takeover	= raid1_takeover,
};

/* Module init: register the raid1 personality with the md core. */
static int __init raid_init(void)
{
	return register_md_personality(&raid1_personality);
}

/* Module exit: unregister the raid1 personality. */
static void raid_exit(void)
{
	unregister_md_personality(&raid1_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
MODULE_ALIAS("md-personality-3"); /* RAID1 */
MODULE_ALIAS("md-raid1");
MODULE_ALIAS("md-level-1");