// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid10.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2000-2004 Neil Brown
 *
 * RAID-10 support for md.
 *
 * Based on code in raid1.c.  See raid1.c for further copyright information.
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/md_p.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "md-bitmap.h"

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
 * The layout of data is defined by
 *    chunk_size
 *    raid_disks
 *    near_copies (stored in low byte of layout)
 *    far_copies (stored in second byte of layout)
 *    far_offset (stored in bit 16 of layout)
 *    use_far_sets (stored in bit 17 of layout)
 *    use_far_sets_bugfixed (stored in bit 18 of layout)
 *
 * The data to be stored is divided into chunks using chunksize.  Each device
 * is divided into far_copies sections.  In each section, chunks are laid out
 * in a style similar to raid0, but near_copies copies of each chunk are stored
 * (each on a different drive).  The starting device for each section is offset
 * near_copies from the starting device of the previous section.  Thus there
 * are (near_copies * far_copies) of each chunk, and each is on a different
 * drive.  near_copies and far_copies must be at least one, and their product
 * is at most raid_disks.
 *
 * If far_offset is true, then the far_copies are handled a bit differently.
 * The copies are still in different stripes, but instead of being very far
 * apart on disk, they are adjacent stripes.
 *
 * The far and offset algorithms are handled slightly differently if
 * 'use_far_sets' is true.  In this case, the array's devices are grouped into
 * sets that are (near_copies * far_copies) in size.  The far copied stripes
 * are still shifted by 'near_copies' devices, but this shifting stays confined
 * to the set rather than the entire array.  This is done to improve the number
 * of device combinations that can fail without causing the array to fail.
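 *
 * Worked capacity example (illustrative numbers only, not read from any
 * real array): with raid_disks=4, near_copies=2 and far_copies=2, every
 * chunk exists near_copies * far_copies = 4 times, so the usable size is
 *
 *    raid_disks * device_size / (near_copies * far_copies)
 *      = 4 * device_size / 4 = one device's worth of space,
 *
 * traded for being able to survive several failed-device combinations.
 *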
 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
 * on a device):
 *    A B C D    A B C D E
 *      ...         ...
 *    D A B C    E A B C D
 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
 *    [A B] [C D]    [A B] [C D E]
 *    |...| |...|    |...| | ... |
 *    [B A] [D C]    [B A] [E C D]
 */

static void allow_barrier(struct r10conf *conf);
static void lower_barrier(struct r10conf *conf);
static int _enough(struct r10conf *conf, int previous, int ignore);
static int enough(struct r10conf *conf, int ignore);
static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped);
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);

#define raid10_log(md, fmt, args...) \
	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)

#include "raid1-10.c"

#define NULL_CMD
#define cmd_before(conf, cmd) \
	do { \
		write_sequnlock_irq(&(conf)->resync_lock); \
		cmd; \
	} while (0)
#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)

#define wait_event_barrier_cmd(conf, cond, cmd) \
	wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \
		       cmd_after(conf))

#define wait_event_barrier(conf, cond) \
	wait_event_barrier_cmd(conf, cond, NULL_CMD)

/*
 * for resync bio, r10bio pointer can be retrieved from the per-bio
 * 'struct resync_pages'.
 */
static inline struct r10bio *get_resync_r10bio(struct bio *bio)
{
	return get_resync_pages(bio)->raid_bio;
}

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);

	/* allocate a r10bio with room for raid_disks entries in the
	 * bios array */
	return kzalloc(size, gfp_flags);
}

#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)

/*
 * When performing a resync, we need to read and compare, so
 * we need as many pages as there are copies.
 * When performing a recovery, we need 2 bios, one for read,
 * one for write (we recover only one drive per r10buf)
 *
 */
static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10_bio;
	struct bio *bio;
	int j;
	int nalloc, nalloc_rp;
	struct resync_pages *rps;

	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
	if (!r10_bio)
		return NULL;

	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
	    test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
		nalloc = conf->copies; /* resync */
	else
		nalloc = 2; /* recovery */

	/* allocate once for all bios */
	if (!conf->have_replacement)
		nalloc_rp = nalloc;
	else
		nalloc_rp = nalloc * 2;
	rps = kmalloc_array(nalloc_rp, sizeof(struct resync_pages), gfp_flags);
	if (!rps)
		goto out_free_r10bio;

	/*
	 * Allocate bios.
	 */
	for (j = nalloc ; j-- ; ) {
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].bio = bio;
		if (!conf->have_replacement)
			continue;
		bio = bio_kmalloc(RESYNC_PAGES, gfp_flags);
		if (!bio)
			goto out_free_bio;
		bio_init(bio, NULL, bio->bi_inline_vecs, RESYNC_PAGES, 0);
		r10_bio->devs[j].repl_bio = bio;
	}
	/*
	 * Allocate RESYNC_PAGES data pages and attach them
	 * where needed.
	 */
	for (j = 0; j < nalloc; j++) {
		struct bio *rbio = r10_bio->devs[j].repl_bio;
		struct resync_pages *rp, *rp_repl;

		rp = &rps[j];
		if (rbio)
			rp_repl = &rps[nalloc + j];

		bio = r10_bio->devs[j].bio;

		if (!j || test_bit(MD_RECOVERY_SYNC,
				   &conf->mddev->recovery)) {
			if (resync_alloc_pages(rp, gfp_flags))
				goto out_free_pages;
		} else {
			memcpy(rp, &rps[0], sizeof(*rp));
			resync_get_all_pages(rp);
		}

		rp->raid_bio = r10_bio;
		bio->bi_private = rp;
		if (rbio) {
			memcpy(rp_repl, rp, sizeof(*rp));
			rbio->bi_private = rp_repl;
		}
	}

	return r10_bio;

out_free_pages:
	while (--j >= 0)
		resync_free_pages(&rps[j]);

	j = 0;
out_free_bio:
	for ( ; j < nalloc; j++) {
		if (r10_bio->devs[j].bio)
			bio_uninit(r10_bio->devs[j].bio);
		kfree(r10_bio->devs[j].bio);
		if (r10_bio->devs[j].repl_bio)
			bio_uninit(r10_bio->devs[j].repl_bio);
		kfree(r10_bio->devs[j].repl_bio);
	}
	kfree(rps);
out_free_r10bio:
	rbio_pool_free(r10_bio, conf);
	return NULL;
}

static void r10buf_pool_free(void *__r10_bio, void *data)
{
	struct r10conf *conf = data;
	struct r10bio *r10bio = __r10_bio;
	int j;
	struct resync_pages *rp = NULL;

	for (j = conf->copies; j--; ) {
		struct bio *bio = r10bio->devs[j].bio;

		if (bio) {
			rp = get_resync_pages(bio);
			resync_free_pages(rp);
			bio_uninit(bio);
			kfree(bio);
		}

		bio = r10bio->devs[j].repl_bio;
		if (bio) {
			bio_uninit(bio);
			kfree(bio);
		}
	}

	/* resync pages array stored in the 1st bio's .bi_private */
	kfree(rp);

	rbio_pool_free(r10bio, conf);
}

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
{
	int i;

	for (i = 0; i < conf->geo.raid_disks; i++) {
		struct bio **bio = &r10_bio->devs[i].bio;
		if (!BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
		bio = &r10_bio->devs[i].repl_bio;
		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
			bio_put(*bio);
		*bio = NULL;
	}
}

static void free_r10bio(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	put_all_bios(conf, r10_bio);
	mempool_free(r10_bio, &conf->r10bio_pool);
}

static void put_buf(struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	mempool_free(r10_bio, &conf->r10buf_pool);

	lower_barrier(conf);
}

static void wake_up_barrier(struct r10conf *conf)
{
	if (wq_has_sleeper(&conf->wait_barrier))
		wake_up(&conf->wait_barrier);
}

static void reschedule_retry(struct r10bio *r10_bio)
{
	unsigned long flags;
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;

	spin_lock_irqsave(&conf->device_lock, flags);
	list_add(&r10_bio->retry_list, &conf->retry_list);
	conf->nr_queued++;
	spin_unlock_irqrestore(&conf->device_lock, flags);

	/* wake up frozen array...
	 */
	wake_up(&conf->wait_barrier);

	md_wakeup_thread(mddev->thread);
}

/*
 * raid_end_bio_io() is called when we have finished servicing a mirrored
 * operation and are ready to return a success/failure code to the buffer
 * cache layer.
 */
static void raid_end_bio_io(struct r10bio *r10_bio)
{
	struct bio *bio = r10_bio->master_bio;
	struct r10conf *conf = r10_bio->mddev->private;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		bio->bi_status = BLK_STS_IOERR;

	bio_endio(bio);
	/*
	 * Wake up any possible resync thread that waits for the device
	 * to go idle.
	 */
	allow_barrier(conf);

	free_r10bio(r10_bio);
}

/*
 * Update disk head position estimator based on IRQ completion info.
 */
static inline void update_head_pos(int slot, struct r10bio *r10_bio)
{
	struct r10conf *conf = r10_bio->mddev->private;

	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
		r10_bio->devs[slot].addr + (r10_bio->sectors);
}

/*
 * Find the disk number which triggered given bio
 */
static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
			 struct bio *bio, int *slotp, int *replp)
{
	int slot;
	int repl = 0;

	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
		if (r10_bio->devs[slot].bio == bio)
			break;
		if (r10_bio->devs[slot].repl_bio == bio) {
			repl = 1;
			break;
		}
	}

	update_head_pos(slot, r10_bio);

	if (slotp)
		*slotp = slot;
	if (replp)
		*replp = repl;
	return r10_bio->devs[slot].devnum;
}

static void raid10_end_read_request(struct bio *bio)
{
	int uptodate = !bio->bi_status;
	struct r10bio *r10_bio = bio->bi_private;
	int slot;
	struct md_rdev *rdev;
	struct r10conf *conf = r10_bio->mddev->private;

	slot = r10_bio->read_slot;
	rdev = r10_bio->devs[slot].rdev;
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	update_head_pos(slot, r10_bio);

	if (uptodate) {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		set_bit(R10BIO_Uptodate, &r10_bio->state);
	} else {
		/* If all other devices that store this block have
		 * failed, we want to return the error upwards rather
		 * than fail the last device.  Here we redefine
		 * "uptodate" to mean "Don't want to retry"
		 */
		if (!_enough(conf, test_bit(R10BIO_Previous, &r10_bio->state),
			     rdev->raid_disk))
			uptodate = 1;
	}
	if (uptodate) {
		raid_end_bio_io(r10_bio);
		rdev_dec_pending(rdev, conf->mddev);
	} else {
		/*
		 * oops, read error - keep the refcount on the rdev
		 */
		pr_err_ratelimited("md/raid10:%s: %pg: rescheduling sector %llu\n",
				   mdname(conf->mddev),
				   rdev->bdev,
				   (unsigned long long)r10_bio->sector);
		set_bit(R10BIO_ReadError, &r10_bio->state);
		reschedule_retry(r10_bio);
	}
}

static void close_write(struct r10bio *r10_bio)
{
	/* clear the bitmap if all writes complete successfully */
	md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
			   r10_bio->sectors,
			   !test_bit(R10BIO_Degraded, &r10_bio->state),
			   0);
	md_write_end(r10_bio->mddev);
}

static void one_write_done(struct r10bio *r10_bio)
{
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		if (test_bit(R10BIO_WriteError, &r10_bio->state))
			reschedule_retry(r10_bio);
		else {
			close_write(r10_bio);
			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
				reschedule_retry(r10_bio);
			else
				raid_end_bio_io(r10_bio);
		}
	}
}

static void raid10_end_write_request(struct bio *bio)
{
	struct r10bio *r10_bio = bio->bi_private;
	int dev;
	int dec_rdev = 1;
	struct r10conf *conf = r10_bio->mddev->private;
	int slot, repl;
	struct md_rdev *rdev = NULL;
	struct bio *to_put = NULL;
	bool discard_error;

	discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD;

	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

	if (repl)
		rdev = conf->mirrors[dev].replacement;
	if (!rdev) {
		smp_rmb();
		repl = 0;
		rdev = conf->mirrors[dev].rdev;
	}
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (bio->bi_status && !discard_error) {
		if (repl)
			/* Never record new bad blocks to replacement,
			 * just fail it.
			 */
			md_error(rdev->mddev, rdev);
		else {
			set_bit(WriteErrorSeen, &rdev->flags);
			if (!test_and_set_bit(WantReplacement, &rdev->flags))
				set_bit(MD_RECOVERY_NEEDED,
					&rdev->mddev->recovery);

			dec_rdev = 0;
			if (test_bit(FailFast, &rdev->flags) &&
			    (bio->bi_opf & MD_FAILFAST)) {
				md_error(rdev->mddev, rdev);
			}

			/*
			 * When the device is faulty, it is not necessary to
			 * handle write error.
			 */
			if (!test_bit(Faulty, &rdev->flags))
				set_bit(R10BIO_WriteError, &r10_bio->state);
			else {
				/* Fail the request */
				set_bit(R10BIO_Degraded, &r10_bio->state);
				r10_bio->devs[slot].bio = NULL;
				to_put = bio;
				dec_rdev = 1;
			}
		}
	} else {
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code to the higher
		 * levels even if IO on some other mirrored buffer fails.
		 *
		 * The 'master' represents the composite IO operation to
		 * user-side. So if something waits for IO, then it will
		 * wait for the 'master' bio.
		 */
		sector_t first_bad;
		int bad_sectors;

		/*
		 * Do not set R10BIO_Uptodate if the current device is
		 * rebuilding or Faulty. This is because we cannot use
		 * such device for properly reading the data back (we could
		 * potentially use it, if the current write would have fallen
		 * before rdev->recovery_offset, but for simplicity we don't
		 * check this here.)
		 */
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			set_bit(R10BIO_Uptodate, &r10_bio->state);

		/* Maybe we can clear some bad blocks. */
		if (is_badblock(rdev,
				r10_bio->devs[slot].addr,
				r10_bio->sectors,
				&first_bad, &bad_sectors) && !discard_error) {
			bio_put(bio);
			if (repl)
				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
			else
				r10_bio->devs[slot].bio = IO_MADE_GOOD;
			dec_rdev = 0;
			set_bit(R10BIO_MadeGood, &r10_bio->state);
		}
	}

	/*
	 * Let's see if all mirrored write operations have finished
	 * already.
	 */
	one_write_done(r10_bio);
	if (dec_rdev)
		rdev_dec_pending(rdev, conf->mddev);
	if (to_put)
		bio_put(to_put);
}

/*
 * RAID10 layout manager
 * As well as the chunksize and raid_disks count, there are two
 * parameters: near_copies and far_copies.
 * near_copies * far_copies must be <= raid_disks.
 * Normally one of these will be 1.
 * If both are 1, we get raid0.
 * If near_copies == raid_disks, we get raid1.
 *
 * Chunks are laid out in raid0 style with near_copies copies of the
 * first chunk, followed by near_copies copies of the next chunk and
 * so on.
 * If far_copies > 1, then after 1/far_copies of the array has been assigned
 * as described above, we start again with a device offset of near_copies.
 * So we effectively have another copy of the whole array further down all
 * the drives, but with blocks on different drives.
 * With this layout, a block is never stored twice on the same device.
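 *
 * A small worked example (hypothetical geometry, not taken from any
 * superblock): raid_disks=4, near_copies=2, far_copies=1 and 64KiB chunks
 * (128 sectors, so chunk_shift=7 and chunk_mask=127).  For virtual
 * sector 300:
 *
 *    chunk  = 300 >> 7               = 2   (offset within chunk = 44)
 *    chunk *= near_copies            = 4
 *    dev    = 4 % raid_disks         = 0,  stripe = 4 / raid_disks = 1
 *    addr   = (stripe << 7) + offset = 172
 *
 * so virtual sector 300 lives at device sector 172 on devices 0 and 1
 * (its near_copies mirrors), which is what __raid10_find_phys() below
 * computes for each copy.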
 *
 * raid10_find_phys finds the sector offset of a given virtual sector
 * on each device that it is on.
 *
 * raid10_find_virt does the reverse mapping, from a device and a
 * sector offset to a virtual address
 */

static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
{
	int n, f;
	sector_t sector;
	sector_t chunk;
	sector_t stripe;
	int dev;
	int slot = 0;
	int last_far_set_start, last_far_set_size;

	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
	last_far_set_start *= geo->far_set_size;

	last_far_set_size = geo->far_set_size;
	last_far_set_size += (geo->raid_disks % geo->far_set_size);

	/* now calculate first sector/dev */
	chunk = r10bio->sector >> geo->chunk_shift;
	sector = r10bio->sector & geo->chunk_mask;

	chunk *= geo->near_copies;
	stripe = chunk;
	dev = sector_div(stripe, geo->raid_disks);
	if (geo->far_offset)
		stripe *= geo->far_copies;

	sector += stripe << geo->chunk_shift;

	/* and calculate all the others */
	for (n = 0; n < geo->near_copies; n++) {
		int d = dev;
		int set;
		sector_t s = sector;
		r10bio->devs[slot].devnum = d;
		r10bio->devs[slot].addr = s;
		slot++;

		for (f = 1; f < geo->far_copies; f++) {
			set = d / geo->far_set_size;
			d += geo->near_copies;

			if ((geo->raid_disks % geo->far_set_size) &&
			    (d > last_far_set_start)) {
				d -= last_far_set_start;
				d %= last_far_set_size;
				d += last_far_set_start;
			} else {
				d %= geo->far_set_size;
				d += geo->far_set_size * set;
			}
			s += geo->stride;
			r10bio->devs[slot].devnum = d;
			r10bio->devs[slot].addr = s;
			slot++;
		}
		dev++;
		if (dev >= geo->raid_disks) {
			dev = 0;
			sector += (geo->chunk_mask + 1);
		}
	}
}

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
{
	struct geom *geo = &conf->geo;

	if (conf->reshape_progress != MaxSector &&
	    ((r10bio->sector >= conf->reshape_progress) !=
	     conf->mddev->reshape_backwards)) {
		set_bit(R10BIO_Previous, &r10bio->state);
		geo = &conf->prev;
	} else
		clear_bit(R10BIO_Previous, &r10bio->state);

	__raid10_find_phys(geo, r10bio);
}

static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	sector_t offset, chunk, vchunk;
	/* Never use conf->prev as this is only called during resync
	 * or recovery, so reshape isn't happening
	 */
	struct geom *geo = &conf->geo;
	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
	int far_set_size = geo->far_set_size;
	int last_far_set_start;

	if (geo->raid_disks % geo->far_set_size) {
		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
		last_far_set_start *= geo->far_set_size;

		if (dev >= last_far_set_start) {
			far_set_size = geo->far_set_size;
			far_set_size += (geo->raid_disks % geo->far_set_size);
			far_set_start = last_far_set_start;
		}
	}

	offset = sector & geo->chunk_mask;
	if (geo->far_offset) {
		int fc;
		chunk = sector >> geo->chunk_shift;
		fc = sector_div(chunk, geo->far_copies);
		dev -= fc * geo->near_copies;
		if (dev < far_set_start)
			dev += far_set_size;
	} else {
		while (sector >= geo->stride) {
			sector -= geo->stride;
			if (dev < (geo->near_copies + far_set_start))
				dev += far_set_size - geo->near_copies;
			else
				dev -= geo->near_copies;
		}
		chunk = sector >> geo->chunk_shift;
	}
	vchunk = chunk * geo->raid_disks + dev;
	sector_div(vchunk, geo->near_copies);
	return (vchunk << geo->chunk_shift) + offset;
}

/*
 * This routine returns the disk from which the requested read should
 * be done. There is a per-array 'next expected sequential IO' sector
 * number - if this matches on the next IO then we use the last disk.
 * There is also a per-disk 'last known head position' sector that is
 * maintained from IRQ contexts; both the normal and the resync IO
 * completion handlers update this position correctly. If there is no
 * perfect sequential match then we pick the disk whose head is closest.
 *
 * If there are 2 mirrors in the same 2 devices, performance degrades
 * because position is mirror, not device based.
 *
 * The rdev for the device selected will have nr_pending incremented.
 */

/*
 * FIXME: possibly should rethink readbalancing and do it differently
 * depending on near_copies / far_copies geometry.
 */
static struct md_rdev *read_balance(struct r10conf *conf,
				    struct r10bio *r10_bio,
				    int *max_sectors)
{
	const sector_t this_sector = r10_bio->sector;
	int disk, slot;
	int sectors = r10_bio->sectors;
	int best_good_sectors;
	sector_t new_distance, best_dist;
	struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL;
	int do_balance;
	int best_dist_slot, best_pending_slot;
	bool has_nonrot_disk = false;
	unsigned int min_pending;
	struct geom *geo = &conf->geo;

	raid10_find_phys(conf, r10_bio);
	rcu_read_lock();
	best_dist_slot = -1;
	min_pending = UINT_MAX;
	best_dist_rdev = NULL;
	best_pending_rdev = NULL;
	best_dist = MaxSector;
	best_good_sectors = 0;
	do_balance = 1;
	clear_bit(R10BIO_FailFast, &r10_bio->state);
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
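	 *
	 * (Illustrative numbers only: if a resync is in progress and
	 * conf->next_resync were sector 1000000, an 8-sector read starting
	 * at sector 999996 would reach past next_resync, so do_balance
	 * would be cleared and the first readable disk found below used.)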
	 */
	if ((conf->mddev->recovery_cp < MaxSector
	     && (this_sector + sectors >= conf->next_resync)) ||
	    (mddev_is_clustered(conf->mddev) &&
	     md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
					    this_sector + sectors)))
		do_balance = 0;

	for (slot = 0; slot < conf->copies; slot++) {
		sector_t first_bad;
		int bad_sectors;
		sector_t dev_sector;
		unsigned int pending;
		bool nonrot;

		if (r10_bio->devs[slot].bio == IO_BLOCKED)
			continue;
		disk = r10_bio->devs[slot].devnum;
		rdev = rcu_dereference(conf->mirrors[disk].replacement);
		if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
		    r10_bio->devs[slot].addr + sectors >
		    rdev->recovery_offset) {
			/*
			 * Read replacement first to prevent reading both rdev
			 * and replacement as NULL during replacement replace
			 * rdev.
			 */
			smp_mb();
			rdev = rcu_dereference(conf->mirrors[disk].rdev);
		}
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(In_sync, &rdev->flags) &&
		    r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
			continue;

		dev_sector = r10_bio->devs[slot].addr;
		if (is_badblock(rdev, dev_sector, sectors,
				&first_bad, &bad_sectors)) {
			if (best_dist < MaxSector)
				/* Already have a better slot */
				continue;
			if (first_bad <= dev_sector) {
				/* Cannot read here. If this is the
				 * 'primary' device, then we must not read
				 * beyond 'bad_sectors' from another device.
				 */
				bad_sectors -= (dev_sector - first_bad);
				if (!do_balance && sectors > bad_sectors)
					sectors = bad_sectors;
				if (best_good_sectors > sectors)
					best_good_sectors = sectors;
			} else {
				sector_t good_sectors =
					first_bad - dev_sector;
				if (good_sectors > best_good_sectors) {
					best_good_sectors = good_sectors;
					best_dist_slot = slot;
					best_dist_rdev = rdev;
				}
				if (!do_balance)
					/* Must read from here */
					break;
			}
			continue;
		} else
			best_good_sectors = sectors;

		if (!do_balance)
			break;

		nonrot = bdev_nonrot(rdev->bdev);
		has_nonrot_disk |= nonrot;
		pending = atomic_read(&rdev->nr_pending);
		if (min_pending > pending && nonrot) {
			min_pending = pending;
			best_pending_slot = slot;
			best_pending_rdev = rdev;
		}

		if (best_dist_slot >= 0)
			/* At least 2 disks to choose from so failfast is OK */
			set_bit(R10BIO_FailFast, &r10_bio->state);
		/* This optimisation is debatable, and completely destroys
		 * sequential read speed for 'far copies' arrays.  So only
		 * keep it for 'near' arrays, and review those later.
		 */
		if (geo->near_copies > 1 && !pending)
			new_distance = 0;

		/* for far > 1 always use the lowest address */
		else if (geo->far_copies > 1)
			new_distance = r10_bio->devs[slot].addr;
		else
			new_distance = abs(r10_bio->devs[slot].addr -
					   conf->mirrors[disk].head_position);

		if (new_distance < best_dist) {
			best_dist = new_distance;
			best_dist_slot = slot;
			best_dist_rdev = rdev;
		}
	}
	if (slot >= conf->copies) {
		if (has_nonrot_disk) {
			slot = best_pending_slot;
			rdev = best_pending_rdev;
		} else {
			slot = best_dist_slot;
			rdev = best_dist_rdev;
		}
	}

	if (slot >= 0) {
		atomic_inc(&rdev->nr_pending);
		r10_bio->read_slot = slot;
	} else
		rdev = NULL;
	rcu_read_unlock();
	*max_sectors = best_good_sectors;

	return rdev;
}

static void flush_pending_writes(struct r10conf *conf)
{
	/* Any writes that have been queued but are awaiting
	 * bitmap updates get flushed here.
	 */
	spin_lock_irq(&conf->device_lock);

	if (conf->pending_bio_list.head) {
		struct blk_plug plug;
		struct bio *bio;

		bio = bio_list_get(&conf->pending_bio_list);
		spin_unlock_irq(&conf->device_lock);

		/*
		 * As this is called in a wait_event() loop (see freeze_array),
		 * current->state might be TASK_UNINTERRUPTIBLE which will
		 * cause a warning when we prepare to wait again.  As it is
		 * rare that this path is taken, it is perfectly safe to force
		 * us to go around the wait_event() loop again, so the warning
		 * is a false-positive.
		 * Silence the warning by resetting thread state.
		 */
		__set_current_state(TASK_RUNNING);

		blk_start_plug(&plug);
		raid1_prepare_flush_writes(conf->mddev->bitmap);
		wake_up(&conf->wait_barrier);

		while (bio) { /* submit pending writes */
			struct bio *next = bio->bi_next;

			raid1_submit_write(bio);
			bio = next;
			cond_resched();
		}
		blk_finish_plug(&plug);
	} else
		spin_unlock_irq(&conf->device_lock);
}

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 * is no background IO happening.  It must arrange to call
 * allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 * there is no normal IO happening.  It must arrange to call
 * lower_barrier when the particular background IO completes.
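 *
 * A minimal usage sketch (illustrative only; simplified from the real
 * callers later in this file):
 *
 *	regular IO path:
 *		wait_barrier(conf, false);
 *		...submit the request...
 *		allow_barrier(conf);
 *
 *	resync/recovery path:
 *		raise_barrier(conf, 0);
 *		...issue the background IO...
 *		lower_barrier(conf);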
 */

static void raise_barrier(struct r10conf *conf, int force)
{
	write_seqlock_irq(&conf->resync_lock);

	if (WARN_ON_ONCE(force && !conf->barrier))
		force = false;

	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_barrier(conf, force || !conf->nr_waiting);

	/* block any new IO from starting */
	WRITE_ONCE(conf->barrier, conf->barrier + 1);

	/* Now wait for all pending IO to complete */
	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
				 conf->barrier < RESYNC_DEPTH);

	write_sequnlock_irq(&conf->resync_lock);
}

static void lower_barrier(struct r10conf *conf)
{
	unsigned long flags;

	write_seqlock_irqsave(&conf->resync_lock, flags);
	WRITE_ONCE(conf->barrier, conf->barrier - 1);
	write_sequnlock_irqrestore(&conf->resync_lock, flags);
	wake_up(&conf->wait_barrier);
}

static bool stop_waiting_barrier(struct r10conf *conf)
{
	struct bio_list *bio_list = current->bio_list;
	struct md_thread *thread;

	/* barrier is dropped */
	if (!conf->barrier)
		return true;

	/*
	 * If there are already pending requests (preventing the barrier from
	 * rising completely), and the pre-process bio queue isn't empty, then
	 * don't wait, as we need to empty that queue to get the nr_pending
	 * count down.
	 */
	if (atomic_read(&conf->nr_pending) && bio_list &&
	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
		return true;

	/* daemon thread must exist while handling io */
	thread = rcu_dereference_protected(conf->mddev->thread, true);
	/*
	 * move on if io is issued from raid10d(); nr_pending is not released
	 * from the original io (see handle_read_error()). All raise_barrier()
	 * calls are blocked until this io is done.
	 */
	if (thread->tsk == current) {
		WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0);
		return true;
	}

	return false;
}

static bool wait_barrier_nolock(struct r10conf *conf)
{
	unsigned int seq = read_seqbegin(&conf->resync_lock);

	if (READ_ONCE(conf->barrier))
		return false;

	atomic_inc(&conf->nr_pending);
	if (!read_seqretry(&conf->resync_lock, seq))
		return true;

	if (atomic_dec_and_test(&conf->nr_pending))
		wake_up_barrier(conf);

	return false;
}

static bool wait_barrier(struct r10conf *conf, bool nowait)
{
	bool ret = true;

	if (wait_barrier_nolock(conf))
		return true;

	write_seqlock_irq(&conf->resync_lock);
	if (conf->barrier) {
		/* Return false when nowait flag is set */
		if (nowait) {
			ret = false;
		} else {
			conf->nr_waiting++;
			raid10_log(conf->mddev, "wait barrier");
			wait_event_barrier(conf, stop_waiting_barrier(conf));
			conf->nr_waiting--;
		}
		if (!conf->nr_waiting)
			wake_up(&conf->wait_barrier);
	}
	/* Only increment nr_pending when we wait */
	if (ret)
		atomic_inc(&conf->nr_pending);
	write_sequnlock_irq(&conf->resync_lock);
	return ret;
}

static void allow_barrier(struct r10conf *conf)
{
	if ((atomic_dec_and_test(&conf->nr_pending)) ||
	    (conf->array_freeze_pending))
		wake_up_barrier(conf);
}

static void freeze_array(struct r10conf *conf, int extra)
{
	/* stop syncio and normal IO and wait for everything to
	 * go quiet.
	 * We increment barrier and nr_waiting, and then
	 * wait until nr_pending matches nr_queued+extra.
	 * This is called in the context of one normal IO request
	 * that has failed.
	 * Thus any sync request that might be pending
	 * will be blocked by nr_pending, and we need to wait for
	 * pending IO requests to complete or be queued for re-try.
	 * Thus the number queued (nr_queued) plus this request (extra)
	 * must match the number of pending IOs (nr_pending) before
	 * we continue.
	 */
	write_seqlock_irq(&conf->resync_lock);
	conf->array_freeze_pending++;
	WRITE_ONCE(conf->barrier, conf->barrier + 1);
	conf->nr_waiting++;
	wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) ==
			conf->nr_queued + extra, flush_pending_writes(conf));
	conf->array_freeze_pending--;
	write_sequnlock_irq(&conf->resync_lock);
}

static void unfreeze_array(struct r10conf *conf)
{
	/* reverse the effect of the freeze */
	write_seqlock_irq(&conf->resync_lock);
	WRITE_ONCE(conf->barrier, conf->barrier - 1);
	conf->nr_waiting--;
	wake_up(&conf->wait_barrier);
	write_sequnlock_irq(&conf->resync_lock);
}

static sector_t choose_data_offset(struct r10bio *r10_bio,
				   struct md_rdev *rdev)
{
	if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
	    test_bit(R10BIO_Previous, &r10_bio->state))
		return rdev->data_offset;
	else
		return rdev->new_data_offset;
}

static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, cb);
	struct mddev *mddev = plug->cb.data;
	struct r10conf *conf = mddev->private;
	struct bio *bio;

	if (from_schedule) {
		spin_lock_irq(&conf->device_lock);
		bio_list_merge(&conf->pending_bio_list, &plug->pending);
		spin_unlock_irq(&conf->device_lock);
		wake_up_barrier(conf);
		md_wakeup_thread(mddev->thread);
		kfree(plug);
		return;
	}

	/* we aren't scheduling, so we can do the write-out directly.
*/ 112662306a36Sopenharmony_ci bio = bio_list_get(&plug->pending); 112762306a36Sopenharmony_ci raid1_prepare_flush_writes(mddev->bitmap); 112862306a36Sopenharmony_ci wake_up_barrier(conf); 112962306a36Sopenharmony_ci 113062306a36Sopenharmony_ci while (bio) { /* submit pending writes */ 113162306a36Sopenharmony_ci struct bio *next = bio->bi_next; 113262306a36Sopenharmony_ci 113362306a36Sopenharmony_ci raid1_submit_write(bio); 113462306a36Sopenharmony_ci bio = next; 113562306a36Sopenharmony_ci cond_resched(); 113662306a36Sopenharmony_ci } 113762306a36Sopenharmony_ci kfree(plug); 113862306a36Sopenharmony_ci} 113962306a36Sopenharmony_ci 114062306a36Sopenharmony_ci/* 114162306a36Sopenharmony_ci * 1. Register the new request and wait if the reconstruction thread has put 114262306a36Sopenharmony_ci * up a bar for new requests. Continue immediately if no resync is active 114362306a36Sopenharmony_ci * currently. 114462306a36Sopenharmony_ci * 2. If IO spans the reshape position. Need to wait for reshape to pass. 114562306a36Sopenharmony_ci */ 114662306a36Sopenharmony_cistatic bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, 114762306a36Sopenharmony_ci struct bio *bio, sector_t sectors) 114862306a36Sopenharmony_ci{ 114962306a36Sopenharmony_ci /* Bail out if REQ_NOWAIT is set for the bio */ 115062306a36Sopenharmony_ci if (!wait_barrier(conf, bio->bi_opf & REQ_NOWAIT)) { 115162306a36Sopenharmony_ci bio_wouldblock_error(bio); 115262306a36Sopenharmony_ci return false; 115362306a36Sopenharmony_ci } 115462306a36Sopenharmony_ci while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 115562306a36Sopenharmony_ci bio->bi_iter.bi_sector < conf->reshape_progress && 115662306a36Sopenharmony_ci bio->bi_iter.bi_sector + sectors > conf->reshape_progress) { 115762306a36Sopenharmony_ci allow_barrier(conf); 115862306a36Sopenharmony_ci if (bio->bi_opf & REQ_NOWAIT) { 115962306a36Sopenharmony_ci bio_wouldblock_error(bio); 116062306a36Sopenharmony_ci return false; 116162306a36Sopenharmony_ci } 116262306a36Sopenharmony_ci raid10_log(conf->mddev, "wait reshape"); 116362306a36Sopenharmony_ci wait_event(conf->wait_barrier, 116462306a36Sopenharmony_ci conf->reshape_progress <= bio->bi_iter.bi_sector || 116562306a36Sopenharmony_ci conf->reshape_progress >= bio->bi_iter.bi_sector + 116662306a36Sopenharmony_ci sectors); 116762306a36Sopenharmony_ci wait_barrier(conf, false); 116862306a36Sopenharmony_ci } 116962306a36Sopenharmony_ci return true; 117062306a36Sopenharmony_ci} 117162306a36Sopenharmony_ci 117262306a36Sopenharmony_cistatic void raid10_read_request(struct mddev *mddev, struct bio *bio, 117362306a36Sopenharmony_ci struct r10bio *r10_bio, bool io_accounting) 117462306a36Sopenharmony_ci{ 117562306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 117662306a36Sopenharmony_ci struct bio *read_bio; 117762306a36Sopenharmony_ci const enum req_op op = bio_op(bio); 117862306a36Sopenharmony_ci const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 117962306a36Sopenharmony_ci int max_sectors; 118062306a36Sopenharmony_ci struct md_rdev *rdev; 118162306a36Sopenharmony_ci char b[BDEVNAME_SIZE]; 118262306a36Sopenharmony_ci int slot = r10_bio->read_slot; 118362306a36Sopenharmony_ci struct md_rdev *err_rdev = NULL; 118462306a36Sopenharmony_ci gfp_t gfp = GFP_NOIO; 118562306a36Sopenharmony_ci 118662306a36Sopenharmony_ci if (slot >= 0 && r10_bio->devs[slot].rdev) { 118762306a36Sopenharmony_ci /* 118862306a36Sopenharmony_ci * This is an error retry, but we cannot 118962306a36Sopenharmony_ci * safely dereference the 
rdev in the r10_bio, 119062306a36Sopenharmony_ci * we must use the one in conf. 119162306a36Sopenharmony_ci * If it has already been disconnected (unlikely) 119262306a36Sopenharmony_ci * we lose the device name in error messages. 119362306a36Sopenharmony_ci */ 119462306a36Sopenharmony_ci int disk; 119562306a36Sopenharmony_ci /* 119662306a36Sopenharmony_ci * As we are blocking raid10, it is a little safer to 119762306a36Sopenharmony_ci * use __GFP_HIGH. 119862306a36Sopenharmony_ci */ 119962306a36Sopenharmony_ci gfp = GFP_NOIO | __GFP_HIGH; 120062306a36Sopenharmony_ci 120162306a36Sopenharmony_ci rcu_read_lock(); 120262306a36Sopenharmony_ci disk = r10_bio->devs[slot].devnum; 120362306a36Sopenharmony_ci err_rdev = rcu_dereference(conf->mirrors[disk].rdev); 120462306a36Sopenharmony_ci if (err_rdev) 120562306a36Sopenharmony_ci snprintf(b, sizeof(b), "%pg", err_rdev->bdev); 120662306a36Sopenharmony_ci else { 120762306a36Sopenharmony_ci strcpy(b, "???"); 120862306a36Sopenharmony_ci /* This never gets dereferenced */ 120962306a36Sopenharmony_ci err_rdev = r10_bio->devs[slot].rdev; 121062306a36Sopenharmony_ci } 121162306a36Sopenharmony_ci rcu_read_unlock(); 121262306a36Sopenharmony_ci } 121362306a36Sopenharmony_ci 121462306a36Sopenharmony_ci if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) 121562306a36Sopenharmony_ci return; 121662306a36Sopenharmony_ci rdev = read_balance(conf, r10_bio, &max_sectors); 121762306a36Sopenharmony_ci if (!rdev) { 121862306a36Sopenharmony_ci if (err_rdev) { 121962306a36Sopenharmony_ci pr_crit_ratelimited("md/raid10:%s: %s: unrecoverable I/O read error for block %llu\n", 122062306a36Sopenharmony_ci mdname(mddev), b, 122162306a36Sopenharmony_ci (unsigned long long)r10_bio->sector); 122262306a36Sopenharmony_ci } 122362306a36Sopenharmony_ci raid_end_bio_io(r10_bio); 122462306a36Sopenharmony_ci return; 122562306a36Sopenharmony_ci } 122662306a36Sopenharmony_ci if (err_rdev) 122762306a36Sopenharmony_ci pr_err_ratelimited("md/raid10:%s: %pg: redirecting sector %llu to another mirror\n", 122862306a36Sopenharmony_ci mdname(mddev), 122962306a36Sopenharmony_ci rdev->bdev, 123062306a36Sopenharmony_ci (unsigned long long)r10_bio->sector); 123162306a36Sopenharmony_ci if (max_sectors < bio_sectors(bio)) { 123262306a36Sopenharmony_ci struct bio *split = bio_split(bio, max_sectors, 123362306a36Sopenharmony_ci gfp, &conf->bio_split); 123462306a36Sopenharmony_ci bio_chain(split, bio); 123562306a36Sopenharmony_ci allow_barrier(conf); 123662306a36Sopenharmony_ci submit_bio_noacct(bio); 123762306a36Sopenharmony_ci wait_barrier(conf, false); 123862306a36Sopenharmony_ci bio = split; 123962306a36Sopenharmony_ci r10_bio->master_bio = bio; 124062306a36Sopenharmony_ci r10_bio->sectors = max_sectors; 124162306a36Sopenharmony_ci } 124262306a36Sopenharmony_ci slot = r10_bio->read_slot; 124362306a36Sopenharmony_ci 124462306a36Sopenharmony_ci if (io_accounting) { 124562306a36Sopenharmony_ci md_account_bio(mddev, &bio); 124662306a36Sopenharmony_ci r10_bio->master_bio = bio; 124762306a36Sopenharmony_ci } 124862306a36Sopenharmony_ci read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_ci r10_bio->devs[slot].bio = read_bio; 125162306a36Sopenharmony_ci r10_bio->devs[slot].rdev = rdev; 125262306a36Sopenharmony_ci 125362306a36Sopenharmony_ci read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr + 125462306a36Sopenharmony_ci choose_data_offset(r10_bio, rdev); 125562306a36Sopenharmony_ci read_bio->bi_end_io = raid10_end_read_request; 
125662306a36Sopenharmony_ci read_bio->bi_opf = op | do_sync; 125762306a36Sopenharmony_ci if (test_bit(FailFast, &rdev->flags) && 125862306a36Sopenharmony_ci test_bit(R10BIO_FailFast, &r10_bio->state)) 125962306a36Sopenharmony_ci read_bio->bi_opf |= MD_FAILFAST; 126062306a36Sopenharmony_ci read_bio->bi_private = r10_bio; 126162306a36Sopenharmony_ci 126262306a36Sopenharmony_ci if (mddev->gendisk) 126362306a36Sopenharmony_ci trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), 126462306a36Sopenharmony_ci r10_bio->sector); 126562306a36Sopenharmony_ci submit_bio_noacct(read_bio); 126662306a36Sopenharmony_ci return; 126762306a36Sopenharmony_ci} 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_cistatic void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, 127062306a36Sopenharmony_ci struct bio *bio, bool replacement, 127162306a36Sopenharmony_ci int n_copy) 127262306a36Sopenharmony_ci{ 127362306a36Sopenharmony_ci const enum req_op op = bio_op(bio); 127462306a36Sopenharmony_ci const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 127562306a36Sopenharmony_ci const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; 127662306a36Sopenharmony_ci unsigned long flags; 127762306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 127862306a36Sopenharmony_ci struct md_rdev *rdev; 127962306a36Sopenharmony_ci int devnum = r10_bio->devs[n_copy].devnum; 128062306a36Sopenharmony_ci struct bio *mbio; 128162306a36Sopenharmony_ci 128262306a36Sopenharmony_ci if (replacement) { 128362306a36Sopenharmony_ci rdev = conf->mirrors[devnum].replacement; 128462306a36Sopenharmony_ci if (rdev == NULL) { 128562306a36Sopenharmony_ci /* Replacement just got moved to main 'rdev' */ 128662306a36Sopenharmony_ci smp_mb(); 128762306a36Sopenharmony_ci rdev = conf->mirrors[devnum].rdev; 128862306a36Sopenharmony_ci } 128962306a36Sopenharmony_ci } else 129062306a36Sopenharmony_ci rdev = conf->mirrors[devnum].rdev; 129162306a36Sopenharmony_ci 129262306a36Sopenharmony_ci mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set); 129362306a36Sopenharmony_ci if (replacement) 129462306a36Sopenharmony_ci r10_bio->devs[n_copy].repl_bio = mbio; 129562306a36Sopenharmony_ci else 129662306a36Sopenharmony_ci r10_bio->devs[n_copy].bio = mbio; 129762306a36Sopenharmony_ci 129862306a36Sopenharmony_ci mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + 129962306a36Sopenharmony_ci choose_data_offset(r10_bio, rdev)); 130062306a36Sopenharmony_ci mbio->bi_end_io = raid10_end_write_request; 130162306a36Sopenharmony_ci mbio->bi_opf = op | do_sync | do_fua; 130262306a36Sopenharmony_ci if (!replacement && test_bit(FailFast, 130362306a36Sopenharmony_ci &conf->mirrors[devnum].rdev->flags) 130462306a36Sopenharmony_ci && enough(conf, devnum)) 130562306a36Sopenharmony_ci mbio->bi_opf |= MD_FAILFAST; 130662306a36Sopenharmony_ci mbio->bi_private = r10_bio; 130762306a36Sopenharmony_ci 130862306a36Sopenharmony_ci if (conf->mddev->gendisk) 130962306a36Sopenharmony_ci trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), 131062306a36Sopenharmony_ci r10_bio->sector); 131162306a36Sopenharmony_ci /* flush_pending_writes() needs access to the rdev so...*/ 131262306a36Sopenharmony_ci mbio->bi_bdev = (void *)rdev; 131362306a36Sopenharmony_ci 131462306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 131562306a36Sopenharmony_ci 131662306a36Sopenharmony_ci if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) { 131762306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 131862306a36Sopenharmony_ci 
bio_list_add(&conf->pending_bio_list, mbio); 131962306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 132062306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 132162306a36Sopenharmony_ci } 132262306a36Sopenharmony_ci} 132362306a36Sopenharmony_ci 132462306a36Sopenharmony_cistatic struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror, 132562306a36Sopenharmony_ci struct md_rdev **prrdev) 132662306a36Sopenharmony_ci{ 132762306a36Sopenharmony_ci struct md_rdev *rdev, *rrdev; 132862306a36Sopenharmony_ci 132962306a36Sopenharmony_ci rrdev = rcu_dereference(mirror->replacement); 133062306a36Sopenharmony_ci /* 133162306a36Sopenharmony_ci * Read replacement first to prevent reading both rdev and 133262306a36Sopenharmony_ci * replacement as NULL during replacement replace rdev. 133362306a36Sopenharmony_ci */ 133462306a36Sopenharmony_ci smp_mb(); 133562306a36Sopenharmony_ci rdev = rcu_dereference(mirror->rdev); 133662306a36Sopenharmony_ci if (rdev == rrdev) 133762306a36Sopenharmony_ci rrdev = NULL; 133862306a36Sopenharmony_ci 133962306a36Sopenharmony_ci *prrdev = rrdev; 134062306a36Sopenharmony_ci return rdev; 134162306a36Sopenharmony_ci} 134262306a36Sopenharmony_ci 134362306a36Sopenharmony_cistatic void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) 134462306a36Sopenharmony_ci{ 134562306a36Sopenharmony_ci int i; 134662306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 134762306a36Sopenharmony_ci struct md_rdev *blocked_rdev; 134862306a36Sopenharmony_ci 134962306a36Sopenharmony_ciretry_wait: 135062306a36Sopenharmony_ci blocked_rdev = NULL; 135162306a36Sopenharmony_ci rcu_read_lock(); 135262306a36Sopenharmony_ci for (i = 0; i < conf->copies; i++) { 135362306a36Sopenharmony_ci struct md_rdev *rdev, *rrdev; 135462306a36Sopenharmony_ci 135562306a36Sopenharmony_ci rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev); 135662306a36Sopenharmony_ci if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 135762306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 135862306a36Sopenharmony_ci blocked_rdev = rdev; 135962306a36Sopenharmony_ci break; 136062306a36Sopenharmony_ci } 136162306a36Sopenharmony_ci if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 136262306a36Sopenharmony_ci atomic_inc(&rrdev->nr_pending); 136362306a36Sopenharmony_ci blocked_rdev = rrdev; 136462306a36Sopenharmony_ci break; 136562306a36Sopenharmony_ci } 136662306a36Sopenharmony_ci 136762306a36Sopenharmony_ci if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 136862306a36Sopenharmony_ci sector_t first_bad; 136962306a36Sopenharmony_ci sector_t dev_sector = r10_bio->devs[i].addr; 137062306a36Sopenharmony_ci int bad_sectors; 137162306a36Sopenharmony_ci int is_bad; 137262306a36Sopenharmony_ci 137362306a36Sopenharmony_ci /* 137462306a36Sopenharmony_ci * Discard request doesn't care the write result 137562306a36Sopenharmony_ci * so it doesn't need to wait blocked disk here. 
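 * (Discard r10_bios are set up with ->sectors == 0 in
 * raid10_handle_discard(), which is exactly what the check below
 * tests for.)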
137662306a36Sopenharmony_ci */ 137762306a36Sopenharmony_ci if (!r10_bio->sectors) 137862306a36Sopenharmony_ci continue; 137962306a36Sopenharmony_ci 138062306a36Sopenharmony_ci is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, 138162306a36Sopenharmony_ci &first_bad, &bad_sectors); 138262306a36Sopenharmony_ci if (is_bad < 0) { 138362306a36Sopenharmony_ci /* 138462306a36Sopenharmony_ci * Mustn't write here until the bad block 138562306a36Sopenharmony_ci * is acknowledged 138662306a36Sopenharmony_ci */ 138762306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 138862306a36Sopenharmony_ci set_bit(BlockedBadBlocks, &rdev->flags); 138962306a36Sopenharmony_ci blocked_rdev = rdev; 139062306a36Sopenharmony_ci break; 139162306a36Sopenharmony_ci } 139262306a36Sopenharmony_ci } 139362306a36Sopenharmony_ci } 139462306a36Sopenharmony_ci rcu_read_unlock(); 139562306a36Sopenharmony_ci 139662306a36Sopenharmony_ci if (unlikely(blocked_rdev)) { 139762306a36Sopenharmony_ci /* Have to wait for this device to get unblocked, then retry */ 139862306a36Sopenharmony_ci allow_barrier(conf); 139962306a36Sopenharmony_ci raid10_log(conf->mddev, "%s wait rdev %d blocked", 140062306a36Sopenharmony_ci __func__, blocked_rdev->raid_disk); 140162306a36Sopenharmony_ci md_wait_for_blocked_rdev(blocked_rdev, mddev); 140262306a36Sopenharmony_ci wait_barrier(conf, false); 140362306a36Sopenharmony_ci goto retry_wait; 140462306a36Sopenharmony_ci } 140562306a36Sopenharmony_ci} 140662306a36Sopenharmony_ci 140762306a36Sopenharmony_cistatic void raid10_write_request(struct mddev *mddev, struct bio *bio, 140862306a36Sopenharmony_ci struct r10bio *r10_bio) 140962306a36Sopenharmony_ci{ 141062306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 141162306a36Sopenharmony_ci int i; 141262306a36Sopenharmony_ci sector_t sectors; 141362306a36Sopenharmony_ci int max_sectors; 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci if ((mddev_is_clustered(mddev) && 141662306a36Sopenharmony_ci md_cluster_ops->area_resyncing(mddev, WRITE, 141762306a36Sopenharmony_ci bio->bi_iter.bi_sector, 141862306a36Sopenharmony_ci bio_end_sector(bio)))) { 141962306a36Sopenharmony_ci DEFINE_WAIT(w); 142062306a36Sopenharmony_ci /* Bail out if REQ_NOWAIT is set for the bio */ 142162306a36Sopenharmony_ci if (bio->bi_opf & REQ_NOWAIT) { 142262306a36Sopenharmony_ci bio_wouldblock_error(bio); 142362306a36Sopenharmony_ci return; 142462306a36Sopenharmony_ci } 142562306a36Sopenharmony_ci for (;;) { 142662306a36Sopenharmony_ci prepare_to_wait(&conf->wait_barrier, 142762306a36Sopenharmony_ci &w, TASK_IDLE); 142862306a36Sopenharmony_ci if (!md_cluster_ops->area_resyncing(mddev, WRITE, 142962306a36Sopenharmony_ci bio->bi_iter.bi_sector, bio_end_sector(bio))) 143062306a36Sopenharmony_ci break; 143162306a36Sopenharmony_ci schedule(); 143262306a36Sopenharmony_ci } 143362306a36Sopenharmony_ci finish_wait(&conf->wait_barrier, &w); 143462306a36Sopenharmony_ci } 143562306a36Sopenharmony_ci 143662306a36Sopenharmony_ci sectors = r10_bio->sectors; 143762306a36Sopenharmony_ci if (!regular_request_wait(mddev, conf, bio, sectors)) 143862306a36Sopenharmony_ci return; 143962306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 144062306a36Sopenharmony_ci (mddev->reshape_backwards 144162306a36Sopenharmony_ci ? 
(bio->bi_iter.bi_sector < conf->reshape_safe && 144262306a36Sopenharmony_ci bio->bi_iter.bi_sector + sectors > conf->reshape_progress) 144362306a36Sopenharmony_ci : (bio->bi_iter.bi_sector + sectors > conf->reshape_safe && 144462306a36Sopenharmony_ci bio->bi_iter.bi_sector < conf->reshape_progress))) { 144562306a36Sopenharmony_ci /* Need to update reshape_position in metadata */ 144662306a36Sopenharmony_ci mddev->reshape_position = conf->reshape_progress; 144762306a36Sopenharmony_ci set_mask_bits(&mddev->sb_flags, 0, 144862306a36Sopenharmony_ci BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 144962306a36Sopenharmony_ci md_wakeup_thread(mddev->thread); 145062306a36Sopenharmony_ci if (bio->bi_opf & REQ_NOWAIT) { 145162306a36Sopenharmony_ci allow_barrier(conf); 145262306a36Sopenharmony_ci bio_wouldblock_error(bio); 145362306a36Sopenharmony_ci return; 145462306a36Sopenharmony_ci } 145562306a36Sopenharmony_ci raid10_log(conf->mddev, "wait reshape metadata"); 145662306a36Sopenharmony_ci wait_event(mddev->sb_wait, 145762306a36Sopenharmony_ci !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); 145862306a36Sopenharmony_ci 145962306a36Sopenharmony_ci conf->reshape_safe = mddev->reshape_position; 146062306a36Sopenharmony_ci } 146162306a36Sopenharmony_ci 146262306a36Sopenharmony_ci /* first select target devices under rcu_lock and 146362306a36Sopenharmony_ci * inc refcount on their rdev. Record them by setting 146462306a36Sopenharmony_ci * bios[x] to bio 146562306a36Sopenharmony_ci * If there are known/acknowledged bad blocks on any device 146662306a36Sopenharmony_ci * on which we have seen a write error, we want to avoid 146762306a36Sopenharmony_ci * writing to those blocks. This potentially requires several 146862306a36Sopenharmony_ci * writes to write around the bad blocks. Each set of writes 146962306a36Sopenharmony_ci * gets its own r10_bio with a set of bios attached. 
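 * (Illustrative example, not part of the original comment: if only the
 * first 16 sectors of a 64-sector write can be written to every device,
 * this r10_bio is trimmed to 16 sectors, the master bio is split and
 * chained, and the remaining 48 sectors are resubmitted with
 * submit_bio_noacct() to be picked up by a fresh r10_bio.)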
147062306a36Sopenharmony_ci */ 147162306a36Sopenharmony_ci 147262306a36Sopenharmony_ci r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 147362306a36Sopenharmony_ci raid10_find_phys(conf, r10_bio); 147462306a36Sopenharmony_ci 147562306a36Sopenharmony_ci wait_blocked_dev(mddev, r10_bio); 147662306a36Sopenharmony_ci 147762306a36Sopenharmony_ci rcu_read_lock(); 147862306a36Sopenharmony_ci max_sectors = r10_bio->sectors; 147962306a36Sopenharmony_ci 148062306a36Sopenharmony_ci for (i = 0; i < conf->copies; i++) { 148162306a36Sopenharmony_ci int d = r10_bio->devs[i].devnum; 148262306a36Sopenharmony_ci struct md_rdev *rdev, *rrdev; 148362306a36Sopenharmony_ci 148462306a36Sopenharmony_ci rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev); 148562306a36Sopenharmony_ci if (rdev && (test_bit(Faulty, &rdev->flags))) 148662306a36Sopenharmony_ci rdev = NULL; 148762306a36Sopenharmony_ci if (rrdev && (test_bit(Faulty, &rrdev->flags))) 148862306a36Sopenharmony_ci rrdev = NULL; 148962306a36Sopenharmony_ci 149062306a36Sopenharmony_ci r10_bio->devs[i].bio = NULL; 149162306a36Sopenharmony_ci r10_bio->devs[i].repl_bio = NULL; 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci if (!rdev && !rrdev) { 149462306a36Sopenharmony_ci set_bit(R10BIO_Degraded, &r10_bio->state); 149562306a36Sopenharmony_ci continue; 149662306a36Sopenharmony_ci } 149762306a36Sopenharmony_ci if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 149862306a36Sopenharmony_ci sector_t first_bad; 149962306a36Sopenharmony_ci sector_t dev_sector = r10_bio->devs[i].addr; 150062306a36Sopenharmony_ci int bad_sectors; 150162306a36Sopenharmony_ci int is_bad; 150262306a36Sopenharmony_ci 150362306a36Sopenharmony_ci is_bad = is_badblock(rdev, dev_sector, max_sectors, 150462306a36Sopenharmony_ci &first_bad, &bad_sectors); 150562306a36Sopenharmony_ci if (is_bad && first_bad <= dev_sector) { 150662306a36Sopenharmony_ci /* Cannot write here at all */ 150762306a36Sopenharmony_ci bad_sectors -= (dev_sector - first_bad); 150862306a36Sopenharmony_ci if (bad_sectors < max_sectors) 150962306a36Sopenharmony_ci /* Mustn't write more than bad_sectors 151062306a36Sopenharmony_ci * to other devices yet 151162306a36Sopenharmony_ci */ 151262306a36Sopenharmony_ci max_sectors = bad_sectors; 151362306a36Sopenharmony_ci /* We don't set R10BIO_Degraded as that 151462306a36Sopenharmony_ci * only applies if the disk is missing, 151562306a36Sopenharmony_ci * so it might be re-added, and we want to 151662306a36Sopenharmony_ci * know to recover this chunk. 151762306a36Sopenharmony_ci * In this case the device is here, and the 151862306a36Sopenharmony_ci * fact that this chunk is not in-sync is 151962306a36Sopenharmony_ci * recorded in the bad block log. 
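 * (Illustrative numbers: if first_bad lies 4 sectors before dev_sector
 * and bad_sectors is 12, then 8 bad sectors remain ahead of us, so
 * max_sectors for this r10_bio is clamped to 8 and this copy gets no
 * bio at all.)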
152062306a36Sopenharmony_ci */ 152162306a36Sopenharmony_ci continue; 152262306a36Sopenharmony_ci } 152362306a36Sopenharmony_ci if (is_bad) { 152462306a36Sopenharmony_ci int good_sectors = first_bad - dev_sector; 152562306a36Sopenharmony_ci if (good_sectors < max_sectors) 152662306a36Sopenharmony_ci max_sectors = good_sectors; 152762306a36Sopenharmony_ci } 152862306a36Sopenharmony_ci } 152962306a36Sopenharmony_ci if (rdev) { 153062306a36Sopenharmony_ci r10_bio->devs[i].bio = bio; 153162306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 153262306a36Sopenharmony_ci } 153362306a36Sopenharmony_ci if (rrdev) { 153462306a36Sopenharmony_ci r10_bio->devs[i].repl_bio = bio; 153562306a36Sopenharmony_ci atomic_inc(&rrdev->nr_pending); 153662306a36Sopenharmony_ci } 153762306a36Sopenharmony_ci } 153862306a36Sopenharmony_ci rcu_read_unlock(); 153962306a36Sopenharmony_ci 154062306a36Sopenharmony_ci if (max_sectors < r10_bio->sectors) 154162306a36Sopenharmony_ci r10_bio->sectors = max_sectors; 154262306a36Sopenharmony_ci 154362306a36Sopenharmony_ci if (r10_bio->sectors < bio_sectors(bio)) { 154462306a36Sopenharmony_ci struct bio *split = bio_split(bio, r10_bio->sectors, 154562306a36Sopenharmony_ci GFP_NOIO, &conf->bio_split); 154662306a36Sopenharmony_ci bio_chain(split, bio); 154762306a36Sopenharmony_ci allow_barrier(conf); 154862306a36Sopenharmony_ci submit_bio_noacct(bio); 154962306a36Sopenharmony_ci wait_barrier(conf, false); 155062306a36Sopenharmony_ci bio = split; 155162306a36Sopenharmony_ci r10_bio->master_bio = bio; 155262306a36Sopenharmony_ci } 155362306a36Sopenharmony_ci 155462306a36Sopenharmony_ci md_account_bio(mddev, &bio); 155562306a36Sopenharmony_ci r10_bio->master_bio = bio; 155662306a36Sopenharmony_ci atomic_set(&r10_bio->remaining, 1); 155762306a36Sopenharmony_ci md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 155862306a36Sopenharmony_ci 155962306a36Sopenharmony_ci for (i = 0; i < conf->copies; i++) { 156062306a36Sopenharmony_ci if (r10_bio->devs[i].bio) 156162306a36Sopenharmony_ci raid10_write_one_disk(mddev, r10_bio, bio, false, i); 156262306a36Sopenharmony_ci if (r10_bio->devs[i].repl_bio) 156362306a36Sopenharmony_ci raid10_write_one_disk(mddev, r10_bio, bio, true, i); 156462306a36Sopenharmony_ci } 156562306a36Sopenharmony_ci one_write_done(r10_bio); 156662306a36Sopenharmony_ci} 156762306a36Sopenharmony_ci 156862306a36Sopenharmony_cistatic void __make_request(struct mddev *mddev, struct bio *bio, int sectors) 156962306a36Sopenharmony_ci{ 157062306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 157162306a36Sopenharmony_ci struct r10bio *r10_bio; 157262306a36Sopenharmony_ci 157362306a36Sopenharmony_ci r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 157462306a36Sopenharmony_ci 157562306a36Sopenharmony_ci r10_bio->master_bio = bio; 157662306a36Sopenharmony_ci r10_bio->sectors = sectors; 157762306a36Sopenharmony_ci 157862306a36Sopenharmony_ci r10_bio->mddev = mddev; 157962306a36Sopenharmony_ci r10_bio->sector = bio->bi_iter.bi_sector; 158062306a36Sopenharmony_ci r10_bio->state = 0; 158162306a36Sopenharmony_ci r10_bio->read_slot = -1; 158262306a36Sopenharmony_ci memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * 158362306a36Sopenharmony_ci conf->geo.raid_disks); 158462306a36Sopenharmony_ci 158562306a36Sopenharmony_ci if (bio_data_dir(bio) == READ) 158662306a36Sopenharmony_ci raid10_read_request(mddev, bio, r10_bio, true); 158762306a36Sopenharmony_ci else 158862306a36Sopenharmony_ci raid10_write_request(mddev, bio, r10_bio); 
158962306a36Sopenharmony_ci} 159062306a36Sopenharmony_ci 159162306a36Sopenharmony_cistatic void raid_end_discard_bio(struct r10bio *r10bio) 159262306a36Sopenharmony_ci{ 159362306a36Sopenharmony_ci struct r10conf *conf = r10bio->mddev->private; 159462306a36Sopenharmony_ci struct r10bio *first_r10bio; 159562306a36Sopenharmony_ci 159662306a36Sopenharmony_ci while (atomic_dec_and_test(&r10bio->remaining)) { 159762306a36Sopenharmony_ci 159862306a36Sopenharmony_ci allow_barrier(conf); 159962306a36Sopenharmony_ci 160062306a36Sopenharmony_ci if (!test_bit(R10BIO_Discard, &r10bio->state)) { 160162306a36Sopenharmony_ci first_r10bio = (struct r10bio *)r10bio->master_bio; 160262306a36Sopenharmony_ci free_r10bio(r10bio); 160362306a36Sopenharmony_ci r10bio = first_r10bio; 160462306a36Sopenharmony_ci } else { 160562306a36Sopenharmony_ci md_write_end(r10bio->mddev); 160662306a36Sopenharmony_ci bio_endio(r10bio->master_bio); 160762306a36Sopenharmony_ci free_r10bio(r10bio); 160862306a36Sopenharmony_ci break; 160962306a36Sopenharmony_ci } 161062306a36Sopenharmony_ci } 161162306a36Sopenharmony_ci} 161262306a36Sopenharmony_ci 161362306a36Sopenharmony_cistatic void raid10_end_discard_request(struct bio *bio) 161462306a36Sopenharmony_ci{ 161562306a36Sopenharmony_ci struct r10bio *r10_bio = bio->bi_private; 161662306a36Sopenharmony_ci struct r10conf *conf = r10_bio->mddev->private; 161762306a36Sopenharmony_ci struct md_rdev *rdev = NULL; 161862306a36Sopenharmony_ci int dev; 161962306a36Sopenharmony_ci int slot, repl; 162062306a36Sopenharmony_ci 162162306a36Sopenharmony_ci /* 162262306a36Sopenharmony_ci * We don't care the return value of discard bio 162362306a36Sopenharmony_ci */ 162462306a36Sopenharmony_ci if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 162562306a36Sopenharmony_ci set_bit(R10BIO_Uptodate, &r10_bio->state); 162662306a36Sopenharmony_ci 162762306a36Sopenharmony_ci dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 162862306a36Sopenharmony_ci if (repl) 162962306a36Sopenharmony_ci rdev = conf->mirrors[dev].replacement; 163062306a36Sopenharmony_ci if (!rdev) { 163162306a36Sopenharmony_ci /* 163262306a36Sopenharmony_ci * raid10_remove_disk uses smp_mb to make sure rdev is set to 163362306a36Sopenharmony_ci * replacement before setting replacement to NULL. It can read 163462306a36Sopenharmony_ci * rdev first without barrier protect even replacement is NULL 163562306a36Sopenharmony_ci */ 163662306a36Sopenharmony_ci smp_rmb(); 163762306a36Sopenharmony_ci rdev = conf->mirrors[dev].rdev; 163862306a36Sopenharmony_ci } 163962306a36Sopenharmony_ci 164062306a36Sopenharmony_ci raid_end_discard_bio(r10_bio); 164162306a36Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 164262306a36Sopenharmony_ci} 164362306a36Sopenharmony_ci 164462306a36Sopenharmony_ci/* 164562306a36Sopenharmony_ci * There are some limitations to handle discard bio 164662306a36Sopenharmony_ci * 1st, the discard size is bigger than stripe_size*2. 
164762306a36Sopenharmony_ci * 2st, if the discard bio spans reshape progress, we use the old way to 164862306a36Sopenharmony_ci * handle discard bio 164962306a36Sopenharmony_ci */ 165062306a36Sopenharmony_cistatic int raid10_handle_discard(struct mddev *mddev, struct bio *bio) 165162306a36Sopenharmony_ci{ 165262306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 165362306a36Sopenharmony_ci struct geom *geo = &conf->geo; 165462306a36Sopenharmony_ci int far_copies = geo->far_copies; 165562306a36Sopenharmony_ci bool first_copy = true; 165662306a36Sopenharmony_ci struct r10bio *r10_bio, *first_r10bio; 165762306a36Sopenharmony_ci struct bio *split; 165862306a36Sopenharmony_ci int disk; 165962306a36Sopenharmony_ci sector_t chunk; 166062306a36Sopenharmony_ci unsigned int stripe_size; 166162306a36Sopenharmony_ci unsigned int stripe_data_disks; 166262306a36Sopenharmony_ci sector_t split_size; 166362306a36Sopenharmony_ci sector_t bio_start, bio_end; 166462306a36Sopenharmony_ci sector_t first_stripe_index, last_stripe_index; 166562306a36Sopenharmony_ci sector_t start_disk_offset; 166662306a36Sopenharmony_ci unsigned int start_disk_index; 166762306a36Sopenharmony_ci sector_t end_disk_offset; 166862306a36Sopenharmony_ci unsigned int end_disk_index; 166962306a36Sopenharmony_ci unsigned int remainder; 167062306a36Sopenharmony_ci 167162306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 167262306a36Sopenharmony_ci return -EAGAIN; 167362306a36Sopenharmony_ci 167462306a36Sopenharmony_ci if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) { 167562306a36Sopenharmony_ci bio_wouldblock_error(bio); 167662306a36Sopenharmony_ci return 0; 167762306a36Sopenharmony_ci } 167862306a36Sopenharmony_ci wait_barrier(conf, false); 167962306a36Sopenharmony_ci 168062306a36Sopenharmony_ci /* 168162306a36Sopenharmony_ci * Check reshape again to avoid reshape happens after checking 168262306a36Sopenharmony_ci * MD_RECOVERY_RESHAPE and before wait_barrier 168362306a36Sopenharmony_ci */ 168462306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 168562306a36Sopenharmony_ci goto out; 168662306a36Sopenharmony_ci 168762306a36Sopenharmony_ci if (geo->near_copies) 168862306a36Sopenharmony_ci stripe_data_disks = geo->raid_disks / geo->near_copies + 168962306a36Sopenharmony_ci geo->raid_disks % geo->near_copies; 169062306a36Sopenharmony_ci else 169162306a36Sopenharmony_ci stripe_data_disks = geo->raid_disks; 169262306a36Sopenharmony_ci 169362306a36Sopenharmony_ci stripe_size = stripe_data_disks << geo->chunk_shift; 169462306a36Sopenharmony_ci 169562306a36Sopenharmony_ci bio_start = bio->bi_iter.bi_sector; 169662306a36Sopenharmony_ci bio_end = bio_end_sector(bio); 169762306a36Sopenharmony_ci 169862306a36Sopenharmony_ci /* 169962306a36Sopenharmony_ci * Maybe one discard bio is smaller than strip size or across one 170062306a36Sopenharmony_ci * stripe and discard region is larger than one stripe size. For far 170162306a36Sopenharmony_ci * offset layout, if the discard region is not aligned with stripe 170262306a36Sopenharmony_ci * size, there is hole when we submit discard bio to member disk. 
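 * (Illustrative example with assumed geometry: 4 member disks,
 * near_copies = 2 and 64KiB chunks (128 sectors) give
 * stripe_data_disks = 4/2 + 0 = 2 and stripe_size = 2 << 7 = 256 sectors,
 * so only discards of at least 512 sectors take this path; smaller ones
 * jump to 'out', return -EAGAIN and are handled by the normal request
 * path instead.)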
170362306a36Sopenharmony_ci	 * For simplicity, we only handle discard bios whose discard region
170462306a36Sopenharmony_ci	 * is bigger than stripe_size * 2
170562306a36Sopenharmony_ci	 */
170662306a36Sopenharmony_ci	if (bio_sectors(bio) < stripe_size*2)
170762306a36Sopenharmony_ci		goto out;
170862306a36Sopenharmony_ci
170962306a36Sopenharmony_ci	/*
171062306a36Sopenharmony_ci	 * Keep the bio aligned with the stripe size.
171162306a36Sopenharmony_ci	 */
171262306a36Sopenharmony_ci	div_u64_rem(bio_start, stripe_size, &remainder);
171362306a36Sopenharmony_ci	if (remainder) {
171462306a36Sopenharmony_ci		split_size = stripe_size - remainder;
171562306a36Sopenharmony_ci		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
171662306a36Sopenharmony_ci		bio_chain(split, bio);
171762306a36Sopenharmony_ci		allow_barrier(conf);
171862306a36Sopenharmony_ci		/* Resend the first split part */
171962306a36Sopenharmony_ci		submit_bio_noacct(split);
172062306a36Sopenharmony_ci		wait_barrier(conf, false);
172162306a36Sopenharmony_ci	}
172262306a36Sopenharmony_ci	div_u64_rem(bio_end, stripe_size, &remainder);
172362306a36Sopenharmony_ci	if (remainder) {
172462306a36Sopenharmony_ci		split_size = bio_sectors(bio) - remainder;
172562306a36Sopenharmony_ci		split = bio_split(bio, split_size, GFP_NOIO, &conf->bio_split);
172662306a36Sopenharmony_ci		bio_chain(split, bio);
172762306a36Sopenharmony_ci		allow_barrier(conf);
172862306a36Sopenharmony_ci		/* Resend the second split part */
172962306a36Sopenharmony_ci		submit_bio_noacct(bio);
173062306a36Sopenharmony_ci		bio = split;
173162306a36Sopenharmony_ci		wait_barrier(conf, false);
173262306a36Sopenharmony_ci	}
173362306a36Sopenharmony_ci
173462306a36Sopenharmony_ci	bio_start = bio->bi_iter.bi_sector;
173562306a36Sopenharmony_ci	bio_end = bio_end_sector(bio);
173662306a36Sopenharmony_ci
173762306a36Sopenharmony_ci	/*
173862306a36Sopenharmony_ci	 * Raid10 uses chunks as the unit to store data. It is similar to raid0.
173962306a36Sopenharmony_ci	 * One stripe contains the chunks from all member disks (one chunk from
174062306a36Sopenharmony_ci	 * one disk at the same HBA address).
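 * (Worked example, illustrative only: with raid_disks = 4, near_copies = 2,
 * far_offset = 0 and 128-sector chunks, bio_start = 1664 is logical chunk
 * 13; 13 * near_copies = 26, so start_disk_index = 26 % 4 = 2,
 * first_stripe_index = 26 / 4 = 6 and
 * start_disk_offset = (1664 & 127) + (6 << 7) = 768.)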
For layout detail, see 'man md 4' 174162306a36Sopenharmony_ci */ 174262306a36Sopenharmony_ci chunk = bio_start >> geo->chunk_shift; 174362306a36Sopenharmony_ci chunk *= geo->near_copies; 174462306a36Sopenharmony_ci first_stripe_index = chunk; 174562306a36Sopenharmony_ci start_disk_index = sector_div(first_stripe_index, geo->raid_disks); 174662306a36Sopenharmony_ci if (geo->far_offset) 174762306a36Sopenharmony_ci first_stripe_index *= geo->far_copies; 174862306a36Sopenharmony_ci start_disk_offset = (bio_start & geo->chunk_mask) + 174962306a36Sopenharmony_ci (first_stripe_index << geo->chunk_shift); 175062306a36Sopenharmony_ci 175162306a36Sopenharmony_ci chunk = bio_end >> geo->chunk_shift; 175262306a36Sopenharmony_ci chunk *= geo->near_copies; 175362306a36Sopenharmony_ci last_stripe_index = chunk; 175462306a36Sopenharmony_ci end_disk_index = sector_div(last_stripe_index, geo->raid_disks); 175562306a36Sopenharmony_ci if (geo->far_offset) 175662306a36Sopenharmony_ci last_stripe_index *= geo->far_copies; 175762306a36Sopenharmony_ci end_disk_offset = (bio_end & geo->chunk_mask) + 175862306a36Sopenharmony_ci (last_stripe_index << geo->chunk_shift); 175962306a36Sopenharmony_ci 176062306a36Sopenharmony_ciretry_discard: 176162306a36Sopenharmony_ci r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); 176262306a36Sopenharmony_ci r10_bio->mddev = mddev; 176362306a36Sopenharmony_ci r10_bio->state = 0; 176462306a36Sopenharmony_ci r10_bio->sectors = 0; 176562306a36Sopenharmony_ci memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); 176662306a36Sopenharmony_ci wait_blocked_dev(mddev, r10_bio); 176762306a36Sopenharmony_ci 176862306a36Sopenharmony_ci /* 176962306a36Sopenharmony_ci * For far layout it needs more than one r10bio to cover all regions. 177062306a36Sopenharmony_ci * Inspired by raid10_sync_request, we can use the first r10bio->master_bio 177162306a36Sopenharmony_ci * to record the discard bio. Other r10bio->master_bio record the first 177262306a36Sopenharmony_ci * r10bio. The first r10bio only release after all other r10bios finish. 177362306a36Sopenharmony_ci * The discard bio returns only first r10bio finishes 177462306a36Sopenharmony_ci */ 177562306a36Sopenharmony_ci if (first_copy) { 177662306a36Sopenharmony_ci r10_bio->master_bio = bio; 177762306a36Sopenharmony_ci set_bit(R10BIO_Discard, &r10_bio->state); 177862306a36Sopenharmony_ci first_copy = false; 177962306a36Sopenharmony_ci first_r10bio = r10_bio; 178062306a36Sopenharmony_ci } else 178162306a36Sopenharmony_ci r10_bio->master_bio = (struct bio *)first_r10bio; 178262306a36Sopenharmony_ci 178362306a36Sopenharmony_ci /* 178462306a36Sopenharmony_ci * first select target devices under rcu_lock and 178562306a36Sopenharmony_ci * inc refcount on their rdev. 
Record them by setting 178662306a36Sopenharmony_ci * bios[x] to bio 178762306a36Sopenharmony_ci */ 178862306a36Sopenharmony_ci rcu_read_lock(); 178962306a36Sopenharmony_ci for (disk = 0; disk < geo->raid_disks; disk++) { 179062306a36Sopenharmony_ci struct md_rdev *rdev, *rrdev; 179162306a36Sopenharmony_ci 179262306a36Sopenharmony_ci rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev); 179362306a36Sopenharmony_ci r10_bio->devs[disk].bio = NULL; 179462306a36Sopenharmony_ci r10_bio->devs[disk].repl_bio = NULL; 179562306a36Sopenharmony_ci 179662306a36Sopenharmony_ci if (rdev && (test_bit(Faulty, &rdev->flags))) 179762306a36Sopenharmony_ci rdev = NULL; 179862306a36Sopenharmony_ci if (rrdev && (test_bit(Faulty, &rrdev->flags))) 179962306a36Sopenharmony_ci rrdev = NULL; 180062306a36Sopenharmony_ci if (!rdev && !rrdev) 180162306a36Sopenharmony_ci continue; 180262306a36Sopenharmony_ci 180362306a36Sopenharmony_ci if (rdev) { 180462306a36Sopenharmony_ci r10_bio->devs[disk].bio = bio; 180562306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 180662306a36Sopenharmony_ci } 180762306a36Sopenharmony_ci if (rrdev) { 180862306a36Sopenharmony_ci r10_bio->devs[disk].repl_bio = bio; 180962306a36Sopenharmony_ci atomic_inc(&rrdev->nr_pending); 181062306a36Sopenharmony_ci } 181162306a36Sopenharmony_ci } 181262306a36Sopenharmony_ci rcu_read_unlock(); 181362306a36Sopenharmony_ci 181462306a36Sopenharmony_ci atomic_set(&r10_bio->remaining, 1); 181562306a36Sopenharmony_ci for (disk = 0; disk < geo->raid_disks; disk++) { 181662306a36Sopenharmony_ci sector_t dev_start, dev_end; 181762306a36Sopenharmony_ci struct bio *mbio, *rbio = NULL; 181862306a36Sopenharmony_ci 181962306a36Sopenharmony_ci /* 182062306a36Sopenharmony_ci * Now start to calculate the start and end address for each disk. 182162306a36Sopenharmony_ci * The space between dev_start and dev_end is the discard region. 182262306a36Sopenharmony_ci * 182362306a36Sopenharmony_ci * For dev_start, it needs to consider three conditions: 182462306a36Sopenharmony_ci * 1st, the disk is before start_disk, you can imagine the disk in 182562306a36Sopenharmony_ci * the next stripe. So the dev_start is the start address of next 182662306a36Sopenharmony_ci * stripe. 182762306a36Sopenharmony_ci * 2st, the disk is after start_disk, it means the disk is at the 182862306a36Sopenharmony_ci * same stripe of first disk 182962306a36Sopenharmony_ci * 3st, the first disk itself, we can use start_disk_offset directly 183062306a36Sopenharmony_ci */ 183162306a36Sopenharmony_ci if (disk < start_disk_index) 183262306a36Sopenharmony_ci dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; 183362306a36Sopenharmony_ci else if (disk > start_disk_index) 183462306a36Sopenharmony_ci dev_start = first_stripe_index * mddev->chunk_sectors; 183562306a36Sopenharmony_ci else 183662306a36Sopenharmony_ci dev_start = start_disk_offset; 183762306a36Sopenharmony_ci 183862306a36Sopenharmony_ci if (disk < end_disk_index) 183962306a36Sopenharmony_ci dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; 184062306a36Sopenharmony_ci else if (disk > end_disk_index) 184162306a36Sopenharmony_ci dev_end = last_stripe_index * mddev->chunk_sectors; 184262306a36Sopenharmony_ci else 184362306a36Sopenharmony_ci dev_end = end_disk_offset; 184462306a36Sopenharmony_ci 184562306a36Sopenharmony_ci /* 184662306a36Sopenharmony_ci * It only handles discard bio which size is >= stripe size, so 184762306a36Sopenharmony_ci * dev_end > dev_start all the time. 
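 * (Illustrative: with chunk_sectors = 128, first_stripe_index = 6 and
 * start_disk_index = 1, disks before index 1 start at (6 + 1) * 128 = 896,
 * disks after it start at 6 * 128 = 768, and disk 1 itself starts at
 * start_disk_offset.)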
184862306a36Sopenharmony_ci * It doesn't need to use rcu lock to get rdev here. We already 184962306a36Sopenharmony_ci * add rdev->nr_pending in the first loop. 185062306a36Sopenharmony_ci */ 185162306a36Sopenharmony_ci if (r10_bio->devs[disk].bio) { 185262306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[disk].rdev; 185362306a36Sopenharmony_ci mbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 185462306a36Sopenharmony_ci &mddev->bio_set); 185562306a36Sopenharmony_ci mbio->bi_end_io = raid10_end_discard_request; 185662306a36Sopenharmony_ci mbio->bi_private = r10_bio; 185762306a36Sopenharmony_ci r10_bio->devs[disk].bio = mbio; 185862306a36Sopenharmony_ci r10_bio->devs[disk].devnum = disk; 185962306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 186062306a36Sopenharmony_ci md_submit_discard_bio(mddev, rdev, mbio, 186162306a36Sopenharmony_ci dev_start + choose_data_offset(r10_bio, rdev), 186262306a36Sopenharmony_ci dev_end - dev_start); 186362306a36Sopenharmony_ci bio_endio(mbio); 186462306a36Sopenharmony_ci } 186562306a36Sopenharmony_ci if (r10_bio->devs[disk].repl_bio) { 186662306a36Sopenharmony_ci struct md_rdev *rrdev = conf->mirrors[disk].replacement; 186762306a36Sopenharmony_ci rbio = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, 186862306a36Sopenharmony_ci &mddev->bio_set); 186962306a36Sopenharmony_ci rbio->bi_end_io = raid10_end_discard_request; 187062306a36Sopenharmony_ci rbio->bi_private = r10_bio; 187162306a36Sopenharmony_ci r10_bio->devs[disk].repl_bio = rbio; 187262306a36Sopenharmony_ci r10_bio->devs[disk].devnum = disk; 187362306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 187462306a36Sopenharmony_ci md_submit_discard_bio(mddev, rrdev, rbio, 187562306a36Sopenharmony_ci dev_start + choose_data_offset(r10_bio, rrdev), 187662306a36Sopenharmony_ci dev_end - dev_start); 187762306a36Sopenharmony_ci bio_endio(rbio); 187862306a36Sopenharmony_ci } 187962306a36Sopenharmony_ci } 188062306a36Sopenharmony_ci 188162306a36Sopenharmony_ci if (!geo->far_offset && --far_copies) { 188262306a36Sopenharmony_ci first_stripe_index += geo->stride >> geo->chunk_shift; 188362306a36Sopenharmony_ci start_disk_offset += geo->stride; 188462306a36Sopenharmony_ci last_stripe_index += geo->stride >> geo->chunk_shift; 188562306a36Sopenharmony_ci end_disk_offset += geo->stride; 188662306a36Sopenharmony_ci atomic_inc(&first_r10bio->remaining); 188762306a36Sopenharmony_ci raid_end_discard_bio(r10_bio); 188862306a36Sopenharmony_ci wait_barrier(conf, false); 188962306a36Sopenharmony_ci goto retry_discard; 189062306a36Sopenharmony_ci } 189162306a36Sopenharmony_ci 189262306a36Sopenharmony_ci raid_end_discard_bio(r10_bio); 189362306a36Sopenharmony_ci 189462306a36Sopenharmony_ci return 0; 189562306a36Sopenharmony_ciout: 189662306a36Sopenharmony_ci allow_barrier(conf); 189762306a36Sopenharmony_ci return -EAGAIN; 189862306a36Sopenharmony_ci} 189962306a36Sopenharmony_ci 190062306a36Sopenharmony_cistatic bool raid10_make_request(struct mddev *mddev, struct bio *bio) 190162306a36Sopenharmony_ci{ 190262306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 190362306a36Sopenharmony_ci sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 190462306a36Sopenharmony_ci int chunk_sects = chunk_mask + 1; 190562306a36Sopenharmony_ci int sectors = bio_sectors(bio); 190662306a36Sopenharmony_ci 190762306a36Sopenharmony_ci if (unlikely(bio->bi_opf & REQ_PREFLUSH) 190862306a36Sopenharmony_ci && md_flush_request(mddev, bio)) 190962306a36Sopenharmony_ci return true; 191062306a36Sopenharmony_ci 
191162306a36Sopenharmony_ci if (!md_write_start(mddev, bio)) 191262306a36Sopenharmony_ci return false; 191362306a36Sopenharmony_ci 191462306a36Sopenharmony_ci if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) 191562306a36Sopenharmony_ci if (!raid10_handle_discard(mddev, bio)) 191662306a36Sopenharmony_ci return true; 191762306a36Sopenharmony_ci 191862306a36Sopenharmony_ci /* 191962306a36Sopenharmony_ci * If this request crosses a chunk boundary, we need to split 192062306a36Sopenharmony_ci * it. 192162306a36Sopenharmony_ci */ 192262306a36Sopenharmony_ci if (unlikely((bio->bi_iter.bi_sector & chunk_mask) + 192362306a36Sopenharmony_ci sectors > chunk_sects 192462306a36Sopenharmony_ci && (conf->geo.near_copies < conf->geo.raid_disks 192562306a36Sopenharmony_ci || conf->prev.near_copies < 192662306a36Sopenharmony_ci conf->prev.raid_disks))) 192762306a36Sopenharmony_ci sectors = chunk_sects - 192862306a36Sopenharmony_ci (bio->bi_iter.bi_sector & 192962306a36Sopenharmony_ci (chunk_sects - 1)); 193062306a36Sopenharmony_ci __make_request(mddev, bio, sectors); 193162306a36Sopenharmony_ci 193262306a36Sopenharmony_ci /* In case raid10d snuck in to freeze_array */ 193362306a36Sopenharmony_ci wake_up_barrier(conf); 193462306a36Sopenharmony_ci return true; 193562306a36Sopenharmony_ci} 193662306a36Sopenharmony_ci 193762306a36Sopenharmony_cistatic void raid10_status(struct seq_file *seq, struct mddev *mddev) 193862306a36Sopenharmony_ci{ 193962306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 194062306a36Sopenharmony_ci int i; 194162306a36Sopenharmony_ci 194262306a36Sopenharmony_ci if (conf->geo.near_copies < conf->geo.raid_disks) 194362306a36Sopenharmony_ci seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 194462306a36Sopenharmony_ci if (conf->geo.near_copies > 1) 194562306a36Sopenharmony_ci seq_printf(seq, " %d near-copies", conf->geo.near_copies); 194662306a36Sopenharmony_ci if (conf->geo.far_copies > 1) { 194762306a36Sopenharmony_ci if (conf->geo.far_offset) 194862306a36Sopenharmony_ci seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 194962306a36Sopenharmony_ci else 195062306a36Sopenharmony_ci seq_printf(seq, " %d far-copies", conf->geo.far_copies); 195162306a36Sopenharmony_ci if (conf->geo.far_set_size != conf->geo.raid_disks) 195262306a36Sopenharmony_ci seq_printf(seq, " %d devices per set", conf->geo.far_set_size); 195362306a36Sopenharmony_ci } 195462306a36Sopenharmony_ci seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 195562306a36Sopenharmony_ci conf->geo.raid_disks - mddev->degraded); 195662306a36Sopenharmony_ci rcu_read_lock(); 195762306a36Sopenharmony_ci for (i = 0; i < conf->geo.raid_disks; i++) { 195862306a36Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 195962306a36Sopenharmony_ci seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 196062306a36Sopenharmony_ci } 196162306a36Sopenharmony_ci rcu_read_unlock(); 196262306a36Sopenharmony_ci seq_printf(seq, "]"); 196362306a36Sopenharmony_ci} 196462306a36Sopenharmony_ci 196562306a36Sopenharmony_ci/* check if there are enough drives for 196662306a36Sopenharmony_ci * every block to appear on atleast one. 196762306a36Sopenharmony_ci * Don't consider the device numbered 'ignore' 196862306a36Sopenharmony_ci * as we might be about to remove it. 
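 * (Illustrative: in a 4-disk near-2 layout the copies of any block live
 * on disk pair {0,1} or {2,3}; the loop below walks those groups, so
 * losing disks 0 and 2 still counts as enough while losing 0 and 1
 * does not.)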
196962306a36Sopenharmony_ci */ 197062306a36Sopenharmony_cistatic int _enough(struct r10conf *conf, int previous, int ignore) 197162306a36Sopenharmony_ci{ 197262306a36Sopenharmony_ci int first = 0; 197362306a36Sopenharmony_ci int has_enough = 0; 197462306a36Sopenharmony_ci int disks, ncopies; 197562306a36Sopenharmony_ci if (previous) { 197662306a36Sopenharmony_ci disks = conf->prev.raid_disks; 197762306a36Sopenharmony_ci ncopies = conf->prev.near_copies; 197862306a36Sopenharmony_ci } else { 197962306a36Sopenharmony_ci disks = conf->geo.raid_disks; 198062306a36Sopenharmony_ci ncopies = conf->geo.near_copies; 198162306a36Sopenharmony_ci } 198262306a36Sopenharmony_ci 198362306a36Sopenharmony_ci rcu_read_lock(); 198462306a36Sopenharmony_ci do { 198562306a36Sopenharmony_ci int n = conf->copies; 198662306a36Sopenharmony_ci int cnt = 0; 198762306a36Sopenharmony_ci int this = first; 198862306a36Sopenharmony_ci while (n--) { 198962306a36Sopenharmony_ci struct md_rdev *rdev; 199062306a36Sopenharmony_ci if (this != ignore && 199162306a36Sopenharmony_ci (rdev = rcu_dereference(conf->mirrors[this].rdev)) && 199262306a36Sopenharmony_ci test_bit(In_sync, &rdev->flags)) 199362306a36Sopenharmony_ci cnt++; 199462306a36Sopenharmony_ci this = (this+1) % disks; 199562306a36Sopenharmony_ci } 199662306a36Sopenharmony_ci if (cnt == 0) 199762306a36Sopenharmony_ci goto out; 199862306a36Sopenharmony_ci first = (first + ncopies) % disks; 199962306a36Sopenharmony_ci } while (first != 0); 200062306a36Sopenharmony_ci has_enough = 1; 200162306a36Sopenharmony_ciout: 200262306a36Sopenharmony_ci rcu_read_unlock(); 200362306a36Sopenharmony_ci return has_enough; 200462306a36Sopenharmony_ci} 200562306a36Sopenharmony_ci 200662306a36Sopenharmony_cistatic int enough(struct r10conf *conf, int ignore) 200762306a36Sopenharmony_ci{ 200862306a36Sopenharmony_ci /* when calling 'enough', both 'prev' and 'geo' must 200962306a36Sopenharmony_ci * be stable. 201062306a36Sopenharmony_ci * This is ensured if ->reconfig_mutex or ->device_lock 201162306a36Sopenharmony_ci * is held. 201262306a36Sopenharmony_ci */ 201362306a36Sopenharmony_ci return _enough(conf, 0, ignore) && 201462306a36Sopenharmony_ci _enough(conf, 1, ignore); 201562306a36Sopenharmony_ci} 201662306a36Sopenharmony_ci 201762306a36Sopenharmony_ci/** 201862306a36Sopenharmony_ci * raid10_error() - RAID10 error handler. 201962306a36Sopenharmony_ci * @mddev: affected md device. 202062306a36Sopenharmony_ci * @rdev: member device to fail. 202162306a36Sopenharmony_ci * 202262306a36Sopenharmony_ci * The routine acknowledges &rdev failure and determines new @mddev state. 202362306a36Sopenharmony_ci * If it failed, then: 202462306a36Sopenharmony_ci * - &MD_BROKEN flag is set in &mddev->flags. 202562306a36Sopenharmony_ci * Otherwise, it must be degraded: 202662306a36Sopenharmony_ci * - recovery is interrupted. 202762306a36Sopenharmony_ci * - &mddev->degraded is bumped. 202862306a36Sopenharmony_ci * 202962306a36Sopenharmony_ci * @rdev is marked as &Faulty excluding case when array is failed and 203062306a36Sopenharmony_ci * &mddev->fail_last_dev is off. 
203162306a36Sopenharmony_ci */ 203262306a36Sopenharmony_cistatic void raid10_error(struct mddev *mddev, struct md_rdev *rdev) 203362306a36Sopenharmony_ci{ 203462306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 203562306a36Sopenharmony_ci unsigned long flags; 203662306a36Sopenharmony_ci 203762306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 203862306a36Sopenharmony_ci 203962306a36Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) { 204062306a36Sopenharmony_ci set_bit(MD_BROKEN, &mddev->flags); 204162306a36Sopenharmony_ci 204262306a36Sopenharmony_ci if (!mddev->fail_last_dev) { 204362306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 204462306a36Sopenharmony_ci return; 204562306a36Sopenharmony_ci } 204662306a36Sopenharmony_ci } 204762306a36Sopenharmony_ci if (test_and_clear_bit(In_sync, &rdev->flags)) 204862306a36Sopenharmony_ci mddev->degraded++; 204962306a36Sopenharmony_ci 205062306a36Sopenharmony_ci set_bit(MD_RECOVERY_INTR, &mddev->recovery); 205162306a36Sopenharmony_ci set_bit(Blocked, &rdev->flags); 205262306a36Sopenharmony_ci set_bit(Faulty, &rdev->flags); 205362306a36Sopenharmony_ci set_mask_bits(&mddev->sb_flags, 0, 205462306a36Sopenharmony_ci BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 205562306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 205662306a36Sopenharmony_ci pr_crit("md/raid10:%s: Disk failure on %pg, disabling device.\n" 205762306a36Sopenharmony_ci "md/raid10:%s: Operation continuing on %d devices.\n", 205862306a36Sopenharmony_ci mdname(mddev), rdev->bdev, 205962306a36Sopenharmony_ci mdname(mddev), conf->geo.raid_disks - mddev->degraded); 206062306a36Sopenharmony_ci} 206162306a36Sopenharmony_ci 206262306a36Sopenharmony_cistatic void print_conf(struct r10conf *conf) 206362306a36Sopenharmony_ci{ 206462306a36Sopenharmony_ci int i; 206562306a36Sopenharmony_ci struct md_rdev *rdev; 206662306a36Sopenharmony_ci 206762306a36Sopenharmony_ci pr_debug("RAID10 conf printout:\n"); 206862306a36Sopenharmony_ci if (!conf) { 206962306a36Sopenharmony_ci pr_debug("(!conf)\n"); 207062306a36Sopenharmony_ci return; 207162306a36Sopenharmony_ci } 207262306a36Sopenharmony_ci pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 207362306a36Sopenharmony_ci conf->geo.raid_disks); 207462306a36Sopenharmony_ci 207562306a36Sopenharmony_ci /* This is only called with ->reconfix_mutex held, so 207662306a36Sopenharmony_ci * rcu protection of rdev is not needed */ 207762306a36Sopenharmony_ci for (i = 0; i < conf->geo.raid_disks; i++) { 207862306a36Sopenharmony_ci rdev = conf->mirrors[i].rdev; 207962306a36Sopenharmony_ci if (rdev) 208062306a36Sopenharmony_ci pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n", 208162306a36Sopenharmony_ci i, !test_bit(In_sync, &rdev->flags), 208262306a36Sopenharmony_ci !test_bit(Faulty, &rdev->flags), 208362306a36Sopenharmony_ci rdev->bdev); 208462306a36Sopenharmony_ci } 208562306a36Sopenharmony_ci} 208662306a36Sopenharmony_ci 208762306a36Sopenharmony_cistatic void close_sync(struct r10conf *conf) 208862306a36Sopenharmony_ci{ 208962306a36Sopenharmony_ci wait_barrier(conf, false); 209062306a36Sopenharmony_ci allow_barrier(conf); 209162306a36Sopenharmony_ci 209262306a36Sopenharmony_ci mempool_exit(&conf->r10buf_pool); 209362306a36Sopenharmony_ci} 209462306a36Sopenharmony_ci 209562306a36Sopenharmony_cistatic int raid10_spare_active(struct mddev *mddev) 209662306a36Sopenharmony_ci{ 209762306a36Sopenharmony_ci int i; 209862306a36Sopenharmony_ci 
struct r10conf *conf = mddev->private; 209962306a36Sopenharmony_ci struct raid10_info *tmp; 210062306a36Sopenharmony_ci int count = 0; 210162306a36Sopenharmony_ci unsigned long flags; 210262306a36Sopenharmony_ci 210362306a36Sopenharmony_ci /* 210462306a36Sopenharmony_ci * Find all non-in_sync disks within the RAID10 configuration 210562306a36Sopenharmony_ci * and mark them in_sync 210662306a36Sopenharmony_ci */ 210762306a36Sopenharmony_ci for (i = 0; i < conf->geo.raid_disks; i++) { 210862306a36Sopenharmony_ci tmp = conf->mirrors + i; 210962306a36Sopenharmony_ci if (tmp->replacement 211062306a36Sopenharmony_ci && tmp->replacement->recovery_offset == MaxSector 211162306a36Sopenharmony_ci && !test_bit(Faulty, &tmp->replacement->flags) 211262306a36Sopenharmony_ci && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 211362306a36Sopenharmony_ci /* Replacement has just become active */ 211462306a36Sopenharmony_ci if (!tmp->rdev 211562306a36Sopenharmony_ci || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 211662306a36Sopenharmony_ci count++; 211762306a36Sopenharmony_ci if (tmp->rdev) { 211862306a36Sopenharmony_ci /* Replaced device not technically faulty, 211962306a36Sopenharmony_ci * but we need to be sure it gets removed 212062306a36Sopenharmony_ci * and never re-added. 212162306a36Sopenharmony_ci */ 212262306a36Sopenharmony_ci set_bit(Faulty, &tmp->rdev->flags); 212362306a36Sopenharmony_ci sysfs_notify_dirent_safe( 212462306a36Sopenharmony_ci tmp->rdev->sysfs_state); 212562306a36Sopenharmony_ci } 212662306a36Sopenharmony_ci sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 212762306a36Sopenharmony_ci } else if (tmp->rdev 212862306a36Sopenharmony_ci && tmp->rdev->recovery_offset == MaxSector 212962306a36Sopenharmony_ci && !test_bit(Faulty, &tmp->rdev->flags) 213062306a36Sopenharmony_ci && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 213162306a36Sopenharmony_ci count++; 213262306a36Sopenharmony_ci sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 213362306a36Sopenharmony_ci } 213462306a36Sopenharmony_ci } 213562306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 213662306a36Sopenharmony_ci mddev->degraded -= count; 213762306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 213862306a36Sopenharmony_ci 213962306a36Sopenharmony_ci print_conf(conf); 214062306a36Sopenharmony_ci return count; 214162306a36Sopenharmony_ci} 214262306a36Sopenharmony_ci 214362306a36Sopenharmony_cistatic int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) 214462306a36Sopenharmony_ci{ 214562306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 214662306a36Sopenharmony_ci int err = -EEXIST; 214762306a36Sopenharmony_ci int mirror, repl_slot = -1; 214862306a36Sopenharmony_ci int first = 0; 214962306a36Sopenharmony_ci int last = conf->geo.raid_disks - 1; 215062306a36Sopenharmony_ci struct raid10_info *p; 215162306a36Sopenharmony_ci 215262306a36Sopenharmony_ci if (mddev->recovery_cp < MaxSector) 215362306a36Sopenharmony_ci /* only hot-add to in-sync arrays, as recovery is 215462306a36Sopenharmony_ci * very different from resync 215562306a36Sopenharmony_ci */ 215662306a36Sopenharmony_ci return -EBUSY; 215762306a36Sopenharmony_ci if (rdev->saved_raid_disk < 0 && !_enough(conf, 1, -1)) 215862306a36Sopenharmony_ci return -EINVAL; 215962306a36Sopenharmony_ci 216062306a36Sopenharmony_ci if (md_integrity_add_rdev(rdev, mddev)) 216162306a36Sopenharmony_ci return -ENXIO; 216262306a36Sopenharmony_ci 216362306a36Sopenharmony_ci if (rdev->raid_disk >= 0) 
216462306a36Sopenharmony_ci first = last = rdev->raid_disk; 216562306a36Sopenharmony_ci 216662306a36Sopenharmony_ci if (rdev->saved_raid_disk >= first && 216762306a36Sopenharmony_ci rdev->saved_raid_disk < conf->geo.raid_disks && 216862306a36Sopenharmony_ci conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 216962306a36Sopenharmony_ci mirror = rdev->saved_raid_disk; 217062306a36Sopenharmony_ci else 217162306a36Sopenharmony_ci mirror = first; 217262306a36Sopenharmony_ci for ( ; mirror <= last ; mirror++) { 217362306a36Sopenharmony_ci p = &conf->mirrors[mirror]; 217462306a36Sopenharmony_ci if (p->recovery_disabled == mddev->recovery_disabled) 217562306a36Sopenharmony_ci continue; 217662306a36Sopenharmony_ci if (p->rdev) { 217762306a36Sopenharmony_ci if (test_bit(WantReplacement, &p->rdev->flags) && 217862306a36Sopenharmony_ci p->replacement == NULL && repl_slot < 0) 217962306a36Sopenharmony_ci repl_slot = mirror; 218062306a36Sopenharmony_ci continue; 218162306a36Sopenharmony_ci } 218262306a36Sopenharmony_ci 218362306a36Sopenharmony_ci if (mddev->gendisk) 218462306a36Sopenharmony_ci disk_stack_limits(mddev->gendisk, rdev->bdev, 218562306a36Sopenharmony_ci rdev->data_offset << 9); 218662306a36Sopenharmony_ci 218762306a36Sopenharmony_ci p->head_position = 0; 218862306a36Sopenharmony_ci p->recovery_disabled = mddev->recovery_disabled - 1; 218962306a36Sopenharmony_ci rdev->raid_disk = mirror; 219062306a36Sopenharmony_ci err = 0; 219162306a36Sopenharmony_ci if (rdev->saved_raid_disk != mirror) 219262306a36Sopenharmony_ci conf->fullsync = 1; 219362306a36Sopenharmony_ci rcu_assign_pointer(p->rdev, rdev); 219462306a36Sopenharmony_ci break; 219562306a36Sopenharmony_ci } 219662306a36Sopenharmony_ci 219762306a36Sopenharmony_ci if (err && repl_slot >= 0) { 219862306a36Sopenharmony_ci p = &conf->mirrors[repl_slot]; 219962306a36Sopenharmony_ci clear_bit(In_sync, &rdev->flags); 220062306a36Sopenharmony_ci set_bit(Replacement, &rdev->flags); 220162306a36Sopenharmony_ci rdev->raid_disk = repl_slot; 220262306a36Sopenharmony_ci err = 0; 220362306a36Sopenharmony_ci if (mddev->gendisk) 220462306a36Sopenharmony_ci disk_stack_limits(mddev->gendisk, rdev->bdev, 220562306a36Sopenharmony_ci rdev->data_offset << 9); 220662306a36Sopenharmony_ci conf->fullsync = 1; 220762306a36Sopenharmony_ci rcu_assign_pointer(p->replacement, rdev); 220862306a36Sopenharmony_ci } 220962306a36Sopenharmony_ci 221062306a36Sopenharmony_ci print_conf(conf); 221162306a36Sopenharmony_ci return err; 221262306a36Sopenharmony_ci} 221362306a36Sopenharmony_ci 221462306a36Sopenharmony_cistatic int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 221562306a36Sopenharmony_ci{ 221662306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 221762306a36Sopenharmony_ci int err = 0; 221862306a36Sopenharmony_ci int number = rdev->raid_disk; 221962306a36Sopenharmony_ci struct md_rdev **rdevp; 222062306a36Sopenharmony_ci struct raid10_info *p; 222162306a36Sopenharmony_ci 222262306a36Sopenharmony_ci print_conf(conf); 222362306a36Sopenharmony_ci if (unlikely(number >= mddev->raid_disks)) 222462306a36Sopenharmony_ci return 0; 222562306a36Sopenharmony_ci p = conf->mirrors + number; 222662306a36Sopenharmony_ci if (rdev == p->rdev) 222762306a36Sopenharmony_ci rdevp = &p->rdev; 222862306a36Sopenharmony_ci else if (rdev == p->replacement) 222962306a36Sopenharmony_ci rdevp = &p->replacement; 223062306a36Sopenharmony_ci else 223162306a36Sopenharmony_ci return 0; 223262306a36Sopenharmony_ci 223362306a36Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) 
|| 223462306a36Sopenharmony_ci atomic_read(&rdev->nr_pending)) { 223562306a36Sopenharmony_ci err = -EBUSY; 223662306a36Sopenharmony_ci goto abort; 223762306a36Sopenharmony_ci } 223862306a36Sopenharmony_ci /* Only remove non-faulty devices if recovery 223962306a36Sopenharmony_ci * is not possible. 224062306a36Sopenharmony_ci */ 224162306a36Sopenharmony_ci if (!test_bit(Faulty, &rdev->flags) && 224262306a36Sopenharmony_ci mddev->recovery_disabled != p->recovery_disabled && 224362306a36Sopenharmony_ci (!p->replacement || p->replacement == rdev) && 224462306a36Sopenharmony_ci number < conf->geo.raid_disks && 224562306a36Sopenharmony_ci enough(conf, -1)) { 224662306a36Sopenharmony_ci err = -EBUSY; 224762306a36Sopenharmony_ci goto abort; 224862306a36Sopenharmony_ci } 224962306a36Sopenharmony_ci *rdevp = NULL; 225062306a36Sopenharmony_ci if (!test_bit(RemoveSynchronized, &rdev->flags)) { 225162306a36Sopenharmony_ci synchronize_rcu(); 225262306a36Sopenharmony_ci if (atomic_read(&rdev->nr_pending)) { 225362306a36Sopenharmony_ci /* lost the race, try later */ 225462306a36Sopenharmony_ci err = -EBUSY; 225562306a36Sopenharmony_ci *rdevp = rdev; 225662306a36Sopenharmony_ci goto abort; 225762306a36Sopenharmony_ci } 225862306a36Sopenharmony_ci } 225962306a36Sopenharmony_ci if (p->replacement) { 226062306a36Sopenharmony_ci /* We must have just cleared 'rdev' */ 226162306a36Sopenharmony_ci p->rdev = p->replacement; 226262306a36Sopenharmony_ci clear_bit(Replacement, &p->replacement->flags); 226362306a36Sopenharmony_ci smp_mb(); /* Make sure other CPUs may see both as identical 226462306a36Sopenharmony_ci * but will never see neither -- if they are careful. 226562306a36Sopenharmony_ci */ 226662306a36Sopenharmony_ci p->replacement = NULL; 226762306a36Sopenharmony_ci } 226862306a36Sopenharmony_ci 226962306a36Sopenharmony_ci clear_bit(WantReplacement, &rdev->flags); 227062306a36Sopenharmony_ci err = md_integrity_register(mddev); 227162306a36Sopenharmony_ci 227262306a36Sopenharmony_ciabort: 227362306a36Sopenharmony_ci 227462306a36Sopenharmony_ci print_conf(conf); 227562306a36Sopenharmony_ci return err; 227662306a36Sopenharmony_ci} 227762306a36Sopenharmony_ci 227862306a36Sopenharmony_cistatic void __end_sync_read(struct r10bio *r10_bio, struct bio *bio, int d) 227962306a36Sopenharmony_ci{ 228062306a36Sopenharmony_ci struct r10conf *conf = r10_bio->mddev->private; 228162306a36Sopenharmony_ci 228262306a36Sopenharmony_ci if (!bio->bi_status) 228362306a36Sopenharmony_ci set_bit(R10BIO_Uptodate, &r10_bio->state); 228462306a36Sopenharmony_ci else 228562306a36Sopenharmony_ci /* The write handler will notice the lack of 228662306a36Sopenharmony_ci * R10BIO_Uptodate and record any errors etc 228762306a36Sopenharmony_ci */ 228862306a36Sopenharmony_ci atomic_add(r10_bio->sectors, 228962306a36Sopenharmony_ci &conf->mirrors[d].rdev->corrected_errors); 229062306a36Sopenharmony_ci 229162306a36Sopenharmony_ci /* for reconstruct, we always reschedule after a read. 
229262306a36Sopenharmony_ci * for resync, only after all reads 229362306a36Sopenharmony_ci */ 229462306a36Sopenharmony_ci rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); 229562306a36Sopenharmony_ci if (test_bit(R10BIO_IsRecover, &r10_bio->state) || 229662306a36Sopenharmony_ci atomic_dec_and_test(&r10_bio->remaining)) { 229762306a36Sopenharmony_ci /* we have read all the blocks, 229862306a36Sopenharmony_ci * do the comparison in process context in raid10d 229962306a36Sopenharmony_ci */ 230062306a36Sopenharmony_ci reschedule_retry(r10_bio); 230162306a36Sopenharmony_ci } 230262306a36Sopenharmony_ci} 230362306a36Sopenharmony_ci 230462306a36Sopenharmony_cistatic void end_sync_read(struct bio *bio) 230562306a36Sopenharmony_ci{ 230662306a36Sopenharmony_ci struct r10bio *r10_bio = get_resync_r10bio(bio); 230762306a36Sopenharmony_ci struct r10conf *conf = r10_bio->mddev->private; 230862306a36Sopenharmony_ci int d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 230962306a36Sopenharmony_ci 231062306a36Sopenharmony_ci __end_sync_read(r10_bio, bio, d); 231162306a36Sopenharmony_ci} 231262306a36Sopenharmony_ci 231362306a36Sopenharmony_cistatic void end_reshape_read(struct bio *bio) 231462306a36Sopenharmony_ci{ 231562306a36Sopenharmony_ci /* reshape read bio isn't allocated from r10buf_pool */ 231662306a36Sopenharmony_ci struct r10bio *r10_bio = bio->bi_private; 231762306a36Sopenharmony_ci 231862306a36Sopenharmony_ci __end_sync_read(r10_bio, bio, r10_bio->read_slot); 231962306a36Sopenharmony_ci} 232062306a36Sopenharmony_ci 232162306a36Sopenharmony_cistatic void end_sync_request(struct r10bio *r10_bio) 232262306a36Sopenharmony_ci{ 232362306a36Sopenharmony_ci struct mddev *mddev = r10_bio->mddev; 232462306a36Sopenharmony_ci 232562306a36Sopenharmony_ci while (atomic_dec_and_test(&r10_bio->remaining)) { 232662306a36Sopenharmony_ci if (r10_bio->master_bio == NULL) { 232762306a36Sopenharmony_ci /* the primary of several recovery bios */ 232862306a36Sopenharmony_ci sector_t s = r10_bio->sectors; 232962306a36Sopenharmony_ci if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 233062306a36Sopenharmony_ci test_bit(R10BIO_WriteError, &r10_bio->state)) 233162306a36Sopenharmony_ci reschedule_retry(r10_bio); 233262306a36Sopenharmony_ci else 233362306a36Sopenharmony_ci put_buf(r10_bio); 233462306a36Sopenharmony_ci md_done_sync(mddev, s, 1); 233562306a36Sopenharmony_ci break; 233662306a36Sopenharmony_ci } else { 233762306a36Sopenharmony_ci struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; 233862306a36Sopenharmony_ci if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 233962306a36Sopenharmony_ci test_bit(R10BIO_WriteError, &r10_bio->state)) 234062306a36Sopenharmony_ci reschedule_retry(r10_bio); 234162306a36Sopenharmony_ci else 234262306a36Sopenharmony_ci put_buf(r10_bio); 234362306a36Sopenharmony_ci r10_bio = r10_bio2; 234462306a36Sopenharmony_ci } 234562306a36Sopenharmony_ci } 234662306a36Sopenharmony_ci} 234762306a36Sopenharmony_ci 234862306a36Sopenharmony_cistatic void end_sync_write(struct bio *bio) 234962306a36Sopenharmony_ci{ 235062306a36Sopenharmony_ci struct r10bio *r10_bio = get_resync_r10bio(bio); 235162306a36Sopenharmony_ci struct mddev *mddev = r10_bio->mddev; 235262306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 235362306a36Sopenharmony_ci int d; 235462306a36Sopenharmony_ci sector_t first_bad; 235562306a36Sopenharmony_ci int bad_sectors; 235662306a36Sopenharmony_ci int slot; 235762306a36Sopenharmony_ci int repl; 235862306a36Sopenharmony_ci struct md_rdev *rdev = NULL; 
235962306a36Sopenharmony_ci
236062306a36Sopenharmony_ci	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
236162306a36Sopenharmony_ci	if (repl)
236262306a36Sopenharmony_ci		rdev = conf->mirrors[d].replacement;
236362306a36Sopenharmony_ci	else
236462306a36Sopenharmony_ci		rdev = conf->mirrors[d].rdev;
236562306a36Sopenharmony_ci
236662306a36Sopenharmony_ci	if (bio->bi_status) {
236762306a36Sopenharmony_ci		if (repl)
236862306a36Sopenharmony_ci			md_error(mddev, rdev);
236962306a36Sopenharmony_ci		else {
237062306a36Sopenharmony_ci			set_bit(WriteErrorSeen, &rdev->flags);
237162306a36Sopenharmony_ci			if (!test_and_set_bit(WantReplacement, &rdev->flags))
237262306a36Sopenharmony_ci				set_bit(MD_RECOVERY_NEEDED,
237362306a36Sopenharmony_ci					&rdev->mddev->recovery);
237462306a36Sopenharmony_ci			set_bit(R10BIO_WriteError, &r10_bio->state);
237562306a36Sopenharmony_ci		}
237662306a36Sopenharmony_ci	} else if (is_badblock(rdev,
237762306a36Sopenharmony_ci			       r10_bio->devs[slot].addr,
237862306a36Sopenharmony_ci			       r10_bio->sectors,
237962306a36Sopenharmony_ci			       &first_bad, &bad_sectors))
238062306a36Sopenharmony_ci		set_bit(R10BIO_MadeGood, &r10_bio->state);
238162306a36Sopenharmony_ci
238262306a36Sopenharmony_ci	rdev_dec_pending(rdev, mddev);
238362306a36Sopenharmony_ci
238462306a36Sopenharmony_ci	end_sync_request(r10_bio);
238562306a36Sopenharmony_ci}
238662306a36Sopenharmony_ci
238762306a36Sopenharmony_ci/*
238862306a36Sopenharmony_ci * Note: sync and recovery are handled very differently for raid10.
238962306a36Sopenharmony_ci * This code is for resync.
239062306a36Sopenharmony_ci * For resync, we read through virtual addresses and read all blocks.
239162306a36Sopenharmony_ci * If there is any error, we schedule a write. The lowest numbered
239262306a36Sopenharmony_ci * drive is authoritative.
239362306a36Sopenharmony_ci * However, requests come for physical addresses, so we need to map.
239462306a36Sopenharmony_ci * For every physical address there are raid_disks/copies virtual addresses,
239562306a36Sopenharmony_ci * which is always at least one, but is not necessarily an integer.
239662306a36Sopenharmony_ci * This means that a physical address can span multiple chunks, so we may
239762306a36Sopenharmony_ci * have to submit multiple io requests for a single sync request.
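 *
 * A worked example with hypothetical numbers (not a claim about any
 * particular array): with raid_disks = 5, near_copies = 2 and
 * far_copies = 1 there are 5/2 = 2.5 virtual addresses per physical
 * address, so one physically contiguous resync request can map onto
 * virtual ranges that cross a chunk boundary and therefore needs to
 * be split into more than one bio.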
239862306a36Sopenharmony_ci */ 239962306a36Sopenharmony_ci/* 240062306a36Sopenharmony_ci * We check if all blocks are in-sync and only write to blocks that 240162306a36Sopenharmony_ci * aren't in sync 240262306a36Sopenharmony_ci */ 240362306a36Sopenharmony_cistatic void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 240462306a36Sopenharmony_ci{ 240562306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 240662306a36Sopenharmony_ci int i, first; 240762306a36Sopenharmony_ci struct bio *tbio, *fbio; 240862306a36Sopenharmony_ci int vcnt; 240962306a36Sopenharmony_ci struct page **tpages, **fpages; 241062306a36Sopenharmony_ci 241162306a36Sopenharmony_ci atomic_set(&r10_bio->remaining, 1); 241262306a36Sopenharmony_ci 241362306a36Sopenharmony_ci /* find the first device with a block */ 241462306a36Sopenharmony_ci for (i=0; i<conf->copies; i++) 241562306a36Sopenharmony_ci if (!r10_bio->devs[i].bio->bi_status) 241662306a36Sopenharmony_ci break; 241762306a36Sopenharmony_ci 241862306a36Sopenharmony_ci if (i == conf->copies) 241962306a36Sopenharmony_ci goto done; 242062306a36Sopenharmony_ci 242162306a36Sopenharmony_ci first = i; 242262306a36Sopenharmony_ci fbio = r10_bio->devs[i].bio; 242362306a36Sopenharmony_ci fbio->bi_iter.bi_size = r10_bio->sectors << 9; 242462306a36Sopenharmony_ci fbio->bi_iter.bi_idx = 0; 242562306a36Sopenharmony_ci fpages = get_resync_pages(fbio)->pages; 242662306a36Sopenharmony_ci 242762306a36Sopenharmony_ci vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); 242862306a36Sopenharmony_ci /* now find blocks with errors */ 242962306a36Sopenharmony_ci for (i=0 ; i < conf->copies ; i++) { 243062306a36Sopenharmony_ci int j, d; 243162306a36Sopenharmony_ci struct md_rdev *rdev; 243262306a36Sopenharmony_ci struct resync_pages *rp; 243362306a36Sopenharmony_ci 243462306a36Sopenharmony_ci tbio = r10_bio->devs[i].bio; 243562306a36Sopenharmony_ci 243662306a36Sopenharmony_ci if (tbio->bi_end_io != end_sync_read) 243762306a36Sopenharmony_ci continue; 243862306a36Sopenharmony_ci if (i == first) 243962306a36Sopenharmony_ci continue; 244062306a36Sopenharmony_ci 244162306a36Sopenharmony_ci tpages = get_resync_pages(tbio)->pages; 244262306a36Sopenharmony_ci d = r10_bio->devs[i].devnum; 244362306a36Sopenharmony_ci rdev = conf->mirrors[d].rdev; 244462306a36Sopenharmony_ci if (!r10_bio->devs[i].bio->bi_status) { 244562306a36Sopenharmony_ci /* We know that the bi_io_vec layout is the same for 244662306a36Sopenharmony_ci * both 'first' and 'i', so we just compare them. 244762306a36Sopenharmony_ci * All vec entries are PAGE_SIZE; 244862306a36Sopenharmony_ci */ 244962306a36Sopenharmony_ci int sectors = r10_bio->sectors; 245062306a36Sopenharmony_ci for (j = 0; j < vcnt; j++) { 245162306a36Sopenharmony_ci int len = PAGE_SIZE; 245262306a36Sopenharmony_ci if (sectors < (len / 512)) 245362306a36Sopenharmony_ci len = sectors * 512; 245462306a36Sopenharmony_ci if (memcmp(page_address(fpages[j]), 245562306a36Sopenharmony_ci page_address(tpages[j]), 245662306a36Sopenharmony_ci len)) 245762306a36Sopenharmony_ci break; 245862306a36Sopenharmony_ci sectors -= len/512; 245962306a36Sopenharmony_ci } 246062306a36Sopenharmony_ci if (j == vcnt) 246162306a36Sopenharmony_ci continue; 246262306a36Sopenharmony_ci atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 246362306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 246462306a36Sopenharmony_ci /* Don't fix anything. 
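 * A 'check' scrub (MD_RECOVERY_CHECK set) only accounts the mismatch
 * in resync_mismatches above; a 'repair' pass falls through and
 * rewrites the block from the first good copy below.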
*/ 246562306a36Sopenharmony_ci continue; 246662306a36Sopenharmony_ci } else if (test_bit(FailFast, &rdev->flags)) { 246762306a36Sopenharmony_ci /* Just give up on this device */ 246862306a36Sopenharmony_ci md_error(rdev->mddev, rdev); 246962306a36Sopenharmony_ci continue; 247062306a36Sopenharmony_ci } 247162306a36Sopenharmony_ci /* Ok, we need to write this bio, either to correct an 247262306a36Sopenharmony_ci * inconsistency or to correct an unreadable block. 247362306a36Sopenharmony_ci * First we need to fixup bv_offset, bv_len and 247462306a36Sopenharmony_ci * bi_vecs, as the read request might have corrupted these 247562306a36Sopenharmony_ci */ 247662306a36Sopenharmony_ci rp = get_resync_pages(tbio); 247762306a36Sopenharmony_ci bio_reset(tbio, conf->mirrors[d].rdev->bdev, REQ_OP_WRITE); 247862306a36Sopenharmony_ci 247962306a36Sopenharmony_ci md_bio_reset_resync_pages(tbio, rp, fbio->bi_iter.bi_size); 248062306a36Sopenharmony_ci 248162306a36Sopenharmony_ci rp->raid_bio = r10_bio; 248262306a36Sopenharmony_ci tbio->bi_private = rp; 248362306a36Sopenharmony_ci tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; 248462306a36Sopenharmony_ci tbio->bi_end_io = end_sync_write; 248562306a36Sopenharmony_ci 248662306a36Sopenharmony_ci bio_copy_data(tbio, fbio); 248762306a36Sopenharmony_ci 248862306a36Sopenharmony_ci atomic_inc(&conf->mirrors[d].rdev->nr_pending); 248962306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 249062306a36Sopenharmony_ci md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); 249162306a36Sopenharmony_ci 249262306a36Sopenharmony_ci if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) 249362306a36Sopenharmony_ci tbio->bi_opf |= MD_FAILFAST; 249462306a36Sopenharmony_ci tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; 249562306a36Sopenharmony_ci submit_bio_noacct(tbio); 249662306a36Sopenharmony_ci } 249762306a36Sopenharmony_ci 249862306a36Sopenharmony_ci /* Now write out to any replacement devices 249962306a36Sopenharmony_ci * that are active 250062306a36Sopenharmony_ci */ 250162306a36Sopenharmony_ci for (i = 0; i < conf->copies; i++) { 250262306a36Sopenharmony_ci int d; 250362306a36Sopenharmony_ci 250462306a36Sopenharmony_ci tbio = r10_bio->devs[i].repl_bio; 250562306a36Sopenharmony_ci if (!tbio || !tbio->bi_end_io) 250662306a36Sopenharmony_ci continue; 250762306a36Sopenharmony_ci if (r10_bio->devs[i].bio->bi_end_io != end_sync_write 250862306a36Sopenharmony_ci && r10_bio->devs[i].bio != fbio) 250962306a36Sopenharmony_ci bio_copy_data(tbio, fbio); 251062306a36Sopenharmony_ci d = r10_bio->devs[i].devnum; 251162306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 251262306a36Sopenharmony_ci md_sync_acct(conf->mirrors[d].replacement->bdev, 251362306a36Sopenharmony_ci bio_sectors(tbio)); 251462306a36Sopenharmony_ci submit_bio_noacct(tbio); 251562306a36Sopenharmony_ci } 251662306a36Sopenharmony_ci 251762306a36Sopenharmony_cidone: 251862306a36Sopenharmony_ci if (atomic_dec_and_test(&r10_bio->remaining)) { 251962306a36Sopenharmony_ci md_done_sync(mddev, r10_bio->sectors, 1); 252062306a36Sopenharmony_ci put_buf(r10_bio); 252162306a36Sopenharmony_ci } 252262306a36Sopenharmony_ci} 252362306a36Sopenharmony_ci 252462306a36Sopenharmony_ci/* 252562306a36Sopenharmony_ci * Now for the recovery code. 252662306a36Sopenharmony_ci * Recovery happens across physical sectors. 252762306a36Sopenharmony_ci * We recover all non-is_sync drives by finding the virtual address of 252862306a36Sopenharmony_ci * each, and then choose a working drive that also has that virt address. 
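 *
 * A rough sketch of that lookup for a single recovering drive
 * (illustration only; 'phys' and 'recovering_disk' are hypothetical
 * names):
 *
 *	virt = raid10_find_virt(conf, phys, recovering_disk);
 *	read 'virt' from any In_sync mirror that holds a copy;
 *	write the data to the recovering drive at 'phys'.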
252962306a36Sopenharmony_ci * There is a separate r10_bio for each non-in_sync drive. 253062306a36Sopenharmony_ci * Only the first two slots are in use. The first for reading, 253162306a36Sopenharmony_ci * The second for writing. 253262306a36Sopenharmony_ci * 253362306a36Sopenharmony_ci */ 253462306a36Sopenharmony_cistatic void fix_recovery_read_error(struct r10bio *r10_bio) 253562306a36Sopenharmony_ci{ 253662306a36Sopenharmony_ci /* We got a read error during recovery. 253762306a36Sopenharmony_ci * We repeat the read in smaller page-sized sections. 253862306a36Sopenharmony_ci * If a read succeeds, write it to the new device or record 253962306a36Sopenharmony_ci * a bad block if we cannot. 254062306a36Sopenharmony_ci * If a read fails, record a bad block on both old and 254162306a36Sopenharmony_ci * new devices. 254262306a36Sopenharmony_ci */ 254362306a36Sopenharmony_ci struct mddev *mddev = r10_bio->mddev; 254462306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 254562306a36Sopenharmony_ci struct bio *bio = r10_bio->devs[0].bio; 254662306a36Sopenharmony_ci sector_t sect = 0; 254762306a36Sopenharmony_ci int sectors = r10_bio->sectors; 254862306a36Sopenharmony_ci int idx = 0; 254962306a36Sopenharmony_ci int dr = r10_bio->devs[0].devnum; 255062306a36Sopenharmony_ci int dw = r10_bio->devs[1].devnum; 255162306a36Sopenharmony_ci struct page **pages = get_resync_pages(bio)->pages; 255262306a36Sopenharmony_ci 255362306a36Sopenharmony_ci while (sectors) { 255462306a36Sopenharmony_ci int s = sectors; 255562306a36Sopenharmony_ci struct md_rdev *rdev; 255662306a36Sopenharmony_ci sector_t addr; 255762306a36Sopenharmony_ci int ok; 255862306a36Sopenharmony_ci 255962306a36Sopenharmony_ci if (s > (PAGE_SIZE>>9)) 256062306a36Sopenharmony_ci s = PAGE_SIZE >> 9; 256162306a36Sopenharmony_ci 256262306a36Sopenharmony_ci rdev = conf->mirrors[dr].rdev; 256362306a36Sopenharmony_ci addr = r10_bio->devs[0].addr + sect, 256462306a36Sopenharmony_ci ok = sync_page_io(rdev, 256562306a36Sopenharmony_ci addr, 256662306a36Sopenharmony_ci s << 9, 256762306a36Sopenharmony_ci pages[idx], 256862306a36Sopenharmony_ci REQ_OP_READ, false); 256962306a36Sopenharmony_ci if (ok) { 257062306a36Sopenharmony_ci rdev = conf->mirrors[dw].rdev; 257162306a36Sopenharmony_ci addr = r10_bio->devs[1].addr + sect; 257262306a36Sopenharmony_ci ok = sync_page_io(rdev, 257362306a36Sopenharmony_ci addr, 257462306a36Sopenharmony_ci s << 9, 257562306a36Sopenharmony_ci pages[idx], 257662306a36Sopenharmony_ci REQ_OP_WRITE, false); 257762306a36Sopenharmony_ci if (!ok) { 257862306a36Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 257962306a36Sopenharmony_ci if (!test_and_set_bit(WantReplacement, 258062306a36Sopenharmony_ci &rdev->flags)) 258162306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, 258262306a36Sopenharmony_ci &rdev->mddev->recovery); 258362306a36Sopenharmony_ci } 258462306a36Sopenharmony_ci } 258562306a36Sopenharmony_ci if (!ok) { 258662306a36Sopenharmony_ci /* We don't worry if we cannot set a bad block - 258762306a36Sopenharmony_ci * it really is bad so there is no loss in not 258862306a36Sopenharmony_ci * recording it yet 258962306a36Sopenharmony_ci */ 259062306a36Sopenharmony_ci rdev_set_badblocks(rdev, addr, s, 0); 259162306a36Sopenharmony_ci 259262306a36Sopenharmony_ci if (rdev != conf->mirrors[dw].rdev) { 259362306a36Sopenharmony_ci /* need bad block on destination too */ 259462306a36Sopenharmony_ci struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 259562306a36Sopenharmony_ci addr = r10_bio->devs[1].addr + sect; 
259662306a36Sopenharmony_ci ok = rdev_set_badblocks(rdev2, addr, s, 0); 259762306a36Sopenharmony_ci if (!ok) { 259862306a36Sopenharmony_ci /* just abort the recovery */ 259962306a36Sopenharmony_ci pr_notice("md/raid10:%s: recovery aborted due to read error\n", 260062306a36Sopenharmony_ci mdname(mddev)); 260162306a36Sopenharmony_ci 260262306a36Sopenharmony_ci conf->mirrors[dw].recovery_disabled 260362306a36Sopenharmony_ci = mddev->recovery_disabled; 260462306a36Sopenharmony_ci set_bit(MD_RECOVERY_INTR, 260562306a36Sopenharmony_ci &mddev->recovery); 260662306a36Sopenharmony_ci break; 260762306a36Sopenharmony_ci } 260862306a36Sopenharmony_ci } 260962306a36Sopenharmony_ci } 261062306a36Sopenharmony_ci 261162306a36Sopenharmony_ci sectors -= s; 261262306a36Sopenharmony_ci sect += s; 261362306a36Sopenharmony_ci idx++; 261462306a36Sopenharmony_ci } 261562306a36Sopenharmony_ci} 261662306a36Sopenharmony_ci 261762306a36Sopenharmony_cistatic void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 261862306a36Sopenharmony_ci{ 261962306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 262062306a36Sopenharmony_ci int d; 262162306a36Sopenharmony_ci struct bio *wbio = r10_bio->devs[1].bio; 262262306a36Sopenharmony_ci struct bio *wbio2 = r10_bio->devs[1].repl_bio; 262362306a36Sopenharmony_ci 262462306a36Sopenharmony_ci /* Need to test wbio2->bi_end_io before we call 262562306a36Sopenharmony_ci * submit_bio_noacct as if the former is NULL, 262662306a36Sopenharmony_ci * the latter is free to free wbio2. 262762306a36Sopenharmony_ci */ 262862306a36Sopenharmony_ci if (wbio2 && !wbio2->bi_end_io) 262962306a36Sopenharmony_ci wbio2 = NULL; 263062306a36Sopenharmony_ci 263162306a36Sopenharmony_ci if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 263262306a36Sopenharmony_ci fix_recovery_read_error(r10_bio); 263362306a36Sopenharmony_ci if (wbio->bi_end_io) 263462306a36Sopenharmony_ci end_sync_request(r10_bio); 263562306a36Sopenharmony_ci if (wbio2) 263662306a36Sopenharmony_ci end_sync_request(r10_bio); 263762306a36Sopenharmony_ci return; 263862306a36Sopenharmony_ci } 263962306a36Sopenharmony_ci 264062306a36Sopenharmony_ci /* 264162306a36Sopenharmony_ci * share the pages with the first bio 264262306a36Sopenharmony_ci * and submit the write request 264362306a36Sopenharmony_ci */ 264462306a36Sopenharmony_ci d = r10_bio->devs[1].devnum; 264562306a36Sopenharmony_ci if (wbio->bi_end_io) { 264662306a36Sopenharmony_ci atomic_inc(&conf->mirrors[d].rdev->nr_pending); 264762306a36Sopenharmony_ci md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); 264862306a36Sopenharmony_ci submit_bio_noacct(wbio); 264962306a36Sopenharmony_ci } 265062306a36Sopenharmony_ci if (wbio2) { 265162306a36Sopenharmony_ci atomic_inc(&conf->mirrors[d].replacement->nr_pending); 265262306a36Sopenharmony_ci md_sync_acct(conf->mirrors[d].replacement->bdev, 265362306a36Sopenharmony_ci bio_sectors(wbio2)); 265462306a36Sopenharmony_ci submit_bio_noacct(wbio2); 265562306a36Sopenharmony_ci } 265662306a36Sopenharmony_ci} 265762306a36Sopenharmony_ci 265862306a36Sopenharmony_ci/* 265962306a36Sopenharmony_ci * Used by fix_read_error() to decay the per rdev read_errors. 266062306a36Sopenharmony_ci * We halve the read error count for every hour that has elapsed 266162306a36Sopenharmony_ci * since the last recorded read error. 
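 *
 * Worked example with illustrative numbers: if read_errors is 40 and
 * the last error was seen just over three hours ago, hours_since_last
 * is 3 and the count decays to 40 >> 3 = 5.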
266262306a36Sopenharmony_ci * 266362306a36Sopenharmony_ci */ 266462306a36Sopenharmony_cistatic void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 266562306a36Sopenharmony_ci{ 266662306a36Sopenharmony_ci long cur_time_mon; 266762306a36Sopenharmony_ci unsigned long hours_since_last; 266862306a36Sopenharmony_ci unsigned int read_errors = atomic_read(&rdev->read_errors); 266962306a36Sopenharmony_ci 267062306a36Sopenharmony_ci cur_time_mon = ktime_get_seconds(); 267162306a36Sopenharmony_ci 267262306a36Sopenharmony_ci if (rdev->last_read_error == 0) { 267362306a36Sopenharmony_ci /* first time we've seen a read error */ 267462306a36Sopenharmony_ci rdev->last_read_error = cur_time_mon; 267562306a36Sopenharmony_ci return; 267662306a36Sopenharmony_ci } 267762306a36Sopenharmony_ci 267862306a36Sopenharmony_ci hours_since_last = (long)(cur_time_mon - 267962306a36Sopenharmony_ci rdev->last_read_error) / 3600; 268062306a36Sopenharmony_ci 268162306a36Sopenharmony_ci rdev->last_read_error = cur_time_mon; 268262306a36Sopenharmony_ci 268362306a36Sopenharmony_ci /* 268462306a36Sopenharmony_ci * if hours_since_last is > the number of bits in read_errors 268562306a36Sopenharmony_ci * just set read errors to 0. We do this to avoid 268662306a36Sopenharmony_ci * overflowing the shift of read_errors by hours_since_last. 268762306a36Sopenharmony_ci */ 268862306a36Sopenharmony_ci if (hours_since_last >= 8 * sizeof(read_errors)) 268962306a36Sopenharmony_ci atomic_set(&rdev->read_errors, 0); 269062306a36Sopenharmony_ci else 269162306a36Sopenharmony_ci atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 269262306a36Sopenharmony_ci} 269362306a36Sopenharmony_ci 269462306a36Sopenharmony_cistatic int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 269562306a36Sopenharmony_ci int sectors, struct page *page, enum req_op op) 269662306a36Sopenharmony_ci{ 269762306a36Sopenharmony_ci sector_t first_bad; 269862306a36Sopenharmony_ci int bad_sectors; 269962306a36Sopenharmony_ci 270062306a36Sopenharmony_ci if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 270162306a36Sopenharmony_ci && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) 270262306a36Sopenharmony_ci return -1; 270362306a36Sopenharmony_ci if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) 270462306a36Sopenharmony_ci /* success */ 270562306a36Sopenharmony_ci return 1; 270662306a36Sopenharmony_ci if (op == REQ_OP_WRITE) { 270762306a36Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 270862306a36Sopenharmony_ci if (!test_and_set_bit(WantReplacement, &rdev->flags)) 270962306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, 271062306a36Sopenharmony_ci &rdev->mddev->recovery); 271162306a36Sopenharmony_ci } 271262306a36Sopenharmony_ci /* need to record an error - either for the block or the device */ 271362306a36Sopenharmony_ci if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 271462306a36Sopenharmony_ci md_error(rdev->mddev, rdev); 271562306a36Sopenharmony_ci return 0; 271662306a36Sopenharmony_ci} 271762306a36Sopenharmony_ci 271862306a36Sopenharmony_ci/* 271962306a36Sopenharmony_ci * This is a kernel thread which: 272062306a36Sopenharmony_ci * 272162306a36Sopenharmony_ci * 1. Retries failed read operations on working mirrors. 272262306a36Sopenharmony_ci * 2. Updates the raid superblock when problems encounter. 272362306a36Sopenharmony_ci * 3. Performs writes following reads for array synchronising. 
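 *
 * Dispatch is keyed off the r10_bio state bits; a condensed sketch of
 * the main loop in raid10d() below:
 *
 *	R10BIO_MadeGood/WriteError -> handle_write_completed()
 *	R10BIO_IsReshape           -> reshape_request_write()
 *	R10BIO_IsSync              -> sync_request_write()
 *	R10BIO_IsRecover           -> recovery_request_write()
 *	R10BIO_ReadError           -> handle_read_error()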
272462306a36Sopenharmony_ci */ 272562306a36Sopenharmony_ci 272662306a36Sopenharmony_cistatic void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 272762306a36Sopenharmony_ci{ 272862306a36Sopenharmony_ci int sect = 0; /* Offset from r10_bio->sector */ 272962306a36Sopenharmony_ci int sectors = r10_bio->sectors, slot = r10_bio->read_slot; 273062306a36Sopenharmony_ci struct md_rdev *rdev; 273162306a36Sopenharmony_ci int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 273262306a36Sopenharmony_ci int d = r10_bio->devs[slot].devnum; 273362306a36Sopenharmony_ci 273462306a36Sopenharmony_ci /* still own a reference to this rdev, so it cannot 273562306a36Sopenharmony_ci * have been cleared recently. 273662306a36Sopenharmony_ci */ 273762306a36Sopenharmony_ci rdev = conf->mirrors[d].rdev; 273862306a36Sopenharmony_ci 273962306a36Sopenharmony_ci if (test_bit(Faulty, &rdev->flags)) 274062306a36Sopenharmony_ci /* drive has already been failed, just ignore any 274162306a36Sopenharmony_ci more fix_read_error() attempts */ 274262306a36Sopenharmony_ci return; 274362306a36Sopenharmony_ci 274462306a36Sopenharmony_ci check_decay_read_errors(mddev, rdev); 274562306a36Sopenharmony_ci atomic_inc(&rdev->read_errors); 274662306a36Sopenharmony_ci if (atomic_read(&rdev->read_errors) > max_read_errors) { 274762306a36Sopenharmony_ci pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n", 274862306a36Sopenharmony_ci mdname(mddev), rdev->bdev, 274962306a36Sopenharmony_ci atomic_read(&rdev->read_errors), max_read_errors); 275062306a36Sopenharmony_ci pr_notice("md/raid10:%s: %pg: Failing raid device\n", 275162306a36Sopenharmony_ci mdname(mddev), rdev->bdev); 275262306a36Sopenharmony_ci md_error(mddev, rdev); 275362306a36Sopenharmony_ci r10_bio->devs[slot].bio = IO_BLOCKED; 275462306a36Sopenharmony_ci return; 275562306a36Sopenharmony_ci } 275662306a36Sopenharmony_ci 275762306a36Sopenharmony_ci while(sectors) { 275862306a36Sopenharmony_ci int s = sectors; 275962306a36Sopenharmony_ci int sl = slot; 276062306a36Sopenharmony_ci int success = 0; 276162306a36Sopenharmony_ci int start; 276262306a36Sopenharmony_ci 276362306a36Sopenharmony_ci if (s > (PAGE_SIZE>>9)) 276462306a36Sopenharmony_ci s = PAGE_SIZE >> 9; 276562306a36Sopenharmony_ci 276662306a36Sopenharmony_ci rcu_read_lock(); 276762306a36Sopenharmony_ci do { 276862306a36Sopenharmony_ci sector_t first_bad; 276962306a36Sopenharmony_ci int bad_sectors; 277062306a36Sopenharmony_ci 277162306a36Sopenharmony_ci d = r10_bio->devs[sl].devnum; 277262306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 277362306a36Sopenharmony_ci if (rdev && 277462306a36Sopenharmony_ci test_bit(In_sync, &rdev->flags) && 277562306a36Sopenharmony_ci !test_bit(Faulty, &rdev->flags) && 277662306a36Sopenharmony_ci is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 277762306a36Sopenharmony_ci &first_bad, &bad_sectors) == 0) { 277862306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 277962306a36Sopenharmony_ci rcu_read_unlock(); 278062306a36Sopenharmony_ci success = sync_page_io(rdev, 278162306a36Sopenharmony_ci r10_bio->devs[sl].addr + 278262306a36Sopenharmony_ci sect, 278362306a36Sopenharmony_ci s<<9, 278462306a36Sopenharmony_ci conf->tmppage, 278562306a36Sopenharmony_ci REQ_OP_READ, false); 278662306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 278762306a36Sopenharmony_ci rcu_read_lock(); 278862306a36Sopenharmony_ci if (success) 278962306a36Sopenharmony_ci break; 279062306a36Sopenharmony_ci } 
279162306a36Sopenharmony_ci sl++; 279262306a36Sopenharmony_ci if (sl == conf->copies) 279362306a36Sopenharmony_ci sl = 0; 279462306a36Sopenharmony_ci } while (sl != slot); 279562306a36Sopenharmony_ci rcu_read_unlock(); 279662306a36Sopenharmony_ci 279762306a36Sopenharmony_ci if (!success) { 279862306a36Sopenharmony_ci /* Cannot read from anywhere, just mark the block 279962306a36Sopenharmony_ci * as bad on the first device to discourage future 280062306a36Sopenharmony_ci * reads. 280162306a36Sopenharmony_ci */ 280262306a36Sopenharmony_ci int dn = r10_bio->devs[slot].devnum; 280362306a36Sopenharmony_ci rdev = conf->mirrors[dn].rdev; 280462306a36Sopenharmony_ci 280562306a36Sopenharmony_ci if (!rdev_set_badblocks( 280662306a36Sopenharmony_ci rdev, 280762306a36Sopenharmony_ci r10_bio->devs[slot].addr 280862306a36Sopenharmony_ci + sect, 280962306a36Sopenharmony_ci s, 0)) { 281062306a36Sopenharmony_ci md_error(mddev, rdev); 281162306a36Sopenharmony_ci r10_bio->devs[slot].bio 281262306a36Sopenharmony_ci = IO_BLOCKED; 281362306a36Sopenharmony_ci } 281462306a36Sopenharmony_ci break; 281562306a36Sopenharmony_ci } 281662306a36Sopenharmony_ci 281762306a36Sopenharmony_ci start = sl; 281862306a36Sopenharmony_ci /* write it back and re-read */ 281962306a36Sopenharmony_ci rcu_read_lock(); 282062306a36Sopenharmony_ci while (sl != slot) { 282162306a36Sopenharmony_ci if (sl==0) 282262306a36Sopenharmony_ci sl = conf->copies; 282362306a36Sopenharmony_ci sl--; 282462306a36Sopenharmony_ci d = r10_bio->devs[sl].devnum; 282562306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 282662306a36Sopenharmony_ci if (!rdev || 282762306a36Sopenharmony_ci test_bit(Faulty, &rdev->flags) || 282862306a36Sopenharmony_ci !test_bit(In_sync, &rdev->flags)) 282962306a36Sopenharmony_ci continue; 283062306a36Sopenharmony_ci 283162306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 283262306a36Sopenharmony_ci rcu_read_unlock(); 283362306a36Sopenharmony_ci if (r10_sync_page_io(rdev, 283462306a36Sopenharmony_ci r10_bio->devs[sl].addr + 283562306a36Sopenharmony_ci sect, 283662306a36Sopenharmony_ci s, conf->tmppage, REQ_OP_WRITE) 283762306a36Sopenharmony_ci == 0) { 283862306a36Sopenharmony_ci /* Well, this device is dead */ 283962306a36Sopenharmony_ci pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %pg)\n", 284062306a36Sopenharmony_ci mdname(mddev), s, 284162306a36Sopenharmony_ci (unsigned long long)( 284262306a36Sopenharmony_ci sect + 284362306a36Sopenharmony_ci choose_data_offset(r10_bio, 284462306a36Sopenharmony_ci rdev)), 284562306a36Sopenharmony_ci rdev->bdev); 284662306a36Sopenharmony_ci pr_notice("md/raid10:%s: %pg: failing drive\n", 284762306a36Sopenharmony_ci mdname(mddev), 284862306a36Sopenharmony_ci rdev->bdev); 284962306a36Sopenharmony_ci } 285062306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 285162306a36Sopenharmony_ci rcu_read_lock(); 285262306a36Sopenharmony_ci } 285362306a36Sopenharmony_ci sl = start; 285462306a36Sopenharmony_ci while (sl != slot) { 285562306a36Sopenharmony_ci if (sl==0) 285662306a36Sopenharmony_ci sl = conf->copies; 285762306a36Sopenharmony_ci sl--; 285862306a36Sopenharmony_ci d = r10_bio->devs[sl].devnum; 285962306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 286062306a36Sopenharmony_ci if (!rdev || 286162306a36Sopenharmony_ci test_bit(Faulty, &rdev->flags) || 286262306a36Sopenharmony_ci !test_bit(In_sync, &rdev->flags)) 286362306a36Sopenharmony_ci continue; 286462306a36Sopenharmony_ci 286562306a36Sopenharmony_ci 
atomic_inc(&rdev->nr_pending); 286662306a36Sopenharmony_ci rcu_read_unlock(); 286762306a36Sopenharmony_ci switch (r10_sync_page_io(rdev, 286862306a36Sopenharmony_ci r10_bio->devs[sl].addr + 286962306a36Sopenharmony_ci sect, 287062306a36Sopenharmony_ci s, conf->tmppage, REQ_OP_READ)) { 287162306a36Sopenharmony_ci case 0: 287262306a36Sopenharmony_ci /* Well, this device is dead */ 287362306a36Sopenharmony_ci pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %pg)\n", 287462306a36Sopenharmony_ci mdname(mddev), s, 287562306a36Sopenharmony_ci (unsigned long long)( 287662306a36Sopenharmony_ci sect + 287762306a36Sopenharmony_ci choose_data_offset(r10_bio, rdev)), 287862306a36Sopenharmony_ci rdev->bdev); 287962306a36Sopenharmony_ci pr_notice("md/raid10:%s: %pg: failing drive\n", 288062306a36Sopenharmony_ci mdname(mddev), 288162306a36Sopenharmony_ci rdev->bdev); 288262306a36Sopenharmony_ci break; 288362306a36Sopenharmony_ci case 1: 288462306a36Sopenharmony_ci pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %pg)\n", 288562306a36Sopenharmony_ci mdname(mddev), s, 288662306a36Sopenharmony_ci (unsigned long long)( 288762306a36Sopenharmony_ci sect + 288862306a36Sopenharmony_ci choose_data_offset(r10_bio, rdev)), 288962306a36Sopenharmony_ci rdev->bdev); 289062306a36Sopenharmony_ci atomic_add(s, &rdev->corrected_errors); 289162306a36Sopenharmony_ci } 289262306a36Sopenharmony_ci 289362306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 289462306a36Sopenharmony_ci rcu_read_lock(); 289562306a36Sopenharmony_ci } 289662306a36Sopenharmony_ci rcu_read_unlock(); 289762306a36Sopenharmony_ci 289862306a36Sopenharmony_ci sectors -= s; 289962306a36Sopenharmony_ci sect += s; 290062306a36Sopenharmony_ci } 290162306a36Sopenharmony_ci} 290262306a36Sopenharmony_ci 290362306a36Sopenharmony_cistatic int narrow_write_error(struct r10bio *r10_bio, int i) 290462306a36Sopenharmony_ci{ 290562306a36Sopenharmony_ci struct bio *bio = r10_bio->master_bio; 290662306a36Sopenharmony_ci struct mddev *mddev = r10_bio->mddev; 290762306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 290862306a36Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 290962306a36Sopenharmony_ci /* bio has the data to be written to slot 'i' where 291062306a36Sopenharmony_ci * we just recently had a write error. 291162306a36Sopenharmony_ci * We repeatedly clone the bio and trim down to one block, 291262306a36Sopenharmony_ci * then try the write. Where the write fails we record 291362306a36Sopenharmony_ci * a bad block. 291462306a36Sopenharmony_ci * It is conceivable that the bio doesn't exactly align with 291562306a36Sopenharmony_ci * blocks. We must handle this. 291662306a36Sopenharmony_ci * 291762306a36Sopenharmony_ci * We currently own a reference to the rdev. 
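 *
 * Worked example of the alignment handling (hypothetical numbers):
 * with rdev->badblocks.shift == 3 and 512-byte logical blocks,
 * block_sectors is 8.  A failed write starting at sector 13 is first
 * retried for sectors 13-15 (up to the next 8-sector boundary) and
 * then in 8-sector pieces, recording a bad block for every piece
 * whose retry also fails.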
291862306a36Sopenharmony_ci */ 291962306a36Sopenharmony_ci 292062306a36Sopenharmony_ci int block_sectors; 292162306a36Sopenharmony_ci sector_t sector; 292262306a36Sopenharmony_ci int sectors; 292362306a36Sopenharmony_ci int sect_to_write = r10_bio->sectors; 292462306a36Sopenharmony_ci int ok = 1; 292562306a36Sopenharmony_ci 292662306a36Sopenharmony_ci if (rdev->badblocks.shift < 0) 292762306a36Sopenharmony_ci return 0; 292862306a36Sopenharmony_ci 292962306a36Sopenharmony_ci block_sectors = roundup(1 << rdev->badblocks.shift, 293062306a36Sopenharmony_ci bdev_logical_block_size(rdev->bdev) >> 9); 293162306a36Sopenharmony_ci sector = r10_bio->sector; 293262306a36Sopenharmony_ci sectors = ((r10_bio->sector + block_sectors) 293362306a36Sopenharmony_ci & ~(sector_t)(block_sectors - 1)) 293462306a36Sopenharmony_ci - sector; 293562306a36Sopenharmony_ci 293662306a36Sopenharmony_ci while (sect_to_write) { 293762306a36Sopenharmony_ci struct bio *wbio; 293862306a36Sopenharmony_ci sector_t wsector; 293962306a36Sopenharmony_ci if (sectors > sect_to_write) 294062306a36Sopenharmony_ci sectors = sect_to_write; 294162306a36Sopenharmony_ci /* Write at 'sector' for 'sectors' */ 294262306a36Sopenharmony_ci wbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, 294362306a36Sopenharmony_ci &mddev->bio_set); 294462306a36Sopenharmony_ci bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); 294562306a36Sopenharmony_ci wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); 294662306a36Sopenharmony_ci wbio->bi_iter.bi_sector = wsector + 294762306a36Sopenharmony_ci choose_data_offset(r10_bio, rdev); 294862306a36Sopenharmony_ci wbio->bi_opf = REQ_OP_WRITE; 294962306a36Sopenharmony_ci 295062306a36Sopenharmony_ci if (submit_bio_wait(wbio) < 0) 295162306a36Sopenharmony_ci /* Failure! */ 295262306a36Sopenharmony_ci ok = rdev_set_badblocks(rdev, wsector, 295362306a36Sopenharmony_ci sectors, 0) 295462306a36Sopenharmony_ci && ok; 295562306a36Sopenharmony_ci 295662306a36Sopenharmony_ci bio_put(wbio); 295762306a36Sopenharmony_ci sect_to_write -= sectors; 295862306a36Sopenharmony_ci sector += sectors; 295962306a36Sopenharmony_ci sectors = block_sectors; 296062306a36Sopenharmony_ci } 296162306a36Sopenharmony_ci return ok; 296262306a36Sopenharmony_ci} 296362306a36Sopenharmony_ci 296462306a36Sopenharmony_cistatic void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 296562306a36Sopenharmony_ci{ 296662306a36Sopenharmony_ci int slot = r10_bio->read_slot; 296762306a36Sopenharmony_ci struct bio *bio; 296862306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 296962306a36Sopenharmony_ci struct md_rdev *rdev = r10_bio->devs[slot].rdev; 297062306a36Sopenharmony_ci 297162306a36Sopenharmony_ci /* we got a read error. Maybe the drive is bad. Maybe just 297262306a36Sopenharmony_ci * the block and we can fix it. 297362306a36Sopenharmony_ci * We freeze all other IO, and try reading the block from 297462306a36Sopenharmony_ci * other devices. When we find one, we re-write 297562306a36Sopenharmony_ci * and check it that fixes the read error. 297662306a36Sopenharmony_ci * This is all done synchronously while the array is 297762306a36Sopenharmony_ci * frozen. 
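 *
 * Condensed sequence for the common case (array writable and the rdev
 * not marked FailFast), matching the code below:
 *
 *	freeze_array(conf, 1);
 *	fix_read_error(conf, mddev, r10_bio);
 *	unfreeze_array(conf);
 *	...
 *	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
 *	allow_barrier(conf);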
297862306a36Sopenharmony_ci */ 297962306a36Sopenharmony_ci bio = r10_bio->devs[slot].bio; 298062306a36Sopenharmony_ci bio_put(bio); 298162306a36Sopenharmony_ci r10_bio->devs[slot].bio = NULL; 298262306a36Sopenharmony_ci 298362306a36Sopenharmony_ci if (mddev->ro) 298462306a36Sopenharmony_ci r10_bio->devs[slot].bio = IO_BLOCKED; 298562306a36Sopenharmony_ci else if (!test_bit(FailFast, &rdev->flags)) { 298662306a36Sopenharmony_ci freeze_array(conf, 1); 298762306a36Sopenharmony_ci fix_read_error(conf, mddev, r10_bio); 298862306a36Sopenharmony_ci unfreeze_array(conf); 298962306a36Sopenharmony_ci } else 299062306a36Sopenharmony_ci md_error(mddev, rdev); 299162306a36Sopenharmony_ci 299262306a36Sopenharmony_ci rdev_dec_pending(rdev, mddev); 299362306a36Sopenharmony_ci r10_bio->state = 0; 299462306a36Sopenharmony_ci raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false); 299562306a36Sopenharmony_ci /* 299662306a36Sopenharmony_ci * allow_barrier after re-submit to ensure no sync io 299762306a36Sopenharmony_ci * can be issued while regular io pending. 299862306a36Sopenharmony_ci */ 299962306a36Sopenharmony_ci allow_barrier(conf); 300062306a36Sopenharmony_ci} 300162306a36Sopenharmony_ci 300262306a36Sopenharmony_cistatic void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 300362306a36Sopenharmony_ci{ 300462306a36Sopenharmony_ci /* Some sort of write request has finished and it 300562306a36Sopenharmony_ci * succeeded in writing where we thought there was a 300662306a36Sopenharmony_ci * bad block. So forget the bad block. 300762306a36Sopenharmony_ci * Or possibly if failed and we need to record 300862306a36Sopenharmony_ci * a bad block. 300962306a36Sopenharmony_ci */ 301062306a36Sopenharmony_ci int m; 301162306a36Sopenharmony_ci struct md_rdev *rdev; 301262306a36Sopenharmony_ci 301362306a36Sopenharmony_ci if (test_bit(R10BIO_IsSync, &r10_bio->state) || 301462306a36Sopenharmony_ci test_bit(R10BIO_IsRecover, &r10_bio->state)) { 301562306a36Sopenharmony_ci for (m = 0; m < conf->copies; m++) { 301662306a36Sopenharmony_ci int dev = r10_bio->devs[m].devnum; 301762306a36Sopenharmony_ci rdev = conf->mirrors[dev].rdev; 301862306a36Sopenharmony_ci if (r10_bio->devs[m].bio == NULL || 301962306a36Sopenharmony_ci r10_bio->devs[m].bio->bi_end_io == NULL) 302062306a36Sopenharmony_ci continue; 302162306a36Sopenharmony_ci if (!r10_bio->devs[m].bio->bi_status) { 302262306a36Sopenharmony_ci rdev_clear_badblocks( 302362306a36Sopenharmony_ci rdev, 302462306a36Sopenharmony_ci r10_bio->devs[m].addr, 302562306a36Sopenharmony_ci r10_bio->sectors, 0); 302662306a36Sopenharmony_ci } else { 302762306a36Sopenharmony_ci if (!rdev_set_badblocks( 302862306a36Sopenharmony_ci rdev, 302962306a36Sopenharmony_ci r10_bio->devs[m].addr, 303062306a36Sopenharmony_ci r10_bio->sectors, 0)) 303162306a36Sopenharmony_ci md_error(conf->mddev, rdev); 303262306a36Sopenharmony_ci } 303362306a36Sopenharmony_ci rdev = conf->mirrors[dev].replacement; 303462306a36Sopenharmony_ci if (r10_bio->devs[m].repl_bio == NULL || 303562306a36Sopenharmony_ci r10_bio->devs[m].repl_bio->bi_end_io == NULL) 303662306a36Sopenharmony_ci continue; 303762306a36Sopenharmony_ci 303862306a36Sopenharmony_ci if (!r10_bio->devs[m].repl_bio->bi_status) { 303962306a36Sopenharmony_ci rdev_clear_badblocks( 304062306a36Sopenharmony_ci rdev, 304162306a36Sopenharmony_ci r10_bio->devs[m].addr, 304262306a36Sopenharmony_ci r10_bio->sectors, 0); 304362306a36Sopenharmony_ci } else { 304462306a36Sopenharmony_ci if (!rdev_set_badblocks( 304562306a36Sopenharmony_ci 
rdev, 304662306a36Sopenharmony_ci r10_bio->devs[m].addr, 304762306a36Sopenharmony_ci r10_bio->sectors, 0)) 304862306a36Sopenharmony_ci md_error(conf->mddev, rdev); 304962306a36Sopenharmony_ci } 305062306a36Sopenharmony_ci } 305162306a36Sopenharmony_ci put_buf(r10_bio); 305262306a36Sopenharmony_ci } else { 305362306a36Sopenharmony_ci bool fail = false; 305462306a36Sopenharmony_ci for (m = 0; m < conf->copies; m++) { 305562306a36Sopenharmony_ci int dev = r10_bio->devs[m].devnum; 305662306a36Sopenharmony_ci struct bio *bio = r10_bio->devs[m].bio; 305762306a36Sopenharmony_ci rdev = conf->mirrors[dev].rdev; 305862306a36Sopenharmony_ci if (bio == IO_MADE_GOOD) { 305962306a36Sopenharmony_ci rdev_clear_badblocks( 306062306a36Sopenharmony_ci rdev, 306162306a36Sopenharmony_ci r10_bio->devs[m].addr, 306262306a36Sopenharmony_ci r10_bio->sectors, 0); 306362306a36Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 306462306a36Sopenharmony_ci } else if (bio != NULL && bio->bi_status) { 306562306a36Sopenharmony_ci fail = true; 306662306a36Sopenharmony_ci if (!narrow_write_error(r10_bio, m)) { 306762306a36Sopenharmony_ci md_error(conf->mddev, rdev); 306862306a36Sopenharmony_ci set_bit(R10BIO_Degraded, 306962306a36Sopenharmony_ci &r10_bio->state); 307062306a36Sopenharmony_ci } 307162306a36Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 307262306a36Sopenharmony_ci } 307362306a36Sopenharmony_ci bio = r10_bio->devs[m].repl_bio; 307462306a36Sopenharmony_ci rdev = conf->mirrors[dev].replacement; 307562306a36Sopenharmony_ci if (rdev && bio == IO_MADE_GOOD) { 307662306a36Sopenharmony_ci rdev_clear_badblocks( 307762306a36Sopenharmony_ci rdev, 307862306a36Sopenharmony_ci r10_bio->devs[m].addr, 307962306a36Sopenharmony_ci r10_bio->sectors, 0); 308062306a36Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 308162306a36Sopenharmony_ci } 308262306a36Sopenharmony_ci } 308362306a36Sopenharmony_ci if (fail) { 308462306a36Sopenharmony_ci spin_lock_irq(&conf->device_lock); 308562306a36Sopenharmony_ci list_add(&r10_bio->retry_list, &conf->bio_end_io_list); 308662306a36Sopenharmony_ci conf->nr_queued++; 308762306a36Sopenharmony_ci spin_unlock_irq(&conf->device_lock); 308862306a36Sopenharmony_ci /* 308962306a36Sopenharmony_ci * In case freeze_array() is waiting for condition 309062306a36Sopenharmony_ci * nr_pending == nr_queued + extra to be true. 
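 * The r10_bio that was just queued is counted in nr_queued, so once
 * the remaining in-flight requests drain the equality can hold; the
 * wake_up() below lets any such waiter re-evaluate it.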
309162306a36Sopenharmony_ci */ 309262306a36Sopenharmony_ci wake_up(&conf->wait_barrier); 309362306a36Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 309462306a36Sopenharmony_ci } else { 309562306a36Sopenharmony_ci if (test_bit(R10BIO_WriteError, 309662306a36Sopenharmony_ci &r10_bio->state)) 309762306a36Sopenharmony_ci close_write(r10_bio); 309862306a36Sopenharmony_ci raid_end_bio_io(r10_bio); 309962306a36Sopenharmony_ci } 310062306a36Sopenharmony_ci } 310162306a36Sopenharmony_ci} 310262306a36Sopenharmony_ci 310362306a36Sopenharmony_cistatic void raid10d(struct md_thread *thread) 310462306a36Sopenharmony_ci{ 310562306a36Sopenharmony_ci struct mddev *mddev = thread->mddev; 310662306a36Sopenharmony_ci struct r10bio *r10_bio; 310762306a36Sopenharmony_ci unsigned long flags; 310862306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 310962306a36Sopenharmony_ci struct list_head *head = &conf->retry_list; 311062306a36Sopenharmony_ci struct blk_plug plug; 311162306a36Sopenharmony_ci 311262306a36Sopenharmony_ci md_check_recovery(mddev); 311362306a36Sopenharmony_ci 311462306a36Sopenharmony_ci if (!list_empty_careful(&conf->bio_end_io_list) && 311562306a36Sopenharmony_ci !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 311662306a36Sopenharmony_ci LIST_HEAD(tmp); 311762306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 311862306a36Sopenharmony_ci if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 311962306a36Sopenharmony_ci while (!list_empty(&conf->bio_end_io_list)) { 312062306a36Sopenharmony_ci list_move(conf->bio_end_io_list.prev, &tmp); 312162306a36Sopenharmony_ci conf->nr_queued--; 312262306a36Sopenharmony_ci } 312362306a36Sopenharmony_ci } 312462306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 312562306a36Sopenharmony_ci while (!list_empty(&tmp)) { 312662306a36Sopenharmony_ci r10_bio = list_first_entry(&tmp, struct r10bio, 312762306a36Sopenharmony_ci retry_list); 312862306a36Sopenharmony_ci list_del(&r10_bio->retry_list); 312962306a36Sopenharmony_ci if (mddev->degraded) 313062306a36Sopenharmony_ci set_bit(R10BIO_Degraded, &r10_bio->state); 313162306a36Sopenharmony_ci 313262306a36Sopenharmony_ci if (test_bit(R10BIO_WriteError, 313362306a36Sopenharmony_ci &r10_bio->state)) 313462306a36Sopenharmony_ci close_write(r10_bio); 313562306a36Sopenharmony_ci raid_end_bio_io(r10_bio); 313662306a36Sopenharmony_ci } 313762306a36Sopenharmony_ci } 313862306a36Sopenharmony_ci 313962306a36Sopenharmony_ci blk_start_plug(&plug); 314062306a36Sopenharmony_ci for (;;) { 314162306a36Sopenharmony_ci 314262306a36Sopenharmony_ci flush_pending_writes(conf); 314362306a36Sopenharmony_ci 314462306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 314562306a36Sopenharmony_ci if (list_empty(head)) { 314662306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 314762306a36Sopenharmony_ci break; 314862306a36Sopenharmony_ci } 314962306a36Sopenharmony_ci r10_bio = list_entry(head->prev, struct r10bio, retry_list); 315062306a36Sopenharmony_ci list_del(head->prev); 315162306a36Sopenharmony_ci conf->nr_queued--; 315262306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 315362306a36Sopenharmony_ci 315462306a36Sopenharmony_ci mddev = r10_bio->mddev; 315562306a36Sopenharmony_ci conf = mddev->private; 315662306a36Sopenharmony_ci if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 315762306a36Sopenharmony_ci test_bit(R10BIO_WriteError, &r10_bio->state)) 315862306a36Sopenharmony_ci handle_write_completed(conf, r10_bio); 
315962306a36Sopenharmony_ci else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 316062306a36Sopenharmony_ci reshape_request_write(mddev, r10_bio); 316162306a36Sopenharmony_ci else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 316262306a36Sopenharmony_ci sync_request_write(mddev, r10_bio); 316362306a36Sopenharmony_ci else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 316462306a36Sopenharmony_ci recovery_request_write(mddev, r10_bio); 316562306a36Sopenharmony_ci else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 316662306a36Sopenharmony_ci handle_read_error(mddev, r10_bio); 316762306a36Sopenharmony_ci else 316862306a36Sopenharmony_ci WARN_ON_ONCE(1); 316962306a36Sopenharmony_ci 317062306a36Sopenharmony_ci cond_resched(); 317162306a36Sopenharmony_ci if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) 317262306a36Sopenharmony_ci md_check_recovery(mddev); 317362306a36Sopenharmony_ci } 317462306a36Sopenharmony_ci blk_finish_plug(&plug); 317562306a36Sopenharmony_ci} 317662306a36Sopenharmony_ci 317762306a36Sopenharmony_cistatic int init_resync(struct r10conf *conf) 317862306a36Sopenharmony_ci{ 317962306a36Sopenharmony_ci int ret, buffs, i; 318062306a36Sopenharmony_ci 318162306a36Sopenharmony_ci buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 318262306a36Sopenharmony_ci BUG_ON(mempool_initialized(&conf->r10buf_pool)); 318362306a36Sopenharmony_ci conf->have_replacement = 0; 318462306a36Sopenharmony_ci for (i = 0; i < conf->geo.raid_disks; i++) 318562306a36Sopenharmony_ci if (conf->mirrors[i].replacement) 318662306a36Sopenharmony_ci conf->have_replacement = 1; 318762306a36Sopenharmony_ci ret = mempool_init(&conf->r10buf_pool, buffs, 318862306a36Sopenharmony_ci r10buf_pool_alloc, r10buf_pool_free, conf); 318962306a36Sopenharmony_ci if (ret) 319062306a36Sopenharmony_ci return ret; 319162306a36Sopenharmony_ci conf->next_resync = 0; 319262306a36Sopenharmony_ci return 0; 319362306a36Sopenharmony_ci} 319462306a36Sopenharmony_ci 319562306a36Sopenharmony_cistatic struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) 319662306a36Sopenharmony_ci{ 319762306a36Sopenharmony_ci struct r10bio *r10bio = mempool_alloc(&conf->r10buf_pool, GFP_NOIO); 319862306a36Sopenharmony_ci struct rsync_pages *rp; 319962306a36Sopenharmony_ci struct bio *bio; 320062306a36Sopenharmony_ci int nalloc; 320162306a36Sopenharmony_ci int i; 320262306a36Sopenharmony_ci 320362306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 320462306a36Sopenharmony_ci test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 320562306a36Sopenharmony_ci nalloc = conf->copies; /* resync */ 320662306a36Sopenharmony_ci else 320762306a36Sopenharmony_ci nalloc = 2; /* recovery */ 320862306a36Sopenharmony_ci 320962306a36Sopenharmony_ci for (i = 0; i < nalloc; i++) { 321062306a36Sopenharmony_ci bio = r10bio->devs[i].bio; 321162306a36Sopenharmony_ci rp = bio->bi_private; 321262306a36Sopenharmony_ci bio_reset(bio, NULL, 0); 321362306a36Sopenharmony_ci bio->bi_private = rp; 321462306a36Sopenharmony_ci bio = r10bio->devs[i].repl_bio; 321562306a36Sopenharmony_ci if (bio) { 321662306a36Sopenharmony_ci rp = bio->bi_private; 321762306a36Sopenharmony_ci bio_reset(bio, NULL, 0); 321862306a36Sopenharmony_ci bio->bi_private = rp; 321962306a36Sopenharmony_ci } 322062306a36Sopenharmony_ci } 322162306a36Sopenharmony_ci return r10bio; 322262306a36Sopenharmony_ci} 322362306a36Sopenharmony_ci 322462306a36Sopenharmony_ci/* 322562306a36Sopenharmony_ci * Set cluster_sync_high since we need other nodes to add the 322662306a36Sopenharmony_ci * range 
[cluster_sync_low, cluster_sync_high] to suspend list. 322762306a36Sopenharmony_ci */ 322862306a36Sopenharmony_cistatic void raid10_set_cluster_sync_high(struct r10conf *conf) 322962306a36Sopenharmony_ci{ 323062306a36Sopenharmony_ci sector_t window_size; 323162306a36Sopenharmony_ci int extra_chunk, chunks; 323262306a36Sopenharmony_ci 323362306a36Sopenharmony_ci /* 323462306a36Sopenharmony_ci * First, here we define "stripe" as a unit which across 323562306a36Sopenharmony_ci * all member devices one time, so we get chunks by use 323662306a36Sopenharmony_ci * raid_disks / near_copies. Otherwise, if near_copies is 323762306a36Sopenharmony_ci * close to raid_disks, then resync window could increases 323862306a36Sopenharmony_ci * linearly with the increase of raid_disks, which means 323962306a36Sopenharmony_ci * we will suspend a really large IO window while it is not 324062306a36Sopenharmony_ci * necessary. If raid_disks is not divisible by near_copies, 324162306a36Sopenharmony_ci * an extra chunk is needed to ensure the whole "stripe" is 324262306a36Sopenharmony_ci * covered. 324362306a36Sopenharmony_ci */ 324462306a36Sopenharmony_ci 324562306a36Sopenharmony_ci chunks = conf->geo.raid_disks / conf->geo.near_copies; 324662306a36Sopenharmony_ci if (conf->geo.raid_disks % conf->geo.near_copies == 0) 324762306a36Sopenharmony_ci extra_chunk = 0; 324862306a36Sopenharmony_ci else 324962306a36Sopenharmony_ci extra_chunk = 1; 325062306a36Sopenharmony_ci window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; 325162306a36Sopenharmony_ci 325262306a36Sopenharmony_ci /* 325362306a36Sopenharmony_ci * At least use a 32M window to align with raid1's resync window 325462306a36Sopenharmony_ci */ 325562306a36Sopenharmony_ci window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? 325662306a36Sopenharmony_ci CLUSTER_RESYNC_WINDOW_SECTORS : window_size; 325762306a36Sopenharmony_ci 325862306a36Sopenharmony_ci conf->cluster_sync_high = conf->cluster_sync_low + window_size; 325962306a36Sopenharmony_ci} 326062306a36Sopenharmony_ci 326162306a36Sopenharmony_ci/* 326262306a36Sopenharmony_ci * perform a "sync" on one "block" 326362306a36Sopenharmony_ci * 326462306a36Sopenharmony_ci * We need to make sure that no normal I/O request - particularly write 326562306a36Sopenharmony_ci * requests - conflict with active sync requests. 326662306a36Sopenharmony_ci * 326762306a36Sopenharmony_ci * This is achieved by tracking pending requests and a 'barrier' concept 326862306a36Sopenharmony_ci * that can be installed to exclude normal IO requests. 326962306a36Sopenharmony_ci * 327062306a36Sopenharmony_ci * Resync and recovery are handled very differently. 327162306a36Sopenharmony_ci * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. 327262306a36Sopenharmony_ci * 327362306a36Sopenharmony_ci * For resync, we iterate over virtual addresses, read all copies, 327462306a36Sopenharmony_ci * and update if there are differences. If only one copy is live, 327562306a36Sopenharmony_ci * skip it. 327662306a36Sopenharmony_ci * For recovery, we iterate over physical addresses, read a good 327762306a36Sopenharmony_ci * value for each non-in_sync drive, and over-write. 327862306a36Sopenharmony_ci * 327962306a36Sopenharmony_ci * So, for recovery we may have several outstanding complex requests for a 328062306a36Sopenharmony_ci * given address, one for each out-of-sync device. We model this by allocating 328162306a36Sopenharmony_ci * a number of r10_bio structures, one for each out-of-sync device. 
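 * For example (illustrative), recovering two replaced drives that
 * share a physical address allocates two such r10_bios, each using
 * slot 0 for the read and slot 1 for the write, linked together as
 * described next.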
328262306a36Sopenharmony_ci * As we setup these structures, we collect all bio's together into a list 328362306a36Sopenharmony_ci * which we then process collectively to add pages, and then process again 328462306a36Sopenharmony_ci * to pass to submit_bio_noacct. 328562306a36Sopenharmony_ci * 328662306a36Sopenharmony_ci * The r10_bio structures are linked using a borrowed master_bio pointer. 328762306a36Sopenharmony_ci * This link is counted in ->remaining. When the r10_bio that points to NULL 328862306a36Sopenharmony_ci * has its remaining count decremented to 0, the whole complex operation 328962306a36Sopenharmony_ci * is complete. 329062306a36Sopenharmony_ci * 329162306a36Sopenharmony_ci */ 329262306a36Sopenharmony_ci 329362306a36Sopenharmony_cistatic sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, 329462306a36Sopenharmony_ci int *skipped) 329562306a36Sopenharmony_ci{ 329662306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 329762306a36Sopenharmony_ci struct r10bio *r10_bio; 329862306a36Sopenharmony_ci struct bio *biolist = NULL, *bio; 329962306a36Sopenharmony_ci sector_t max_sector, nr_sectors; 330062306a36Sopenharmony_ci int i; 330162306a36Sopenharmony_ci int max_sync; 330262306a36Sopenharmony_ci sector_t sync_blocks; 330362306a36Sopenharmony_ci sector_t sectors_skipped = 0; 330462306a36Sopenharmony_ci int chunks_skipped = 0; 330562306a36Sopenharmony_ci sector_t chunk_mask = conf->geo.chunk_mask; 330662306a36Sopenharmony_ci int page_idx = 0; 330762306a36Sopenharmony_ci int error_disk = -1; 330862306a36Sopenharmony_ci 330962306a36Sopenharmony_ci /* 331062306a36Sopenharmony_ci * Allow skipping a full rebuild for incremental assembly 331162306a36Sopenharmony_ci * of a clean array, like RAID1 does. 331262306a36Sopenharmony_ci */ 331362306a36Sopenharmony_ci if (mddev->bitmap == NULL && 331462306a36Sopenharmony_ci mddev->recovery_cp == MaxSector && 331562306a36Sopenharmony_ci mddev->reshape_position == MaxSector && 331662306a36Sopenharmony_ci !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 331762306a36Sopenharmony_ci !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 331862306a36Sopenharmony_ci !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 331962306a36Sopenharmony_ci conf->fullsync == 0) { 332062306a36Sopenharmony_ci *skipped = 1; 332162306a36Sopenharmony_ci return mddev->dev_sectors - sector_nr; 332262306a36Sopenharmony_ci } 332362306a36Sopenharmony_ci 332462306a36Sopenharmony_ci if (!mempool_initialized(&conf->r10buf_pool)) 332562306a36Sopenharmony_ci if (init_resync(conf)) 332662306a36Sopenharmony_ci return 0; 332762306a36Sopenharmony_ci 332862306a36Sopenharmony_ci skipped: 332962306a36Sopenharmony_ci max_sector = mddev->dev_sectors; 333062306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 333162306a36Sopenharmony_ci test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 333262306a36Sopenharmony_ci max_sector = mddev->resync_max_sectors; 333362306a36Sopenharmony_ci if (sector_nr >= max_sector) { 333462306a36Sopenharmony_ci conf->cluster_sync_low = 0; 333562306a36Sopenharmony_ci conf->cluster_sync_high = 0; 333662306a36Sopenharmony_ci 333762306a36Sopenharmony_ci /* If we aborted, we need to abort the 333862306a36Sopenharmony_ci * sync on the 'current' bitmap chucks (there can 333962306a36Sopenharmony_ci * be several when recovering multiple devices). 334062306a36Sopenharmony_ci * as we may have started syncing it but not finished. 
334162306a36Sopenharmony_ci * We can find the current address in 334262306a36Sopenharmony_ci * mddev->curr_resync, but for recovery, 334362306a36Sopenharmony_ci * we need to convert that to several 334462306a36Sopenharmony_ci * virtual addresses. 334562306a36Sopenharmony_ci */ 334662306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 334762306a36Sopenharmony_ci end_reshape(conf); 334862306a36Sopenharmony_ci close_sync(conf); 334962306a36Sopenharmony_ci return 0; 335062306a36Sopenharmony_ci } 335162306a36Sopenharmony_ci 335262306a36Sopenharmony_ci if (mddev->curr_resync < max_sector) { /* aborted */ 335362306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 335462306a36Sopenharmony_ci md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 335562306a36Sopenharmony_ci &sync_blocks, 1); 335662306a36Sopenharmony_ci else for (i = 0; i < conf->geo.raid_disks; i++) { 335762306a36Sopenharmony_ci sector_t sect = 335862306a36Sopenharmony_ci raid10_find_virt(conf, mddev->curr_resync, i); 335962306a36Sopenharmony_ci md_bitmap_end_sync(mddev->bitmap, sect, 336062306a36Sopenharmony_ci &sync_blocks, 1); 336162306a36Sopenharmony_ci } 336262306a36Sopenharmony_ci } else { 336362306a36Sopenharmony_ci /* completed sync */ 336462306a36Sopenharmony_ci if ((!mddev->bitmap || conf->fullsync) 336562306a36Sopenharmony_ci && conf->have_replacement 336662306a36Sopenharmony_ci && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 336762306a36Sopenharmony_ci /* Completed a full sync so the replacements 336862306a36Sopenharmony_ci * are now fully recovered. 336962306a36Sopenharmony_ci */ 337062306a36Sopenharmony_ci rcu_read_lock(); 337162306a36Sopenharmony_ci for (i = 0; i < conf->geo.raid_disks; i++) { 337262306a36Sopenharmony_ci struct md_rdev *rdev = 337362306a36Sopenharmony_ci rcu_dereference(conf->mirrors[i].replacement); 337462306a36Sopenharmony_ci if (rdev) 337562306a36Sopenharmony_ci rdev->recovery_offset = MaxSector; 337662306a36Sopenharmony_ci } 337762306a36Sopenharmony_ci rcu_read_unlock(); 337862306a36Sopenharmony_ci } 337962306a36Sopenharmony_ci conf->fullsync = 0; 338062306a36Sopenharmony_ci } 338162306a36Sopenharmony_ci md_bitmap_close_sync(mddev->bitmap); 338262306a36Sopenharmony_ci close_sync(conf); 338362306a36Sopenharmony_ci *skipped = 1; 338462306a36Sopenharmony_ci return sectors_skipped; 338562306a36Sopenharmony_ci } 338662306a36Sopenharmony_ci 338762306a36Sopenharmony_ci if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 338862306a36Sopenharmony_ci return reshape_request(mddev, sector_nr, skipped); 338962306a36Sopenharmony_ci 339062306a36Sopenharmony_ci if (chunks_skipped >= conf->geo.raid_disks) { 339162306a36Sopenharmony_ci pr_err("md/raid10:%s: %s fails\n", mdname(mddev), 339262306a36Sopenharmony_ci test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); 339362306a36Sopenharmony_ci if (error_disk >= 0 && 339462306a36Sopenharmony_ci !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 339562306a36Sopenharmony_ci /* 339662306a36Sopenharmony_ci * recovery fails, set mirrors.recovery_disabled, 339762306a36Sopenharmony_ci * device shouldn't be added to there. 
339862306a36Sopenharmony_ci			 */
339962306a36Sopenharmony_ci			conf->mirrors[error_disk].recovery_disabled =
340062306a36Sopenharmony_ci						mddev->recovery_disabled;
340162306a36Sopenharmony_ci			return 0;
340262306a36Sopenharmony_ci		}
340362306a36Sopenharmony_ci		/*
340462306a36Sopenharmony_ci		 * if there has been nothing to do on any drive,
340562306a36Sopenharmony_ci		 * then there is nothing to do at all.
340662306a36Sopenharmony_ci		 */
340762306a36Sopenharmony_ci		*skipped = 1;
340862306a36Sopenharmony_ci		return (max_sector - sector_nr) + sectors_skipped;
340962306a36Sopenharmony_ci	}
341062306a36Sopenharmony_ci
341162306a36Sopenharmony_ci	if (max_sector > mddev->resync_max)
341262306a36Sopenharmony_ci		max_sector = mddev->resync_max; /* Don't do IO beyond here */
341362306a36Sopenharmony_ci
341462306a36Sopenharmony_ci	/* make sure the whole request will fit in a chunk - if chunks
341562306a36Sopenharmony_ci	 * are meaningful
341662306a36Sopenharmony_ci	 */
341762306a36Sopenharmony_ci	if (conf->geo.near_copies < conf->geo.raid_disks &&
341862306a36Sopenharmony_ci	    max_sector > (sector_nr | chunk_mask))
341962306a36Sopenharmony_ci		max_sector = (sector_nr | chunk_mask) + 1;
342062306a36Sopenharmony_ci
342162306a36Sopenharmony_ci	/*
342262306a36Sopenharmony_ci	 * If there is non-resync activity waiting for a turn, then let it
342362306a36Sopenharmony_ci	 * through before starting on this new sync request.
342462306a36Sopenharmony_ci	 */
342562306a36Sopenharmony_ci	if (conf->nr_waiting)
342662306a36Sopenharmony_ci		schedule_timeout_uninterruptible(1);
342762306a36Sopenharmony_ci
342862306a36Sopenharmony_ci	/* Again, very different code for resync and recovery.
342962306a36Sopenharmony_ci	 * Both must result in an r10bio with a list of bios that
343062306a36Sopenharmony_ci	 * have bi_end_io, bi_sector, bi_bdev set,
343162306a36Sopenharmony_ci	 * and bi_private set to the r10bio.
343262306a36Sopenharmony_ci	 * For recovery, we may actually create several r10bios
343362306a36Sopenharmony_ci	 * with 2 bios in each, that correspond to the bios in the main one.
343462306a36Sopenharmony_ci	 * In this case, the subordinate r10bios link back through a
343562306a36Sopenharmony_ci	 * borrowed master_bio pointer, and the counter in the master
343662306a36Sopenharmony_ci	 * includes a ref from each subordinate.
343762306a36Sopenharmony_ci	 */
343862306a36Sopenharmony_ci	/* First, we decide what to do and set ->bi_end_io
343962306a36Sopenharmony_ci	 * to end_sync_read if we want to read, and
344062306a36Sopenharmony_ci	 * end_sync_write if we will want to write.
344162306a36Sopenharmony_ci	 */
344262306a36Sopenharmony_ci
344362306a36Sopenharmony_ci	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
344462306a36Sopenharmony_ci	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
344562306a36Sopenharmony_ci		/* recovery...
the complicated one */ 344662306a36Sopenharmony_ci int j; 344762306a36Sopenharmony_ci r10_bio = NULL; 344862306a36Sopenharmony_ci 344962306a36Sopenharmony_ci for (i = 0 ; i < conf->geo.raid_disks; i++) { 345062306a36Sopenharmony_ci int still_degraded; 345162306a36Sopenharmony_ci struct r10bio *rb2; 345262306a36Sopenharmony_ci sector_t sect; 345362306a36Sopenharmony_ci int must_sync; 345462306a36Sopenharmony_ci int any_working; 345562306a36Sopenharmony_ci struct raid10_info *mirror = &conf->mirrors[i]; 345662306a36Sopenharmony_ci struct md_rdev *mrdev, *mreplace; 345762306a36Sopenharmony_ci 345862306a36Sopenharmony_ci rcu_read_lock(); 345962306a36Sopenharmony_ci mrdev = rcu_dereference(mirror->rdev); 346062306a36Sopenharmony_ci mreplace = rcu_dereference(mirror->replacement); 346162306a36Sopenharmony_ci 346262306a36Sopenharmony_ci if (mrdev && (test_bit(Faulty, &mrdev->flags) || 346362306a36Sopenharmony_ci test_bit(In_sync, &mrdev->flags))) 346462306a36Sopenharmony_ci mrdev = NULL; 346562306a36Sopenharmony_ci if (mreplace && test_bit(Faulty, &mreplace->flags)) 346662306a36Sopenharmony_ci mreplace = NULL; 346762306a36Sopenharmony_ci 346862306a36Sopenharmony_ci if (!mrdev && !mreplace) { 346962306a36Sopenharmony_ci rcu_read_unlock(); 347062306a36Sopenharmony_ci continue; 347162306a36Sopenharmony_ci } 347262306a36Sopenharmony_ci 347362306a36Sopenharmony_ci still_degraded = 0; 347462306a36Sopenharmony_ci /* want to reconstruct this device */ 347562306a36Sopenharmony_ci rb2 = r10_bio; 347662306a36Sopenharmony_ci sect = raid10_find_virt(conf, sector_nr, i); 347762306a36Sopenharmony_ci if (sect >= mddev->resync_max_sectors) { 347862306a36Sopenharmony_ci /* last stripe is not complete - don't 347962306a36Sopenharmony_ci * try to recover this sector. 348062306a36Sopenharmony_ci */ 348162306a36Sopenharmony_ci rcu_read_unlock(); 348262306a36Sopenharmony_ci continue; 348362306a36Sopenharmony_ci } 348462306a36Sopenharmony_ci /* Unless we are doing a full sync, or a replacement 348562306a36Sopenharmony_ci * we only need to recover the block if it is set in 348662306a36Sopenharmony_ci * the bitmap 348762306a36Sopenharmony_ci */ 348862306a36Sopenharmony_ci must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 348962306a36Sopenharmony_ci &sync_blocks, 1); 349062306a36Sopenharmony_ci if (sync_blocks < max_sync) 349162306a36Sopenharmony_ci max_sync = sync_blocks; 349262306a36Sopenharmony_ci if (!must_sync && 349362306a36Sopenharmony_ci mreplace == NULL && 349462306a36Sopenharmony_ci !conf->fullsync) { 349562306a36Sopenharmony_ci /* yep, skip the sync_blocks here, but don't assume 349662306a36Sopenharmony_ci * that there will never be anything to do here 349762306a36Sopenharmony_ci */ 349862306a36Sopenharmony_ci chunks_skipped = -1; 349962306a36Sopenharmony_ci rcu_read_unlock(); 350062306a36Sopenharmony_ci continue; 350162306a36Sopenharmony_ci } 350262306a36Sopenharmony_ci if (mrdev) 350362306a36Sopenharmony_ci atomic_inc(&mrdev->nr_pending); 350462306a36Sopenharmony_ci if (mreplace) 350562306a36Sopenharmony_ci atomic_inc(&mreplace->nr_pending); 350662306a36Sopenharmony_ci rcu_read_unlock(); 350762306a36Sopenharmony_ci 350862306a36Sopenharmony_ci r10_bio = raid10_alloc_init_r10buf(conf); 350962306a36Sopenharmony_ci r10_bio->state = 0; 351062306a36Sopenharmony_ci raise_barrier(conf, rb2 != NULL); 351162306a36Sopenharmony_ci atomic_set(&r10_bio->remaining, 0); 351262306a36Sopenharmony_ci 351362306a36Sopenharmony_ci r10_bio->master_bio = (struct bio*)rb2; 351462306a36Sopenharmony_ci if (rb2) 
351562306a36Sopenharmony_ci atomic_inc(&rb2->remaining); 351662306a36Sopenharmony_ci r10_bio->mddev = mddev; 351762306a36Sopenharmony_ci set_bit(R10BIO_IsRecover, &r10_bio->state); 351862306a36Sopenharmony_ci r10_bio->sector = sect; 351962306a36Sopenharmony_ci 352062306a36Sopenharmony_ci raid10_find_phys(conf, r10_bio); 352162306a36Sopenharmony_ci 352262306a36Sopenharmony_ci /* Need to check if the array will still be 352362306a36Sopenharmony_ci * degraded 352462306a36Sopenharmony_ci */ 352562306a36Sopenharmony_ci rcu_read_lock(); 352662306a36Sopenharmony_ci for (j = 0; j < conf->geo.raid_disks; j++) { 352762306a36Sopenharmony_ci struct md_rdev *rdev = rcu_dereference( 352862306a36Sopenharmony_ci conf->mirrors[j].rdev); 352962306a36Sopenharmony_ci if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 353062306a36Sopenharmony_ci still_degraded = 1; 353162306a36Sopenharmony_ci break; 353262306a36Sopenharmony_ci } 353362306a36Sopenharmony_ci } 353462306a36Sopenharmony_ci 353562306a36Sopenharmony_ci must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 353662306a36Sopenharmony_ci &sync_blocks, still_degraded); 353762306a36Sopenharmony_ci 353862306a36Sopenharmony_ci any_working = 0; 353962306a36Sopenharmony_ci for (j=0; j<conf->copies;j++) { 354062306a36Sopenharmony_ci int k; 354162306a36Sopenharmony_ci int d = r10_bio->devs[j].devnum; 354262306a36Sopenharmony_ci sector_t from_addr, to_addr; 354362306a36Sopenharmony_ci struct md_rdev *rdev = 354462306a36Sopenharmony_ci rcu_dereference(conf->mirrors[d].rdev); 354562306a36Sopenharmony_ci sector_t sector, first_bad; 354662306a36Sopenharmony_ci int bad_sectors; 354762306a36Sopenharmony_ci if (!rdev || 354862306a36Sopenharmony_ci !test_bit(In_sync, &rdev->flags)) 354962306a36Sopenharmony_ci continue; 355062306a36Sopenharmony_ci /* This is where we read from */ 355162306a36Sopenharmony_ci any_working = 1; 355262306a36Sopenharmony_ci sector = r10_bio->devs[j].addr; 355362306a36Sopenharmony_ci 355462306a36Sopenharmony_ci if (is_badblock(rdev, sector, max_sync, 355562306a36Sopenharmony_ci &first_bad, &bad_sectors)) { 355662306a36Sopenharmony_ci if (first_bad > sector) 355762306a36Sopenharmony_ci max_sync = first_bad - sector; 355862306a36Sopenharmony_ci else { 355962306a36Sopenharmony_ci bad_sectors -= (sector 356062306a36Sopenharmony_ci - first_bad); 356162306a36Sopenharmony_ci if (max_sync > bad_sectors) 356262306a36Sopenharmony_ci max_sync = bad_sectors; 356362306a36Sopenharmony_ci continue; 356462306a36Sopenharmony_ci } 356562306a36Sopenharmony_ci } 356662306a36Sopenharmony_ci bio = r10_bio->devs[0].bio; 356762306a36Sopenharmony_ci bio->bi_next = biolist; 356862306a36Sopenharmony_ci biolist = bio; 356962306a36Sopenharmony_ci bio->bi_end_io = end_sync_read; 357062306a36Sopenharmony_ci bio->bi_opf = REQ_OP_READ; 357162306a36Sopenharmony_ci if (test_bit(FailFast, &rdev->flags)) 357262306a36Sopenharmony_ci bio->bi_opf |= MD_FAILFAST; 357362306a36Sopenharmony_ci from_addr = r10_bio->devs[j].addr; 357462306a36Sopenharmony_ci bio->bi_iter.bi_sector = from_addr + 357562306a36Sopenharmony_ci rdev->data_offset; 357662306a36Sopenharmony_ci bio_set_dev(bio, rdev->bdev); 357762306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 357862306a36Sopenharmony_ci /* and we write to 'i' (if not in_sync) */ 357962306a36Sopenharmony_ci 358062306a36Sopenharmony_ci for (k=0; k<conf->copies; k++) 358162306a36Sopenharmony_ci if (r10_bio->devs[k].devnum == i) 358262306a36Sopenharmony_ci break; 358362306a36Sopenharmony_ci BUG_ON(k == conf->copies); 358462306a36Sopenharmony_ci 
to_addr = r10_bio->devs[k].addr; 358562306a36Sopenharmony_ci r10_bio->devs[0].devnum = d; 358662306a36Sopenharmony_ci r10_bio->devs[0].addr = from_addr; 358762306a36Sopenharmony_ci r10_bio->devs[1].devnum = i; 358862306a36Sopenharmony_ci r10_bio->devs[1].addr = to_addr; 358962306a36Sopenharmony_ci 359062306a36Sopenharmony_ci if (mrdev) { 359162306a36Sopenharmony_ci bio = r10_bio->devs[1].bio; 359262306a36Sopenharmony_ci bio->bi_next = biolist; 359362306a36Sopenharmony_ci biolist = bio; 359462306a36Sopenharmony_ci bio->bi_end_io = end_sync_write; 359562306a36Sopenharmony_ci bio->bi_opf = REQ_OP_WRITE; 359662306a36Sopenharmony_ci bio->bi_iter.bi_sector = to_addr 359762306a36Sopenharmony_ci + mrdev->data_offset; 359862306a36Sopenharmony_ci bio_set_dev(bio, mrdev->bdev); 359962306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 360062306a36Sopenharmony_ci } else 360162306a36Sopenharmony_ci r10_bio->devs[1].bio->bi_end_io = NULL; 360262306a36Sopenharmony_ci 360362306a36Sopenharmony_ci /* and maybe write to replacement */ 360462306a36Sopenharmony_ci bio = r10_bio->devs[1].repl_bio; 360562306a36Sopenharmony_ci if (bio) 360662306a36Sopenharmony_ci bio->bi_end_io = NULL; 360762306a36Sopenharmony_ci /* Note: if replace is not NULL, then bio 360862306a36Sopenharmony_ci * cannot be NULL as r10buf_pool_alloc will 360962306a36Sopenharmony_ci * have allocated it. 361062306a36Sopenharmony_ci */ 361162306a36Sopenharmony_ci if (!mreplace) 361262306a36Sopenharmony_ci break; 361362306a36Sopenharmony_ci bio->bi_next = biolist; 361462306a36Sopenharmony_ci biolist = bio; 361562306a36Sopenharmony_ci bio->bi_end_io = end_sync_write; 361662306a36Sopenharmony_ci bio->bi_opf = REQ_OP_WRITE; 361762306a36Sopenharmony_ci bio->bi_iter.bi_sector = to_addr + 361862306a36Sopenharmony_ci mreplace->data_offset; 361962306a36Sopenharmony_ci bio_set_dev(bio, mreplace->bdev); 362062306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 362162306a36Sopenharmony_ci break; 362262306a36Sopenharmony_ci } 362362306a36Sopenharmony_ci rcu_read_unlock(); 362462306a36Sopenharmony_ci if (j == conf->copies) { 362562306a36Sopenharmony_ci /* Cannot recover, so abort the recovery or 362662306a36Sopenharmony_ci * record a bad block */ 362762306a36Sopenharmony_ci if (any_working) { 362862306a36Sopenharmony_ci /* problem is that there are bad blocks 362962306a36Sopenharmony_ci * on other device(s) 363062306a36Sopenharmony_ci */ 363162306a36Sopenharmony_ci int k; 363262306a36Sopenharmony_ci for (k = 0; k < conf->copies; k++) 363362306a36Sopenharmony_ci if (r10_bio->devs[k].devnum == i) 363462306a36Sopenharmony_ci break; 363562306a36Sopenharmony_ci if (mrdev && !test_bit(In_sync, 363662306a36Sopenharmony_ci &mrdev->flags) 363762306a36Sopenharmony_ci && !rdev_set_badblocks( 363862306a36Sopenharmony_ci mrdev, 363962306a36Sopenharmony_ci r10_bio->devs[k].addr, 364062306a36Sopenharmony_ci max_sync, 0)) 364162306a36Sopenharmony_ci any_working = 0; 364262306a36Sopenharmony_ci if (mreplace && 364362306a36Sopenharmony_ci !rdev_set_badblocks( 364462306a36Sopenharmony_ci mreplace, 364562306a36Sopenharmony_ci r10_bio->devs[k].addr, 364662306a36Sopenharmony_ci max_sync, 0)) 364762306a36Sopenharmony_ci any_working = 0; 364862306a36Sopenharmony_ci } 364962306a36Sopenharmony_ci if (!any_working) { 365062306a36Sopenharmony_ci if (!test_and_set_bit(MD_RECOVERY_INTR, 365162306a36Sopenharmony_ci &mddev->recovery)) 365262306a36Sopenharmony_ci pr_warn("md/raid10:%s: insufficient working devices for recovery.\n", 365362306a36Sopenharmony_ci mdname(mddev)); 
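					/*
					 * Remember the current recovery_disabled
					 * value for this mirror so that the slot
					 * is not immediately re-used for another
					 * recovery attempt that would fail the
					 * same way (checked in raid10_add_disk()).
					 */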
365462306a36Sopenharmony_ci mirror->recovery_disabled 365562306a36Sopenharmony_ci = mddev->recovery_disabled; 365662306a36Sopenharmony_ci } else { 365762306a36Sopenharmony_ci error_disk = i; 365862306a36Sopenharmony_ci } 365962306a36Sopenharmony_ci put_buf(r10_bio); 366062306a36Sopenharmony_ci if (rb2) 366162306a36Sopenharmony_ci atomic_dec(&rb2->remaining); 366262306a36Sopenharmony_ci r10_bio = rb2; 366362306a36Sopenharmony_ci if (mrdev) 366462306a36Sopenharmony_ci rdev_dec_pending(mrdev, mddev); 366562306a36Sopenharmony_ci if (mreplace) 366662306a36Sopenharmony_ci rdev_dec_pending(mreplace, mddev); 366762306a36Sopenharmony_ci break; 366862306a36Sopenharmony_ci } 366962306a36Sopenharmony_ci if (mrdev) 367062306a36Sopenharmony_ci rdev_dec_pending(mrdev, mddev); 367162306a36Sopenharmony_ci if (mreplace) 367262306a36Sopenharmony_ci rdev_dec_pending(mreplace, mddev); 367362306a36Sopenharmony_ci if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { 367462306a36Sopenharmony_ci /* Only want this if there is elsewhere to 367562306a36Sopenharmony_ci * read from. 'j' is currently the first 367662306a36Sopenharmony_ci * readable copy. 367762306a36Sopenharmony_ci */ 367862306a36Sopenharmony_ci int targets = 1; 367962306a36Sopenharmony_ci for (; j < conf->copies; j++) { 368062306a36Sopenharmony_ci int d = r10_bio->devs[j].devnum; 368162306a36Sopenharmony_ci if (conf->mirrors[d].rdev && 368262306a36Sopenharmony_ci test_bit(In_sync, 368362306a36Sopenharmony_ci &conf->mirrors[d].rdev->flags)) 368462306a36Sopenharmony_ci targets++; 368562306a36Sopenharmony_ci } 368662306a36Sopenharmony_ci if (targets == 1) 368762306a36Sopenharmony_ci r10_bio->devs[0].bio->bi_opf 368862306a36Sopenharmony_ci &= ~MD_FAILFAST; 368962306a36Sopenharmony_ci } 369062306a36Sopenharmony_ci } 369162306a36Sopenharmony_ci if (biolist == NULL) { 369262306a36Sopenharmony_ci while (r10_bio) { 369362306a36Sopenharmony_ci struct r10bio *rb2 = r10_bio; 369462306a36Sopenharmony_ci r10_bio = (struct r10bio*) rb2->master_bio; 369562306a36Sopenharmony_ci rb2->master_bio = NULL; 369662306a36Sopenharmony_ci put_buf(rb2); 369762306a36Sopenharmony_ci } 369862306a36Sopenharmony_ci goto giveup; 369962306a36Sopenharmony_ci } 370062306a36Sopenharmony_ci } else { 370162306a36Sopenharmony_ci /* resync. Schedule a read for every block at this virt offset */ 370262306a36Sopenharmony_ci int count = 0; 370362306a36Sopenharmony_ci 370462306a36Sopenharmony_ci /* 370562306a36Sopenharmony_ci * Since curr_resync_completed could probably not update in 370662306a36Sopenharmony_ci * time, and we will set cluster_sync_low based on it. 370762306a36Sopenharmony_ci * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for 370862306a36Sopenharmony_ci * safety reason, which ensures curr_resync_completed is 370962306a36Sopenharmony_ci * updated in bitmap_cond_end_sync. 
371062306a36Sopenharmony_ci */ 371162306a36Sopenharmony_ci md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, 371262306a36Sopenharmony_ci mddev_is_clustered(mddev) && 371362306a36Sopenharmony_ci (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 371462306a36Sopenharmony_ci 371562306a36Sopenharmony_ci if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, 371662306a36Sopenharmony_ci &sync_blocks, mddev->degraded) && 371762306a36Sopenharmony_ci !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 371862306a36Sopenharmony_ci &mddev->recovery)) { 371962306a36Sopenharmony_ci /* We can skip this block */ 372062306a36Sopenharmony_ci *skipped = 1; 372162306a36Sopenharmony_ci return sync_blocks + sectors_skipped; 372262306a36Sopenharmony_ci } 372362306a36Sopenharmony_ci if (sync_blocks < max_sync) 372462306a36Sopenharmony_ci max_sync = sync_blocks; 372562306a36Sopenharmony_ci r10_bio = raid10_alloc_init_r10buf(conf); 372662306a36Sopenharmony_ci r10_bio->state = 0; 372762306a36Sopenharmony_ci 372862306a36Sopenharmony_ci r10_bio->mddev = mddev; 372962306a36Sopenharmony_ci atomic_set(&r10_bio->remaining, 0); 373062306a36Sopenharmony_ci raise_barrier(conf, 0); 373162306a36Sopenharmony_ci conf->next_resync = sector_nr; 373262306a36Sopenharmony_ci 373362306a36Sopenharmony_ci r10_bio->master_bio = NULL; 373462306a36Sopenharmony_ci r10_bio->sector = sector_nr; 373562306a36Sopenharmony_ci set_bit(R10BIO_IsSync, &r10_bio->state); 373662306a36Sopenharmony_ci raid10_find_phys(conf, r10_bio); 373762306a36Sopenharmony_ci r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 373862306a36Sopenharmony_ci 373962306a36Sopenharmony_ci for (i = 0; i < conf->copies; i++) { 374062306a36Sopenharmony_ci int d = r10_bio->devs[i].devnum; 374162306a36Sopenharmony_ci sector_t first_bad, sector; 374262306a36Sopenharmony_ci int bad_sectors; 374362306a36Sopenharmony_ci struct md_rdev *rdev; 374462306a36Sopenharmony_ci 374562306a36Sopenharmony_ci if (r10_bio->devs[i].repl_bio) 374662306a36Sopenharmony_ci r10_bio->devs[i].repl_bio->bi_end_io = NULL; 374762306a36Sopenharmony_ci 374862306a36Sopenharmony_ci bio = r10_bio->devs[i].bio; 374962306a36Sopenharmony_ci bio->bi_status = BLK_STS_IOERR; 375062306a36Sopenharmony_ci rcu_read_lock(); 375162306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 375262306a36Sopenharmony_ci if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 375362306a36Sopenharmony_ci rcu_read_unlock(); 375462306a36Sopenharmony_ci continue; 375562306a36Sopenharmony_ci } 375662306a36Sopenharmony_ci sector = r10_bio->devs[i].addr; 375762306a36Sopenharmony_ci if (is_badblock(rdev, sector, max_sync, 375862306a36Sopenharmony_ci &first_bad, &bad_sectors)) { 375962306a36Sopenharmony_ci if (first_bad > sector) 376062306a36Sopenharmony_ci max_sync = first_bad - sector; 376162306a36Sopenharmony_ci else { 376262306a36Sopenharmony_ci bad_sectors -= (sector - first_bad); 376362306a36Sopenharmony_ci if (max_sync > bad_sectors) 376462306a36Sopenharmony_ci max_sync = bad_sectors; 376562306a36Sopenharmony_ci rcu_read_unlock(); 376662306a36Sopenharmony_ci continue; 376762306a36Sopenharmony_ci } 376862306a36Sopenharmony_ci } 376962306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 377062306a36Sopenharmony_ci atomic_inc(&r10_bio->remaining); 377162306a36Sopenharmony_ci bio->bi_next = biolist; 377262306a36Sopenharmony_ci biolist = bio; 377362306a36Sopenharmony_ci bio->bi_end_io = end_sync_read; 377462306a36Sopenharmony_ci bio->bi_opf = REQ_OP_READ; 377562306a36Sopenharmony_ci if (test_bit(FailFast, 
&rdev->flags)) 377662306a36Sopenharmony_ci bio->bi_opf |= MD_FAILFAST; 377762306a36Sopenharmony_ci bio->bi_iter.bi_sector = sector + rdev->data_offset; 377862306a36Sopenharmony_ci bio_set_dev(bio, rdev->bdev); 377962306a36Sopenharmony_ci count++; 378062306a36Sopenharmony_ci 378162306a36Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].replacement); 378262306a36Sopenharmony_ci if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 378362306a36Sopenharmony_ci rcu_read_unlock(); 378462306a36Sopenharmony_ci continue; 378562306a36Sopenharmony_ci } 378662306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 378762306a36Sopenharmony_ci 378862306a36Sopenharmony_ci /* Need to set up for writing to the replacement */ 378962306a36Sopenharmony_ci bio = r10_bio->devs[i].repl_bio; 379062306a36Sopenharmony_ci bio->bi_status = BLK_STS_IOERR; 379162306a36Sopenharmony_ci 379262306a36Sopenharmony_ci sector = r10_bio->devs[i].addr; 379362306a36Sopenharmony_ci bio->bi_next = biolist; 379462306a36Sopenharmony_ci biolist = bio; 379562306a36Sopenharmony_ci bio->bi_end_io = end_sync_write; 379662306a36Sopenharmony_ci bio->bi_opf = REQ_OP_WRITE; 379762306a36Sopenharmony_ci if (test_bit(FailFast, &rdev->flags)) 379862306a36Sopenharmony_ci bio->bi_opf |= MD_FAILFAST; 379962306a36Sopenharmony_ci bio->bi_iter.bi_sector = sector + rdev->data_offset; 380062306a36Sopenharmony_ci bio_set_dev(bio, rdev->bdev); 380162306a36Sopenharmony_ci count++; 380262306a36Sopenharmony_ci rcu_read_unlock(); 380362306a36Sopenharmony_ci } 380462306a36Sopenharmony_ci 380562306a36Sopenharmony_ci if (count < 2) { 380662306a36Sopenharmony_ci for (i=0; i<conf->copies; i++) { 380762306a36Sopenharmony_ci int d = r10_bio->devs[i].devnum; 380862306a36Sopenharmony_ci if (r10_bio->devs[i].bio->bi_end_io) 380962306a36Sopenharmony_ci rdev_dec_pending(conf->mirrors[d].rdev, 381062306a36Sopenharmony_ci mddev); 381162306a36Sopenharmony_ci if (r10_bio->devs[i].repl_bio && 381262306a36Sopenharmony_ci r10_bio->devs[i].repl_bio->bi_end_io) 381362306a36Sopenharmony_ci rdev_dec_pending( 381462306a36Sopenharmony_ci conf->mirrors[d].replacement, 381562306a36Sopenharmony_ci mddev); 381662306a36Sopenharmony_ci } 381762306a36Sopenharmony_ci put_buf(r10_bio); 381862306a36Sopenharmony_ci biolist = NULL; 381962306a36Sopenharmony_ci goto giveup; 382062306a36Sopenharmony_ci } 382162306a36Sopenharmony_ci } 382262306a36Sopenharmony_ci 382362306a36Sopenharmony_ci nr_sectors = 0; 382462306a36Sopenharmony_ci if (sector_nr + max_sync < max_sector) 382562306a36Sopenharmony_ci max_sector = sector_nr + max_sync; 382662306a36Sopenharmony_ci do { 382762306a36Sopenharmony_ci struct page *page; 382862306a36Sopenharmony_ci int len = PAGE_SIZE; 382962306a36Sopenharmony_ci if (sector_nr + (len>>9) > max_sector) 383062306a36Sopenharmony_ci len = (max_sector - sector_nr) << 9; 383162306a36Sopenharmony_ci if (len == 0) 383262306a36Sopenharmony_ci break; 383362306a36Sopenharmony_ci for (bio= biolist ; bio ; bio=bio->bi_next) { 383462306a36Sopenharmony_ci struct resync_pages *rp = get_resync_pages(bio); 383562306a36Sopenharmony_ci page = resync_fetch_page(rp, page_idx); 383662306a36Sopenharmony_ci if (WARN_ON(!bio_add_page(bio, page, len, 0))) { 383762306a36Sopenharmony_ci bio->bi_status = BLK_STS_RESOURCE; 383862306a36Sopenharmony_ci bio_endio(bio); 383962306a36Sopenharmony_ci goto giveup; 384062306a36Sopenharmony_ci } 384162306a36Sopenharmony_ci } 384262306a36Sopenharmony_ci nr_sectors += len>>9; 384362306a36Sopenharmony_ci sector_nr += len>>9; 384462306a36Sopenharmony_ci } while 
(++page_idx < RESYNC_PAGES); 384562306a36Sopenharmony_ci r10_bio->sectors = nr_sectors; 384662306a36Sopenharmony_ci 384762306a36Sopenharmony_ci if (mddev_is_clustered(mddev) && 384862306a36Sopenharmony_ci test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 384962306a36Sopenharmony_ci /* It is resync not recovery */ 385062306a36Sopenharmony_ci if (conf->cluster_sync_high < sector_nr + nr_sectors) { 385162306a36Sopenharmony_ci conf->cluster_sync_low = mddev->curr_resync_completed; 385262306a36Sopenharmony_ci raid10_set_cluster_sync_high(conf); 385362306a36Sopenharmony_ci /* Send resync message */ 385462306a36Sopenharmony_ci md_cluster_ops->resync_info_update(mddev, 385562306a36Sopenharmony_ci conf->cluster_sync_low, 385662306a36Sopenharmony_ci conf->cluster_sync_high); 385762306a36Sopenharmony_ci } 385862306a36Sopenharmony_ci } else if (mddev_is_clustered(mddev)) { 385962306a36Sopenharmony_ci /* This is recovery not resync */ 386062306a36Sopenharmony_ci sector_t sect_va1, sect_va2; 386162306a36Sopenharmony_ci bool broadcast_msg = false; 386262306a36Sopenharmony_ci 386362306a36Sopenharmony_ci for (i = 0; i < conf->geo.raid_disks; i++) { 386462306a36Sopenharmony_ci /* 386562306a36Sopenharmony_ci * sector_nr is a device address for recovery, so we 386662306a36Sopenharmony_ci * need translate it to array address before compare 386762306a36Sopenharmony_ci * with cluster_sync_high. 386862306a36Sopenharmony_ci */ 386962306a36Sopenharmony_ci sect_va1 = raid10_find_virt(conf, sector_nr, i); 387062306a36Sopenharmony_ci 387162306a36Sopenharmony_ci if (conf->cluster_sync_high < sect_va1 + nr_sectors) { 387262306a36Sopenharmony_ci broadcast_msg = true; 387362306a36Sopenharmony_ci /* 387462306a36Sopenharmony_ci * curr_resync_completed is similar as 387562306a36Sopenharmony_ci * sector_nr, so make the translation too. 
387662306a36Sopenharmony_ci */ 387762306a36Sopenharmony_ci sect_va2 = raid10_find_virt(conf, 387862306a36Sopenharmony_ci mddev->curr_resync_completed, i); 387962306a36Sopenharmony_ci 388062306a36Sopenharmony_ci if (conf->cluster_sync_low == 0 || 388162306a36Sopenharmony_ci conf->cluster_sync_low > sect_va2) 388262306a36Sopenharmony_ci conf->cluster_sync_low = sect_va2; 388362306a36Sopenharmony_ci } 388462306a36Sopenharmony_ci } 388562306a36Sopenharmony_ci if (broadcast_msg) { 388662306a36Sopenharmony_ci raid10_set_cluster_sync_high(conf); 388762306a36Sopenharmony_ci md_cluster_ops->resync_info_update(mddev, 388862306a36Sopenharmony_ci conf->cluster_sync_low, 388962306a36Sopenharmony_ci conf->cluster_sync_high); 389062306a36Sopenharmony_ci } 389162306a36Sopenharmony_ci } 389262306a36Sopenharmony_ci 389362306a36Sopenharmony_ci while (biolist) { 389462306a36Sopenharmony_ci bio = biolist; 389562306a36Sopenharmony_ci biolist = biolist->bi_next; 389662306a36Sopenharmony_ci 389762306a36Sopenharmony_ci bio->bi_next = NULL; 389862306a36Sopenharmony_ci r10_bio = get_resync_r10bio(bio); 389962306a36Sopenharmony_ci r10_bio->sectors = nr_sectors; 390062306a36Sopenharmony_ci 390162306a36Sopenharmony_ci if (bio->bi_end_io == end_sync_read) { 390262306a36Sopenharmony_ci md_sync_acct_bio(bio, nr_sectors); 390362306a36Sopenharmony_ci bio->bi_status = 0; 390462306a36Sopenharmony_ci submit_bio_noacct(bio); 390562306a36Sopenharmony_ci } 390662306a36Sopenharmony_ci } 390762306a36Sopenharmony_ci 390862306a36Sopenharmony_ci if (sectors_skipped) 390962306a36Sopenharmony_ci /* pretend they weren't skipped, it makes 391062306a36Sopenharmony_ci * no important difference in this case 391162306a36Sopenharmony_ci */ 391262306a36Sopenharmony_ci md_done_sync(mddev, sectors_skipped, 1); 391362306a36Sopenharmony_ci 391462306a36Sopenharmony_ci return sectors_skipped + nr_sectors; 391562306a36Sopenharmony_ci giveup: 391662306a36Sopenharmony_ci /* There is nowhere to write, so all non-sync 391762306a36Sopenharmony_ci * drives must be failed or in resync, all drives 391862306a36Sopenharmony_ci * have a bad block, so try the next chunk... 
391962306a36Sopenharmony_ci */ 392062306a36Sopenharmony_ci if (sector_nr + max_sync < max_sector) 392162306a36Sopenharmony_ci max_sector = sector_nr + max_sync; 392262306a36Sopenharmony_ci 392362306a36Sopenharmony_ci sectors_skipped += (max_sector - sector_nr); 392462306a36Sopenharmony_ci chunks_skipped ++; 392562306a36Sopenharmony_ci sector_nr = max_sector; 392662306a36Sopenharmony_ci goto skipped; 392762306a36Sopenharmony_ci} 392862306a36Sopenharmony_ci 392962306a36Sopenharmony_cistatic sector_t 393062306a36Sopenharmony_ciraid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 393162306a36Sopenharmony_ci{ 393262306a36Sopenharmony_ci sector_t size; 393362306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 393462306a36Sopenharmony_ci 393562306a36Sopenharmony_ci if (!raid_disks) 393662306a36Sopenharmony_ci raid_disks = min(conf->geo.raid_disks, 393762306a36Sopenharmony_ci conf->prev.raid_disks); 393862306a36Sopenharmony_ci if (!sectors) 393962306a36Sopenharmony_ci sectors = conf->dev_sectors; 394062306a36Sopenharmony_ci 394162306a36Sopenharmony_ci size = sectors >> conf->geo.chunk_shift; 394262306a36Sopenharmony_ci sector_div(size, conf->geo.far_copies); 394362306a36Sopenharmony_ci size = size * raid_disks; 394462306a36Sopenharmony_ci sector_div(size, conf->geo.near_copies); 394562306a36Sopenharmony_ci 394662306a36Sopenharmony_ci return size << conf->geo.chunk_shift; 394762306a36Sopenharmony_ci} 394862306a36Sopenharmony_ci 394962306a36Sopenharmony_cistatic void calc_sectors(struct r10conf *conf, sector_t size) 395062306a36Sopenharmony_ci{ 395162306a36Sopenharmony_ci /* Calculate the number of sectors-per-device that will 395262306a36Sopenharmony_ci * actually be used, and set conf->dev_sectors and 395362306a36Sopenharmony_ci * conf->stride 395462306a36Sopenharmony_ci */ 395562306a36Sopenharmony_ci 395662306a36Sopenharmony_ci size = size >> conf->geo.chunk_shift; 395762306a36Sopenharmony_ci sector_div(size, conf->geo.far_copies); 395862306a36Sopenharmony_ci size = size * conf->geo.raid_disks; 395962306a36Sopenharmony_ci sector_div(size, conf->geo.near_copies); 396062306a36Sopenharmony_ci /* 'size' is now the number of chunks in the array */ 396162306a36Sopenharmony_ci /* calculate "used chunks per device" */ 396262306a36Sopenharmony_ci size = size * conf->copies; 396362306a36Sopenharmony_ci 396462306a36Sopenharmony_ci /* We need to round up when dividing by raid_disks to 396562306a36Sopenharmony_ci * get the stride size. 
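	 * (For example, 20 "used" chunks spread across 3 raid disks round
	 *  up to 7 chunks per device rather than 6.)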
396662306a36Sopenharmony_ci */ 396762306a36Sopenharmony_ci size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 396862306a36Sopenharmony_ci 396962306a36Sopenharmony_ci conf->dev_sectors = size << conf->geo.chunk_shift; 397062306a36Sopenharmony_ci 397162306a36Sopenharmony_ci if (conf->geo.far_offset) 397262306a36Sopenharmony_ci conf->geo.stride = 1 << conf->geo.chunk_shift; 397362306a36Sopenharmony_ci else { 397462306a36Sopenharmony_ci sector_div(size, conf->geo.far_copies); 397562306a36Sopenharmony_ci conf->geo.stride = size << conf->geo.chunk_shift; 397662306a36Sopenharmony_ci } 397762306a36Sopenharmony_ci} 397862306a36Sopenharmony_ci 397962306a36Sopenharmony_cienum geo_type {geo_new, geo_old, geo_start}; 398062306a36Sopenharmony_cistatic int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 398162306a36Sopenharmony_ci{ 398262306a36Sopenharmony_ci int nc, fc, fo; 398362306a36Sopenharmony_ci int layout, chunk, disks; 398462306a36Sopenharmony_ci switch (new) { 398562306a36Sopenharmony_ci case geo_old: 398662306a36Sopenharmony_ci layout = mddev->layout; 398762306a36Sopenharmony_ci chunk = mddev->chunk_sectors; 398862306a36Sopenharmony_ci disks = mddev->raid_disks - mddev->delta_disks; 398962306a36Sopenharmony_ci break; 399062306a36Sopenharmony_ci case geo_new: 399162306a36Sopenharmony_ci layout = mddev->new_layout; 399262306a36Sopenharmony_ci chunk = mddev->new_chunk_sectors; 399362306a36Sopenharmony_ci disks = mddev->raid_disks; 399462306a36Sopenharmony_ci break; 399562306a36Sopenharmony_ci default: /* avoid 'may be unused' warnings */ 399662306a36Sopenharmony_ci case geo_start: /* new when starting reshape - raid_disks not 399762306a36Sopenharmony_ci * updated yet. */ 399862306a36Sopenharmony_ci layout = mddev->new_layout; 399962306a36Sopenharmony_ci chunk = mddev->new_chunk_sectors; 400062306a36Sopenharmony_ci disks = mddev->raid_disks + mddev->delta_disks; 400162306a36Sopenharmony_ci break; 400262306a36Sopenharmony_ci } 400362306a36Sopenharmony_ci if (layout >> 19) 400462306a36Sopenharmony_ci return -1; 400562306a36Sopenharmony_ci if (chunk < (PAGE_SIZE >> 9) || 400662306a36Sopenharmony_ci !is_power_of_2(chunk)) 400762306a36Sopenharmony_ci return -2; 400862306a36Sopenharmony_ci nc = layout & 255; 400962306a36Sopenharmony_ci fc = (layout >> 8) & 255; 401062306a36Sopenharmony_ci fo = layout & (1<<16); 401162306a36Sopenharmony_ci geo->raid_disks = disks; 401262306a36Sopenharmony_ci geo->near_copies = nc; 401362306a36Sopenharmony_ci geo->far_copies = fc; 401462306a36Sopenharmony_ci geo->far_offset = fo; 401562306a36Sopenharmony_ci switch (layout >> 17) { 401662306a36Sopenharmony_ci case 0: /* original layout. simple but not always optimal */ 401762306a36Sopenharmony_ci geo->far_set_size = disks; 401862306a36Sopenharmony_ci break; 401962306a36Sopenharmony_ci case 1: /* "improved" layout which was buggy. 
Hopefully no-one is 402062306a36Sopenharmony_ci * actually using this, but leave code here just in case.*/ 402162306a36Sopenharmony_ci geo->far_set_size = disks/fc; 402262306a36Sopenharmony_ci WARN(geo->far_set_size < fc, 402362306a36Sopenharmony_ci "This RAID10 layout does not provide data safety - please backup and create new array\n"); 402462306a36Sopenharmony_ci break; 402562306a36Sopenharmony_ci case 2: /* "improved" layout fixed to match documentation */ 402662306a36Sopenharmony_ci geo->far_set_size = fc * nc; 402762306a36Sopenharmony_ci break; 402862306a36Sopenharmony_ci default: /* Not a valid layout */ 402962306a36Sopenharmony_ci return -1; 403062306a36Sopenharmony_ci } 403162306a36Sopenharmony_ci geo->chunk_mask = chunk - 1; 403262306a36Sopenharmony_ci geo->chunk_shift = ffz(~chunk); 403362306a36Sopenharmony_ci return nc*fc; 403462306a36Sopenharmony_ci} 403562306a36Sopenharmony_ci 403662306a36Sopenharmony_cistatic void raid10_free_conf(struct r10conf *conf) 403762306a36Sopenharmony_ci{ 403862306a36Sopenharmony_ci if (!conf) 403962306a36Sopenharmony_ci return; 404062306a36Sopenharmony_ci 404162306a36Sopenharmony_ci mempool_exit(&conf->r10bio_pool); 404262306a36Sopenharmony_ci kfree(conf->mirrors); 404362306a36Sopenharmony_ci kfree(conf->mirrors_old); 404462306a36Sopenharmony_ci kfree(conf->mirrors_new); 404562306a36Sopenharmony_ci safe_put_page(conf->tmppage); 404662306a36Sopenharmony_ci bioset_exit(&conf->bio_split); 404762306a36Sopenharmony_ci kfree(conf); 404862306a36Sopenharmony_ci} 404962306a36Sopenharmony_ci 405062306a36Sopenharmony_cistatic struct r10conf *setup_conf(struct mddev *mddev) 405162306a36Sopenharmony_ci{ 405262306a36Sopenharmony_ci struct r10conf *conf = NULL; 405362306a36Sopenharmony_ci int err = -EINVAL; 405462306a36Sopenharmony_ci struct geom geo; 405562306a36Sopenharmony_ci int copies; 405662306a36Sopenharmony_ci 405762306a36Sopenharmony_ci copies = setup_geo(&geo, mddev, geo_new); 405862306a36Sopenharmony_ci 405962306a36Sopenharmony_ci if (copies == -2) { 406062306a36Sopenharmony_ci pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n", 406162306a36Sopenharmony_ci mdname(mddev), PAGE_SIZE); 406262306a36Sopenharmony_ci goto out; 406362306a36Sopenharmony_ci } 406462306a36Sopenharmony_ci 406562306a36Sopenharmony_ci if (copies < 2 || copies > mddev->raid_disks) { 406662306a36Sopenharmony_ci pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 406762306a36Sopenharmony_ci mdname(mddev), mddev->new_layout); 406862306a36Sopenharmony_ci goto out; 406962306a36Sopenharmony_ci } 407062306a36Sopenharmony_ci 407162306a36Sopenharmony_ci err = -ENOMEM; 407262306a36Sopenharmony_ci conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 407362306a36Sopenharmony_ci if (!conf) 407462306a36Sopenharmony_ci goto out; 407562306a36Sopenharmony_ci 407662306a36Sopenharmony_ci /* FIXME calc properly */ 407762306a36Sopenharmony_ci conf->mirrors = kcalloc(mddev->raid_disks + max(0, -mddev->delta_disks), 407862306a36Sopenharmony_ci sizeof(struct raid10_info), 407962306a36Sopenharmony_ci GFP_KERNEL); 408062306a36Sopenharmony_ci if (!conf->mirrors) 408162306a36Sopenharmony_ci goto out; 408262306a36Sopenharmony_ci 408362306a36Sopenharmony_ci conf->tmppage = alloc_page(GFP_KERNEL); 408462306a36Sopenharmony_ci if (!conf->tmppage) 408562306a36Sopenharmony_ci goto out; 408662306a36Sopenharmony_ci 408762306a36Sopenharmony_ci conf->geo = geo; 408862306a36Sopenharmony_ci conf->copies = copies; 408962306a36Sopenharmony_ci err = mempool_init(&conf->r10bio_pool, 
NR_RAID_BIOS, r10bio_pool_alloc, 409062306a36Sopenharmony_ci rbio_pool_free, conf); 409162306a36Sopenharmony_ci if (err) 409262306a36Sopenharmony_ci goto out; 409362306a36Sopenharmony_ci 409462306a36Sopenharmony_ci err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 409562306a36Sopenharmony_ci if (err) 409662306a36Sopenharmony_ci goto out; 409762306a36Sopenharmony_ci 409862306a36Sopenharmony_ci calc_sectors(conf, mddev->dev_sectors); 409962306a36Sopenharmony_ci if (mddev->reshape_position == MaxSector) { 410062306a36Sopenharmony_ci conf->prev = conf->geo; 410162306a36Sopenharmony_ci conf->reshape_progress = MaxSector; 410262306a36Sopenharmony_ci } else { 410362306a36Sopenharmony_ci if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 410462306a36Sopenharmony_ci err = -EINVAL; 410562306a36Sopenharmony_ci goto out; 410662306a36Sopenharmony_ci } 410762306a36Sopenharmony_ci conf->reshape_progress = mddev->reshape_position; 410862306a36Sopenharmony_ci if (conf->prev.far_offset) 410962306a36Sopenharmony_ci conf->prev.stride = 1 << conf->prev.chunk_shift; 411062306a36Sopenharmony_ci else 411162306a36Sopenharmony_ci /* far_copies must be 1 */ 411262306a36Sopenharmony_ci conf->prev.stride = conf->dev_sectors; 411362306a36Sopenharmony_ci } 411462306a36Sopenharmony_ci conf->reshape_safe = conf->reshape_progress; 411562306a36Sopenharmony_ci spin_lock_init(&conf->device_lock); 411662306a36Sopenharmony_ci INIT_LIST_HEAD(&conf->retry_list); 411762306a36Sopenharmony_ci INIT_LIST_HEAD(&conf->bio_end_io_list); 411862306a36Sopenharmony_ci 411962306a36Sopenharmony_ci seqlock_init(&conf->resync_lock); 412062306a36Sopenharmony_ci init_waitqueue_head(&conf->wait_barrier); 412162306a36Sopenharmony_ci atomic_set(&conf->nr_pending, 0); 412262306a36Sopenharmony_ci 412362306a36Sopenharmony_ci err = -ENOMEM; 412462306a36Sopenharmony_ci rcu_assign_pointer(conf->thread, 412562306a36Sopenharmony_ci md_register_thread(raid10d, mddev, "raid10")); 412662306a36Sopenharmony_ci if (!conf->thread) 412762306a36Sopenharmony_ci goto out; 412862306a36Sopenharmony_ci 412962306a36Sopenharmony_ci conf->mddev = mddev; 413062306a36Sopenharmony_ci return conf; 413162306a36Sopenharmony_ci 413262306a36Sopenharmony_ci out: 413362306a36Sopenharmony_ci raid10_free_conf(conf); 413462306a36Sopenharmony_ci return ERR_PTR(err); 413562306a36Sopenharmony_ci} 413662306a36Sopenharmony_ci 413762306a36Sopenharmony_cistatic void raid10_set_io_opt(struct r10conf *conf) 413862306a36Sopenharmony_ci{ 413962306a36Sopenharmony_ci int raid_disks = conf->geo.raid_disks; 414062306a36Sopenharmony_ci 414162306a36Sopenharmony_ci if (!(conf->geo.raid_disks % conf->geo.near_copies)) 414262306a36Sopenharmony_ci raid_disks /= conf->geo.near_copies; 414362306a36Sopenharmony_ci blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * 414462306a36Sopenharmony_ci raid_disks); 414562306a36Sopenharmony_ci} 414662306a36Sopenharmony_ci 414762306a36Sopenharmony_cistatic int raid10_run(struct mddev *mddev) 414862306a36Sopenharmony_ci{ 414962306a36Sopenharmony_ci struct r10conf *conf; 415062306a36Sopenharmony_ci int i, disk_idx; 415162306a36Sopenharmony_ci struct raid10_info *disk; 415262306a36Sopenharmony_ci struct md_rdev *rdev; 415362306a36Sopenharmony_ci sector_t size; 415462306a36Sopenharmony_ci sector_t min_offset_diff = 0; 415562306a36Sopenharmony_ci int first = 1; 415662306a36Sopenharmony_ci 415762306a36Sopenharmony_ci if (mddev_init_writes_pending(mddev) < 0) 415862306a36Sopenharmony_ci return -ENOMEM; 415962306a36Sopenharmony_ci 
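	/*
	 * mddev->private may already point to a conf set up earlier, for
	 * instance by a takeover such as raid10_takeover_raid0() below;
	 * only build a fresh one when none exists yet.
	 */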
416062306a36Sopenharmony_ci if (mddev->private == NULL) { 416162306a36Sopenharmony_ci conf = setup_conf(mddev); 416262306a36Sopenharmony_ci if (IS_ERR(conf)) 416362306a36Sopenharmony_ci return PTR_ERR(conf); 416462306a36Sopenharmony_ci mddev->private = conf; 416562306a36Sopenharmony_ci } 416662306a36Sopenharmony_ci conf = mddev->private; 416762306a36Sopenharmony_ci if (!conf) 416862306a36Sopenharmony_ci goto out; 416962306a36Sopenharmony_ci 417062306a36Sopenharmony_ci rcu_assign_pointer(mddev->thread, conf->thread); 417162306a36Sopenharmony_ci rcu_assign_pointer(conf->thread, NULL); 417262306a36Sopenharmony_ci 417362306a36Sopenharmony_ci if (mddev_is_clustered(conf->mddev)) { 417462306a36Sopenharmony_ci int fc, fo; 417562306a36Sopenharmony_ci 417662306a36Sopenharmony_ci fc = (mddev->layout >> 8) & 255; 417762306a36Sopenharmony_ci fo = mddev->layout & (1<<16); 417862306a36Sopenharmony_ci if (fc > 1 || fo > 0) { 417962306a36Sopenharmony_ci pr_err("only near layout is supported by clustered" 418062306a36Sopenharmony_ci " raid10\n"); 418162306a36Sopenharmony_ci goto out_free_conf; 418262306a36Sopenharmony_ci } 418362306a36Sopenharmony_ci } 418462306a36Sopenharmony_ci 418562306a36Sopenharmony_ci if (mddev->queue) { 418662306a36Sopenharmony_ci blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 418762306a36Sopenharmony_ci blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); 418862306a36Sopenharmony_ci raid10_set_io_opt(conf); 418962306a36Sopenharmony_ci } 419062306a36Sopenharmony_ci 419162306a36Sopenharmony_ci rdev_for_each(rdev, mddev) { 419262306a36Sopenharmony_ci long long diff; 419362306a36Sopenharmony_ci 419462306a36Sopenharmony_ci disk_idx = rdev->raid_disk; 419562306a36Sopenharmony_ci if (disk_idx < 0) 419662306a36Sopenharmony_ci continue; 419762306a36Sopenharmony_ci if (disk_idx >= conf->geo.raid_disks && 419862306a36Sopenharmony_ci disk_idx >= conf->prev.raid_disks) 419962306a36Sopenharmony_ci continue; 420062306a36Sopenharmony_ci disk = conf->mirrors + disk_idx; 420162306a36Sopenharmony_ci 420262306a36Sopenharmony_ci if (test_bit(Replacement, &rdev->flags)) { 420362306a36Sopenharmony_ci if (disk->replacement) 420462306a36Sopenharmony_ci goto out_free_conf; 420562306a36Sopenharmony_ci disk->replacement = rdev; 420662306a36Sopenharmony_ci } else { 420762306a36Sopenharmony_ci if (disk->rdev) 420862306a36Sopenharmony_ci goto out_free_conf; 420962306a36Sopenharmony_ci disk->rdev = rdev; 421062306a36Sopenharmony_ci } 421162306a36Sopenharmony_ci diff = (rdev->new_data_offset - rdev->data_offset); 421262306a36Sopenharmony_ci if (!mddev->reshape_backwards) 421362306a36Sopenharmony_ci diff = -diff; 421462306a36Sopenharmony_ci if (diff < 0) 421562306a36Sopenharmony_ci diff = 0; 421662306a36Sopenharmony_ci if (first || diff < min_offset_diff) 421762306a36Sopenharmony_ci min_offset_diff = diff; 421862306a36Sopenharmony_ci 421962306a36Sopenharmony_ci if (mddev->gendisk) 422062306a36Sopenharmony_ci disk_stack_limits(mddev->gendisk, rdev->bdev, 422162306a36Sopenharmony_ci rdev->data_offset << 9); 422262306a36Sopenharmony_ci 422362306a36Sopenharmony_ci disk->head_position = 0; 422462306a36Sopenharmony_ci first = 0; 422562306a36Sopenharmony_ci } 422662306a36Sopenharmony_ci 422762306a36Sopenharmony_ci /* need to check that every block has at least one working mirror */ 422862306a36Sopenharmony_ci if (!enough(conf, -1)) { 422962306a36Sopenharmony_ci pr_err("md/raid10:%s: not enough operational mirrors.\n", 423062306a36Sopenharmony_ci mdname(mddev)); 423162306a36Sopenharmony_ci goto out_free_conf; 
423262306a36Sopenharmony_ci } 423362306a36Sopenharmony_ci 423462306a36Sopenharmony_ci if (conf->reshape_progress != MaxSector) { 423562306a36Sopenharmony_ci /* must ensure that shape change is supported */ 423662306a36Sopenharmony_ci if (conf->geo.far_copies != 1 && 423762306a36Sopenharmony_ci conf->geo.far_offset == 0) 423862306a36Sopenharmony_ci goto out_free_conf; 423962306a36Sopenharmony_ci if (conf->prev.far_copies != 1 && 424062306a36Sopenharmony_ci conf->prev.far_offset == 0) 424162306a36Sopenharmony_ci goto out_free_conf; 424262306a36Sopenharmony_ci } 424362306a36Sopenharmony_ci 424462306a36Sopenharmony_ci mddev->degraded = 0; 424562306a36Sopenharmony_ci for (i = 0; 424662306a36Sopenharmony_ci i < conf->geo.raid_disks 424762306a36Sopenharmony_ci || i < conf->prev.raid_disks; 424862306a36Sopenharmony_ci i++) { 424962306a36Sopenharmony_ci 425062306a36Sopenharmony_ci disk = conf->mirrors + i; 425162306a36Sopenharmony_ci 425262306a36Sopenharmony_ci if (!disk->rdev && disk->replacement) { 425362306a36Sopenharmony_ci /* The replacement is all we have - use it */ 425462306a36Sopenharmony_ci disk->rdev = disk->replacement; 425562306a36Sopenharmony_ci disk->replacement = NULL; 425662306a36Sopenharmony_ci clear_bit(Replacement, &disk->rdev->flags); 425762306a36Sopenharmony_ci } 425862306a36Sopenharmony_ci 425962306a36Sopenharmony_ci if (!disk->rdev || 426062306a36Sopenharmony_ci !test_bit(In_sync, &disk->rdev->flags)) { 426162306a36Sopenharmony_ci disk->head_position = 0; 426262306a36Sopenharmony_ci mddev->degraded++; 426362306a36Sopenharmony_ci if (disk->rdev && 426462306a36Sopenharmony_ci disk->rdev->saved_raid_disk < 0) 426562306a36Sopenharmony_ci conf->fullsync = 1; 426662306a36Sopenharmony_ci } 426762306a36Sopenharmony_ci 426862306a36Sopenharmony_ci if (disk->replacement && 426962306a36Sopenharmony_ci !test_bit(In_sync, &disk->replacement->flags) && 427062306a36Sopenharmony_ci disk->replacement->saved_raid_disk < 0) { 427162306a36Sopenharmony_ci conf->fullsync = 1; 427262306a36Sopenharmony_ci } 427362306a36Sopenharmony_ci 427462306a36Sopenharmony_ci disk->recovery_disabled = mddev->recovery_disabled - 1; 427562306a36Sopenharmony_ci } 427662306a36Sopenharmony_ci 427762306a36Sopenharmony_ci if (mddev->recovery_cp != MaxSector) 427862306a36Sopenharmony_ci pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", 427962306a36Sopenharmony_ci mdname(mddev)); 428062306a36Sopenharmony_ci pr_info("md/raid10:%s: active with %d out of %d devices\n", 428162306a36Sopenharmony_ci mdname(mddev), conf->geo.raid_disks - mddev->degraded, 428262306a36Sopenharmony_ci conf->geo.raid_disks); 428362306a36Sopenharmony_ci /* 428462306a36Sopenharmony_ci * Ok, everything is just fine now 428562306a36Sopenharmony_ci */ 428662306a36Sopenharmony_ci mddev->dev_sectors = conf->dev_sectors; 428762306a36Sopenharmony_ci size = raid10_size(mddev, 0, 0); 428862306a36Sopenharmony_ci md_set_array_sectors(mddev, size); 428962306a36Sopenharmony_ci mddev->resync_max_sectors = size; 429062306a36Sopenharmony_ci set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); 429162306a36Sopenharmony_ci 429262306a36Sopenharmony_ci if (md_integrity_register(mddev)) 429362306a36Sopenharmony_ci goto out_free_conf; 429462306a36Sopenharmony_ci 429562306a36Sopenharmony_ci if (conf->reshape_progress != MaxSector) { 429662306a36Sopenharmony_ci unsigned long before_length, after_length; 429762306a36Sopenharmony_ci 429862306a36Sopenharmony_ci before_length = ((1 << conf->prev.chunk_shift) * 429962306a36Sopenharmony_ci 
conf->prev.far_copies); 430062306a36Sopenharmony_ci after_length = ((1 << conf->geo.chunk_shift) * 430162306a36Sopenharmony_ci conf->geo.far_copies); 430262306a36Sopenharmony_ci 430362306a36Sopenharmony_ci if (max(before_length, after_length) > min_offset_diff) { 430462306a36Sopenharmony_ci /* This cannot work */ 430562306a36Sopenharmony_ci pr_warn("md/raid10: offset difference not enough to continue reshape\n"); 430662306a36Sopenharmony_ci goto out_free_conf; 430762306a36Sopenharmony_ci } 430862306a36Sopenharmony_ci conf->offset_diff = min_offset_diff; 430962306a36Sopenharmony_ci 431062306a36Sopenharmony_ci clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 431162306a36Sopenharmony_ci clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 431262306a36Sopenharmony_ci set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 431362306a36Sopenharmony_ci set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 431462306a36Sopenharmony_ci rcu_assign_pointer(mddev->sync_thread, 431562306a36Sopenharmony_ci md_register_thread(md_do_sync, mddev, "reshape")); 431662306a36Sopenharmony_ci if (!mddev->sync_thread) 431762306a36Sopenharmony_ci goto out_free_conf; 431862306a36Sopenharmony_ci } 431962306a36Sopenharmony_ci 432062306a36Sopenharmony_ci return 0; 432162306a36Sopenharmony_ci 432262306a36Sopenharmony_ciout_free_conf: 432362306a36Sopenharmony_ci md_unregister_thread(mddev, &mddev->thread); 432462306a36Sopenharmony_ci raid10_free_conf(conf); 432562306a36Sopenharmony_ci mddev->private = NULL; 432662306a36Sopenharmony_ciout: 432762306a36Sopenharmony_ci return -EIO; 432862306a36Sopenharmony_ci} 432962306a36Sopenharmony_ci 433062306a36Sopenharmony_cistatic void raid10_free(struct mddev *mddev, void *priv) 433162306a36Sopenharmony_ci{ 433262306a36Sopenharmony_ci raid10_free_conf(priv); 433362306a36Sopenharmony_ci} 433462306a36Sopenharmony_ci 433562306a36Sopenharmony_cistatic void raid10_quiesce(struct mddev *mddev, int quiesce) 433662306a36Sopenharmony_ci{ 433762306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 433862306a36Sopenharmony_ci 433962306a36Sopenharmony_ci if (quiesce) 434062306a36Sopenharmony_ci raise_barrier(conf, 0); 434162306a36Sopenharmony_ci else 434262306a36Sopenharmony_ci lower_barrier(conf); 434362306a36Sopenharmony_ci} 434462306a36Sopenharmony_ci 434562306a36Sopenharmony_cistatic int raid10_resize(struct mddev *mddev, sector_t sectors) 434662306a36Sopenharmony_ci{ 434762306a36Sopenharmony_ci /* Resize of 'far' arrays is not supported. 434862306a36Sopenharmony_ci * For 'near' and 'offset' arrays we can set the 434962306a36Sopenharmony_ci * number of sectors used to be an appropriate multiple 435062306a36Sopenharmony_ci * of the chunk size. 435162306a36Sopenharmony_ci * For 'offset', this is far_copies*chunksize. 435262306a36Sopenharmony_ci * For 'near' the multiplier is the LCM of 435362306a36Sopenharmony_ci * near_copies and raid_disks. 435462306a36Sopenharmony_ci * So if far_copies > 1 && !far_offset, fail. 435562306a36Sopenharmony_ci * Else find LCM(raid_disks, near_copy)*far_copies and 435662306a36Sopenharmony_ci * multiply by chunk_size. Then round to this number. 
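	 * (For example, raid_disks = 3 and near_copies = 2 with far_copies = 1
	 *  gives a multiple of lcm(3, 2) = 6 chunks.)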
435762306a36Sopenharmony_ci * This is mostly done by raid10_size() 435862306a36Sopenharmony_ci */ 435962306a36Sopenharmony_ci struct r10conf *conf = mddev->private; 436062306a36Sopenharmony_ci sector_t oldsize, size; 436162306a36Sopenharmony_ci 436262306a36Sopenharmony_ci if (mddev->reshape_position != MaxSector) 436362306a36Sopenharmony_ci return -EBUSY; 436462306a36Sopenharmony_ci 436562306a36Sopenharmony_ci if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 436662306a36Sopenharmony_ci return -EINVAL; 436762306a36Sopenharmony_ci 436862306a36Sopenharmony_ci oldsize = raid10_size(mddev, 0, 0); 436962306a36Sopenharmony_ci size = raid10_size(mddev, sectors, 0); 437062306a36Sopenharmony_ci if (mddev->external_size && 437162306a36Sopenharmony_ci mddev->array_sectors > size) 437262306a36Sopenharmony_ci return -EINVAL; 437362306a36Sopenharmony_ci if (mddev->bitmap) { 437462306a36Sopenharmony_ci int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); 437562306a36Sopenharmony_ci if (ret) 437662306a36Sopenharmony_ci return ret; 437762306a36Sopenharmony_ci } 437862306a36Sopenharmony_ci md_set_array_sectors(mddev, size); 437962306a36Sopenharmony_ci if (sectors > mddev->dev_sectors && 438062306a36Sopenharmony_ci mddev->recovery_cp > oldsize) { 438162306a36Sopenharmony_ci mddev->recovery_cp = oldsize; 438262306a36Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 438362306a36Sopenharmony_ci } 438462306a36Sopenharmony_ci calc_sectors(conf, sectors); 438562306a36Sopenharmony_ci mddev->dev_sectors = conf->dev_sectors; 438662306a36Sopenharmony_ci mddev->resync_max_sectors = size; 438762306a36Sopenharmony_ci return 0; 438862306a36Sopenharmony_ci} 438962306a36Sopenharmony_ci 439062306a36Sopenharmony_cistatic void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) 439162306a36Sopenharmony_ci{ 439262306a36Sopenharmony_ci struct md_rdev *rdev; 439362306a36Sopenharmony_ci struct r10conf *conf; 439462306a36Sopenharmony_ci 439562306a36Sopenharmony_ci if (mddev->degraded > 0) { 439662306a36Sopenharmony_ci pr_warn("md/raid10:%s: Error: degraded raid0!\n", 439762306a36Sopenharmony_ci mdname(mddev)); 439862306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 439962306a36Sopenharmony_ci } 440062306a36Sopenharmony_ci sector_div(size, devs); 440162306a36Sopenharmony_ci 440262306a36Sopenharmony_ci /* Set new parameters */ 440362306a36Sopenharmony_ci mddev->new_level = 10; 440462306a36Sopenharmony_ci /* new layout: far_copies = 1, near_copies = 2 */ 440562306a36Sopenharmony_ci mddev->new_layout = (1<<8) + 2; 440662306a36Sopenharmony_ci mddev->new_chunk_sectors = mddev->chunk_sectors; 440762306a36Sopenharmony_ci mddev->delta_disks = mddev->raid_disks; 440862306a36Sopenharmony_ci mddev->raid_disks *= 2; 440962306a36Sopenharmony_ci /* make sure it will be not marked as dirty */ 441062306a36Sopenharmony_ci mddev->recovery_cp = MaxSector; 441162306a36Sopenharmony_ci mddev->dev_sectors = size; 441262306a36Sopenharmony_ci 441362306a36Sopenharmony_ci conf = setup_conf(mddev); 441462306a36Sopenharmony_ci if (!IS_ERR(conf)) { 441562306a36Sopenharmony_ci rdev_for_each(rdev, mddev) 441662306a36Sopenharmony_ci if (rdev->raid_disk >= 0) { 441762306a36Sopenharmony_ci rdev->new_raid_disk = rdev->raid_disk * 2; 441862306a36Sopenharmony_ci rdev->sectors = size; 441962306a36Sopenharmony_ci } 442062306a36Sopenharmony_ci } 442162306a36Sopenharmony_ci 442262306a36Sopenharmony_ci return conf; 442362306a36Sopenharmony_ci} 442462306a36Sopenharmony_ci 442562306a36Sopenharmony_cistatic void *raid10_takeover(struct mddev 
*mddev) 442662306a36Sopenharmony_ci{ 442762306a36Sopenharmony_ci struct r0conf *raid0_conf; 442862306a36Sopenharmony_ci 442962306a36Sopenharmony_ci /* raid10 can take over: 443062306a36Sopenharmony_ci * raid0 - providing it has only two drives 443162306a36Sopenharmony_ci */ 443262306a36Sopenharmony_ci if (mddev->level == 0) { 443362306a36Sopenharmony_ci /* for raid0 takeover only one zone is supported */ 443462306a36Sopenharmony_ci raid0_conf = mddev->private; 443562306a36Sopenharmony_ci if (raid0_conf->nr_strip_zones > 1) { 443662306a36Sopenharmony_ci pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n", 443762306a36Sopenharmony_ci mdname(mddev)); 443862306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 443962306a36Sopenharmony_ci } 444062306a36Sopenharmony_ci return raid10_takeover_raid0(mddev, 444162306a36Sopenharmony_ci raid0_conf->strip_zone->zone_end, 444262306a36Sopenharmony_ci raid0_conf->strip_zone->nb_dev); 444362306a36Sopenharmony_ci } 444462306a36Sopenharmony_ci return ERR_PTR(-EINVAL); 444562306a36Sopenharmony_ci} 444662306a36Sopenharmony_ci 444762306a36Sopenharmony_cistatic int raid10_check_reshape(struct mddev *mddev) 444862306a36Sopenharmony_ci{ 444962306a36Sopenharmony_ci /* Called when there is a request to change 445062306a36Sopenharmony_ci * - layout (to ->new_layout) 445162306a36Sopenharmony_ci * - chunk size (to ->new_chunk_sectors) 445262306a36Sopenharmony_ci * - raid_disks (by delta_disks) 445362306a36Sopenharmony_ci * or when trying to restart a reshape that was ongoing. 445462306a36Sopenharmony_ci * 445562306a36Sopenharmony_ci * We need to validate the request and possibly allocate 445662306a36Sopenharmony_ci * space if that might be an issue later. 445762306a36Sopenharmony_ci * 445862306a36Sopenharmony_ci * Currently we reject any reshape of a 'far' mode array, 445962306a36Sopenharmony_ci * allow chunk size to change if new is generally acceptable, 446062306a36Sopenharmony_ci * allow raid_disks to increase, and allow 446162306a36Sopenharmony_ci * a switch between 'near' mode and 'offset' mode. 
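	 * (For example, a 2-copy 'near' array, layout 0x102, may be reshaped
	 *  to the 2-copy 'offset' layout 0x10201, but not to the plain 'far'
	 *  layout 0x201, which the checks below reject.)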
	 */
	struct r10conf *conf = mddev->private;
	struct geom geo;

	if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
		return -EINVAL;

	if (setup_geo(&geo, mddev, geo_start) != conf->copies)
		/* mustn't change number of copies */
		return -EINVAL;
	if (geo.far_copies > 1 && !geo.far_offset)
		/* Cannot switch to 'far' mode */
		return -EINVAL;

	if (mddev->array_sectors & geo.chunk_mask)
		/* not factor of array size */
		return -EINVAL;

	if (!enough(conf, -1))
		return -EINVAL;

	kfree(conf->mirrors_new);
	conf->mirrors_new = NULL;
	if (mddev->delta_disks > 0) {
		/* allocate new 'mirrors' list */
		conf->mirrors_new =
			kcalloc(mddev->raid_disks + mddev->delta_disks,
				sizeof(struct raid10_info),
				GFP_KERNEL);
		if (!conf->mirrors_new)
			return -ENOMEM;
	}
	return 0;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be in_sync in the section most affected by failed devices.
 */
static int calc_degraded(struct r10conf *conf)
{
	int degraded, degraded2;
	int i;

	rcu_read_lock();
	degraded = 0;
	/* 'prev' section first */
	for (i = 0; i < conf->prev.raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (!test_bit(In_sync, &rdev->flags))
			/* When we can reduce the number of devices in
			 * an array, this might not contribute to
			 * 'degraded'.  It does now.
			 */
			degraded++;
	}
	rcu_read_unlock();
	if (conf->geo.raid_disks == conf->prev.raid_disks)
		return degraded;
	rcu_read_lock();
	degraded2 = 0;
	for (i = 0; i < conf->geo.raid_disks; i++) {
		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded2++;
		else if (!test_bit(In_sync, &rdev->flags)) {
			/* If reshape is increasing the number of devices,
			 * this section has already been recovered, so
			 * it doesn't contribute to degraded.
			 * else it does.
			 */
			if (conf->geo.raid_disks <= conf->prev.raid_disks)
				degraded2++;
		}
	}
	rcu_read_unlock();
	if (degraded2 > degraded)
		return degraded2;
	return degraded;
}
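/*
 * Illustration (added; the numbers are hypothetical, not from the original
 * source): when growing a 2-disk near-2 array to 4 disks, a device in slot 1
 * that is still recovering counts in the 'prev' pass but not in the 'geo'
 * pass (geo.raid_disks > prev.raid_disks), while the two added disks -
 * assumed present and In_sync - count in neither, so the larger of the two
 * counts, here 1, is returned.
 */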
static int raid10_start_reshape(struct mddev *mddev)
{
	/* A 'reshape' has been requested. This commits
	 * the various 'new' fields and sets MD_RECOVERY_RESHAPE.
	 * This also checks if there are enough spares and adds them
	 * to the array.
	 * We currently require enough spares to make the final
	 * array non-degraded.  We also require that the difference
	 * between old and new data_offset - on each device - is
	 * enough that we never risk over-writing.
	 */

	unsigned long before_length, after_length;
	sector_t min_offset_diff = 0;
	int first = 1;
	struct geom new;
	struct r10conf *conf = mddev->private;
	struct md_rdev *rdev;
	int spares = 0;
	int ret;

	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return -EBUSY;

	if (setup_geo(&new, mddev, geo_start) != conf->copies)
		return -EINVAL;

	before_length = ((1 << conf->prev.chunk_shift) *
			 conf->prev.far_copies);
	after_length = ((1 << conf->geo.chunk_shift) *
			conf->geo.far_copies);

	rdev_for_each(rdev, mddev) {
		if (!test_bit(In_sync, &rdev->flags)
		    && !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0) {
			long long diff = (rdev->new_data_offset
					  - rdev->data_offset);
			if (!mddev->reshape_backwards)
				diff = -diff;
			if (diff < 0)
				diff = 0;
			if (first || diff < min_offset_diff)
				min_offset_diff = diff;
			first = 0;
		}
	}

	if (max(before_length, after_length) > min_offset_diff)
		return -EINVAL;

	if (spares < mddev->delta_disks)
		return -EINVAL;

	conf->offset_diff = min_offset_diff;
	spin_lock_irq(&conf->device_lock);
	if (conf->mirrors_new) {
		memcpy(conf->mirrors_new, conf->mirrors,
		       sizeof(struct raid10_info)*conf->prev.raid_disks);
		smp_mb();
		kfree(conf->mirrors_old);
		conf->mirrors_old = conf->mirrors;
		conf->mirrors = conf->mirrors_new;
		conf->mirrors_new = NULL;
	}
	setup_geo(&conf->geo, mddev, geo_start);
	smp_mb();
	if (mddev->reshape_backwards) {
		sector_t size = raid10_size(mddev, 0, 0);
		if (size < mddev->array_sectors) {
			spin_unlock_irq(&conf->device_lock);
			pr_warn("md/raid10:%s: array size must be reduced before the number of disks\n",
				mdname(mddev));
			return -EINVAL;
		}
		mddev->resync_max_sectors = size;
		conf->reshape_progress = size;
	} else
		conf->reshape_progress = 0;
	conf->reshape_safe = conf->reshape_progress;
	spin_unlock_irq(&conf->device_lock);

	if (mddev->delta_disks && mddev->bitmap) {
		struct mdp_superblock_1 *sb = NULL;
		sector_t oldsize, newsize;

		oldsize = raid10_size(mddev, 0, 0);
		newsize = raid10_size(mddev, 0, conf->geo.raid_disks);

		if (!mddev_is_clustered(mddev)) {
			ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
			if (ret)
				goto abort;
			else
				goto out;
		}

		rdev_for_each(rdev, mddev) {
			if (rdev->raid_disk > -1 &&
			    !test_bit(Faulty, &rdev->flags))
				sb = page_address(rdev->sb_page);
		}

		/*
		 * Some node is already performing the reshape; there is no
		 * need to call md_bitmap_resize() again, since it will be
		 * called when the BITMAP_RESIZE msg is received.
		 */
		if ((sb && (le32_to_cpu(sb->feature_map) &
			    MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
			goto out;

		ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
		if (ret)
			goto abort;

		ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
		if (ret) {
			md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
			goto abort;
		}
	}
out:
	if (mddev->delta_disks > 0) {
		rdev_for_each(rdev, mddev)
			if (rdev->raid_disk < 0 &&
			    !test_bit(Faulty, &rdev->flags)) {
				if (raid10_add_disk(mddev, rdev) == 0) {
					if (rdev->raid_disk >=
					    conf->prev.raid_disks)
						set_bit(In_sync, &rdev->flags);
					else
						rdev->recovery_offset = 0;

					/* Failure here is OK */
					sysfs_link_rdev(mddev, rdev);
				}
			} else if (rdev->raid_disk >= conf->prev.raid_disks
				   && !test_bit(Faulty, &rdev->flags)) {
				/* This is a spare that was manually added */
				set_bit(In_sync, &rdev->flags);
			}
	}
	/* When a reshape changes the number of devices,
	 * ->degraded is measured against the larger of the
	 * pre and post numbers.
	 */
	spin_lock_irq(&conf->device_lock);
	mddev->degraded = calc_degraded(conf);
	spin_unlock_irq(&conf->device_lock);
	mddev->raid_disks = conf->geo.raid_disks;
	mddev->reshape_position = conf->reshape_progress;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);

	rcu_assign_pointer(mddev->sync_thread,
			   md_register_thread(md_do_sync, mddev, "reshape"));
	if (!mddev->sync_thread) {
		ret = -EAGAIN;
		goto abort;
	}
	conf->reshape_checkpoint = jiffies;
	md_wakeup_thread(mddev->sync_thread);
	md_new_event();
	return 0;

abort:
	mddev->recovery = 0;
	spin_lock_irq(&conf->device_lock);
	conf->geo = conf->prev;
	mddev->raid_disks = conf->geo.raid_disks;
	rdev_for_each(rdev, mddev)
		rdev->new_data_offset = rdev->data_offset;
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	mddev->reshape_position = MaxSector;
	spin_unlock_irq(&conf->device_lock);
	return ret;
}

/* Calculate the last device-address that could contain
 * any block from the chunk that includes the array-address 's'
 * and report the next address.
 * i.e. the address returned will be chunk-aligned and after
 * any data that is in the chunk containing 's'.
 */
static sector_t last_dev_address(sector_t s, struct geom *geo)
{
	s = (s | geo->chunk_mask) + 1;
	s >>= geo->chunk_shift;
	s *= geo->near_copies;
	s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
	s *= geo->far_copies;
	s <<= geo->chunk_shift;
	return s;
}
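/*
 * Worked example (added; illustrative numbers only): with 32KiB chunks
 * (chunk_shift = 6, chunk_mask = 63), raid_disks = 4, near_copies = 2,
 * far_copies = 1 and s = 200, the steps above give
 * (200 | 63) + 1 = 256 sectors -> 4 chunks -> 8 near copies ->
 * DIV_ROUND_UP(8, 4) = 2 chunks per device -> 2 << 6 = 128, i.e. the chunk
 * holding array sector 200 ends, on every device, before device offset 128.
 * first_dev_address() below is the mirror image: it rounds down to the
 * first device offset that such a chunk could touch.
 */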
/* Calculate the first device-address that could contain
 * any block from the chunk that includes the array-address 's'.
 * This too will be the start of a chunk.
 */
static sector_t first_dev_address(sector_t s, struct geom *geo)
{
	s >>= geo->chunk_shift;
	s *= geo->near_copies;
	sector_div(s, geo->raid_disks);
	s *= geo->far_copies;
	s <<= geo->chunk_shift;
	return s;
}

static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
				int *skipped)
{
	/* We simply copy at most one chunk (smallest of old and new)
	 * at a time, possibly less if that exceeds RESYNC_PAGES,
	 * or we hit a bad block or something.
	 * This might mean we pause for normal IO in the middle of
	 * a chunk, but that is not a problem as mddev->reshape_position
	 * can record any location.
	 *
	 * If we will want to write to a location that isn't
	 * yet recorded as 'safe' (i.e. in metadata on disk) then
	 * we need to flush all reshape requests and update the metadata.
	 *
	 * When reshaping forwards (e.g. to more devices), we interpret
	 * 'safe' as the earliest block which might not have been copied
	 * down yet.  We divide this by previous stripe size and multiply
	 * by previous stripe length to get lowest device offset that we
	 * cannot write to yet.
	 * We interpret 'sector_nr' as an address that we want to write to.
	 * From this we use last_dev_address() to find where we might
	 * write to, and first_dev_address() on the 'safe' position.
	 * If this 'next' write position is after the 'safe' position,
	 * we must update the metadata to increase the 'safe' position.
	 *
	 * When reshaping backwards, we round in the opposite direction
	 * and perform the reverse test: the next write position must not
	 * be less than the current safe position.
	 *
	 * In all this the minimum difference in data offsets
	 * (conf->offset_diff - always positive) allows a bit of slack,
	 * so next can be after 'safe', but not by more than offset_diff.
	 *
	 * We need to prepare all the bios here before we start any IO
	 * to ensure the size we choose is acceptable to all devices.
	 * That means one for each copy for write-out and an extra one for
	 * read-in.
	 * We store the read-in bio in ->master_bio and the others in
	 * ->devs[x].bio and ->devs[x].repl_bio.
	 */
	struct r10conf *conf = mddev->private;
	struct r10bio *r10_bio;
	sector_t next, safe, last;
	int max_sectors;
	int nr_sectors;
	int s;
	struct md_rdev *rdev;
	int need_flush = 0;
	struct bio *blist;
	struct bio *bio, *read_bio;
	int sectors_done = 0;
	struct page **pages;

	if (sector_nr == 0) {
		/* If restarting in the middle, skip the initial sectors */
		if (mddev->reshape_backwards &&
		    conf->reshape_progress < raid10_size(mddev, 0, 0)) {
			sector_nr = (raid10_size(mddev, 0, 0)
				     - conf->reshape_progress);
		} else if (!mddev->reshape_backwards &&
			   conf->reshape_progress > 0)
			sector_nr = conf->reshape_progress;
		if (sector_nr) {
			mddev->curr_resync_completed = sector_nr;
			sysfs_notify_dirent_safe(mddev->sysfs_completed);
			*skipped = 1;
			return sector_nr;
		}
	}

	/* We don't use sector_nr to track where we are up to
	 * as that doesn't work well for ->reshape_backwards.
	 * So just use ->reshape_progress.
	 */
	if (mddev->reshape_backwards) {
		/* 'next' is the earliest device address that we might
		 * write to for this chunk in the new layout
		 */
		next = first_dev_address(conf->reshape_progress - 1,
					 &conf->geo);

		/* 'safe' is the last device address that we might read from
		 * in the old layout after a restart
		 */
		safe = last_dev_address(conf->reshape_safe - 1,
					&conf->prev);

		if (next + conf->offset_diff < safe)
			need_flush = 1;

		last = conf->reshape_progress - 1;
		sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
					       & conf->prev.chunk_mask);
		if (sector_nr + RESYNC_SECTORS < last)
			sector_nr = last + 1 - RESYNC_SECTORS;
	} else {
		/* 'next' is after the last device address that we
		 * might write to for this chunk in the new layout
		 */
		next = last_dev_address(conf->reshape_progress, &conf->geo);

		/* 'safe' is the earliest device address that we might
		 * read from in the old layout after a restart
		 */
		safe = first_dev_address(conf->reshape_safe, &conf->prev);

		/* Need to update metadata if 'next' might be beyond 'safe'
		 * as that would possibly corrupt data
		 */
		if (next > safe + conf->offset_diff)
			need_flush = 1;

		sector_nr = conf->reshape_progress;
		last = sector_nr | (conf->geo.chunk_mask
				    & conf->prev.chunk_mask);

		if (sector_nr + RESYNC_SECTORS <= last)
			last = sector_nr + RESYNC_SECTORS - 1;
	}

	if (need_flush ||
	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
		/* Need to update reshape_position in metadata */
		wait_barrier(conf, false);
		mddev->reshape_position = conf->reshape_progress;
		if (mddev->reshape_backwards)
			mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
				- conf->reshape_progress;
		else
			mddev->curr_resync_completed = conf->reshape_progress;
		conf->reshape_checkpoint = jiffies;
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		md_wakeup_thread(mddev->thread);
		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			allow_barrier(conf);
			return sectors_done;
		}
		conf->reshape_safe = mddev->reshape_position;
		allow_barrier(conf);
	}

	raise_barrier(conf, 0);
read_more:
	/* Now schedule reads for blocks from sector_nr to last */
	r10_bio = raid10_alloc_init_r10buf(conf);
	r10_bio->state = 0;
	raise_barrier(conf, 1);
	atomic_set(&r10_bio->remaining, 0);
	r10_bio->mddev = mddev;
	r10_bio->sector = sector_nr;
	set_bit(R10BIO_IsReshape, &r10_bio->state);
	r10_bio->sectors = last - sector_nr + 1;
	rdev = read_balance(conf, r10_bio, &max_sectors);
	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));

	if (!rdev) {
		/* Cannot read from here, so need to record bad blocks
		 * on all the target devices.
		 */
		// FIXME
		mempool_free(r10_bio, &conf->r10buf_pool);
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return sectors_done;
	}

	read_bio = bio_alloc_bioset(rdev->bdev, RESYNC_PAGES, REQ_OP_READ,
				    GFP_KERNEL, &mddev->bio_set);
	read_bio->bi_iter.bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
				       + rdev->data_offset);
	read_bio->bi_private = r10_bio;
	read_bio->bi_end_io = end_reshape_read;
	r10_bio->master_bio = read_bio;
	r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;

	/*
	 * Broadcast the RESYNC message to other nodes so that no node
	 * writes to this region and causes a conflict.
	 */
	if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) {
		struct mdp_superblock_1 *sb = NULL;
		int sb_reshape_pos = 0;

		conf->cluster_sync_low = sector_nr;
		conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS;
		sb = page_address(rdev->sb_page);
		if (sb) {
			sb_reshape_pos = le64_to_cpu(sb->reshape_position);
			/*
			 * Set cluster_sync_low again if the next address for
			 * the array reshape is less than cluster_sync_low,
			 * since we can't update cluster_sync_low until the
			 * reshape has finished.
			 */
			if (sb_reshape_pos < conf->cluster_sync_low)
				conf->cluster_sync_low = sb_reshape_pos;
		}

		md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low,
						   conf->cluster_sync_high);
	}

	/* Now find the locations in the new layout */
	__raid10_find_phys(&conf->geo, r10_bio);

	blist = read_bio;
	read_bio->bi_next = NULL;

	rcu_read_lock();
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev2;
		if (s&1) {
			rdev2 = rcu_dereference(conf->mirrors[d].replacement);
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev2 = rcu_dereference(conf->mirrors[d].rdev);
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev2 || test_bit(Faulty, &rdev2->flags))
			continue;

		bio_set_dev(b, rdev2->bdev);
		b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
			rdev2->new_data_offset;
		b->bi_end_io = end_reshape_write;
		b->bi_opf = REQ_OP_WRITE;
		b->bi_next = blist;
		blist = b;
	}
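	/*
	 * Note (added comment): 'blist' is now a singly linked chain (via
	 * bi_next) of the read bio plus one write bio per usable copy, so
	 * the loop below can attach the same resync pages to every bio in
	 * a single pass.
	 */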
	/* Now add as many pages as possible to all of these bios.
	 */

	nr_sectors = 0;
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
	for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
		struct page *page = pages[s / (PAGE_SIZE >> 9)];
		int len = (max_sectors - s) << 9;
		if (len > PAGE_SIZE)
			len = PAGE_SIZE;
		for (bio = blist; bio ; bio = bio->bi_next) {
			if (WARN_ON(!bio_add_page(bio, page, len, 0))) {
				bio->bi_status = BLK_STS_RESOURCE;
				bio_endio(bio);
				return sectors_done;
			}
		}
		sector_nr += len >> 9;
		nr_sectors += len >> 9;
	}
	rcu_read_unlock();
	r10_bio->sectors = nr_sectors;

	/* Now submit the read */
	md_sync_acct_bio(read_bio, r10_bio->sectors);
	atomic_inc(&r10_bio->remaining);
	read_bio->bi_next = NULL;
	submit_bio_noacct(read_bio);
	sectors_done += nr_sectors;
	if (sector_nr <= last)
		goto read_more;

	lower_barrier(conf);

	/* Now that we have done the whole section we can
	 * update reshape_progress
	 */
	if (mddev->reshape_backwards)
		conf->reshape_progress -= sectors_done;
	else
		conf->reshape_progress += sectors_done;

	return sectors_done;
}

static void end_reshape_request(struct r10bio *r10_bio);
static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio);
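/*
 * Added note: reshape_request() above reads one window of the array into
 * the shared resync pages; reshape_request_write() below then submits the
 * corresponding writes in the new layout.  r10_bio->remaining starts at 1
 * and is incremented for each submitted write, so the trailing
 * end_reshape_request() call simply drops that initial reference once all
 * writes are in flight.
 */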
static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
	/* Reshape read completed.  Hopefully we have a block
	 * to write out.
	 * If we got a read error then we do sync 1-page reads from
	 * elsewhere until we find the data - or give up.
	 */
	struct r10conf *conf = mddev->private;
	int s;

	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
		if (handle_reshape_read_error(mddev, r10_bio) < 0) {
			/* Reshape has been aborted */
			md_done_sync(mddev, r10_bio->sectors, 0);
			return;
		}

	/* We definitely have the data in the pages, schedule the
	 * writes.
	 */
	atomic_set(&r10_bio->remaining, 1);
	for (s = 0; s < conf->copies*2; s++) {
		struct bio *b;
		int d = r10_bio->devs[s/2].devnum;
		struct md_rdev *rdev;
		rcu_read_lock();
		if (s&1) {
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			b = r10_bio->devs[s/2].repl_bio;
		} else {
			rdev = rcu_dereference(conf->mirrors[d].rdev);
			b = r10_bio->devs[s/2].bio;
		}
		if (!rdev || test_bit(Faulty, &rdev->flags)) {
			rcu_read_unlock();
			continue;
		}
		atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();
		md_sync_acct_bio(b, r10_bio->sectors);
		atomic_inc(&r10_bio->remaining);
		b->bi_next = NULL;
		submit_bio_noacct(b);
	}
	end_reshape_request(r10_bio);
}

static void end_reshape(struct r10conf *conf)
{
	if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
		return;

	spin_lock_irq(&conf->device_lock);
	conf->prev = conf->geo;
	md_finish_reshape(conf->mddev);
	smp_wmb();
	conf->reshape_progress = MaxSector;
	conf->reshape_safe = MaxSector;
	spin_unlock_irq(&conf->device_lock);

	if (conf->mddev->queue)
		raid10_set_io_opt(conf);
	conf->fullsync = 0;
}

static void raid10_update_reshape_pos(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;
	sector_t lo, hi;

	md_cluster_ops->resync_info_get(mddev, &lo, &hi);
	if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo))
	    || mddev->reshape_position == MaxSector)
		conf->reshape_progress = mddev->reshape_position;
	else
		WARN_ON_ONCE(1);
}

static int handle_reshape_read_error(struct mddev *mddev,
				     struct r10bio *r10_bio)
{
	/* Use sync reads to get the blocks from somewhere else */
	int sectors = r10_bio->sectors;
	struct r10conf *conf = mddev->private;
	struct r10bio *r10b;
	int slot = 0;
	int idx = 0;
	struct page **pages;

	r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO);
	if (!r10b) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return -ENOMEM;
	}

	/* reshape IOs share pages from .devs[0].bio */
	pages = get_resync_pages(r10_bio->devs[0].bio)->pages;

	r10b->sector = r10_bio->sector;
	__raid10_find_phys(&conf->prev, r10b);

	while (sectors) {
		int s = sectors;
		int success = 0;
		int first_slot = slot;

		if (s > (PAGE_SIZE >> 9))
			s = PAGE_SIZE >> 9;

		rcu_read_lock();
		while (!success) {
			int d = r10b->devs[slot].devnum;
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			sector_t addr;
			if (rdev == NULL ||
			    test_bit(Faulty, &rdev->flags) ||
			    !test_bit(In_sync, &rdev->flags))
				goto failed;

			addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			success = sync_page_io(rdev,
					       addr,
					       s << 9,
					       pages[idx],
					       REQ_OP_READ, false);
			rdev_dec_pending(rdev, mddev);
			rcu_read_lock();
			if (success)
				break;
		failed:
			slot++;
			if (slot >= conf->copies)
				slot = 0;
			if (slot == first_slot)
				break;
		}
		rcu_read_unlock();
		if (!success) {
			/* couldn't read this block, must give up */
			set_bit(MD_RECOVERY_INTR,
				&mddev->recovery);
			kfree(r10b);
			return -EIO;
		}
		sectors -= s;
		idx++;
	}
	kfree(r10b);
	return 0;
}
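/*
 * Added note: the recovery path above retries each page-sized block against
 * every copy in the *previous* geometry (hence __raid10_find_phys() with
 * &conf->prev); the slot index is carried over between pages so the device
 * that last succeeded is tried first, and only after a full cycle of copies
 * fails is the reshape aborted.
 */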
static void end_reshape_write(struct bio *bio)
{
	struct r10bio *r10_bio = get_resync_r10bio(bio);
	struct mddev *mddev = r10_bio->mddev;
	struct r10conf *conf = mddev->private;
	int d;
	int slot;
	int repl;
	struct md_rdev *rdev = NULL;

	d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
	if (repl)
		rdev = conf->mirrors[d].replacement;
	if (!rdev) {
		smp_mb();
		rdev = conf->mirrors[d].rdev;
	}

	if (bio->bi_status) {
		/* FIXME should record badblock */
		md_error(mddev, rdev);
	}

	rdev_dec_pending(rdev, mddev);
	end_reshape_request(r10_bio);
}

static void end_reshape_request(struct r10bio *r10_bio)
{
	if (!atomic_dec_and_test(&r10_bio->remaining))
		return;
	md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
	bio_put(r10_bio->master_bio);
	put_buf(r10_bio);
}

static void raid10_finish_reshape(struct mddev *mddev)
{
	struct r10conf *conf = mddev->private;

	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
		return;

	if (mddev->delta_disks > 0) {
		if (mddev->recovery_cp > mddev->resync_max_sectors) {
			mddev->recovery_cp = mddev->resync_max_sectors;
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		}
		mddev->resync_max_sectors = mddev->array_sectors;
	} else {
		int d;
		rcu_read_lock();
		for (d = conf->geo.raid_disks ;
		     d < conf->geo.raid_disks - mddev->delta_disks;
		     d++) {
			struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
			rdev = rcu_dereference(conf->mirrors[d].replacement);
			if (rdev)
				clear_bit(In_sync, &rdev->flags);
		}
		rcu_read_unlock();
	}
	mddev->layout = mddev->new_layout;
	mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
	mddev->reshape_position = MaxSector;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;
}

static struct md_personality raid10_personality =
{
	.name		= "raid10",
	.level		= 10,
	.owner		= THIS_MODULE,
	.make_request	= raid10_make_request,
	.run		= raid10_run,
	.free		= raid10_free,
	.status		= raid10_status,
	.error_handler	= raid10_error,
	.hot_add_disk	= raid10_add_disk,
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= raid10_sync_request,
	.quiesce	= raid10_quiesce,
	.size		= raid10_size,
	.resize		= raid10_resize,
	.takeover	= raid10_takeover,
	.check_reshape	= raid10_check_reshape,
	.start_reshape	= raid10_start_reshape,
	.finish_reshape	= raid10_finish_reshape,
	.update_reshape_pos = raid10_update_reshape_pos,
};

static int __init raid_init(void)
{
	return register_md_personality(&raid10_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid10_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
MODULE_ALIAS("md-personality-9"); /* RAID10 */
MODULE_ALIAS("md-raid10");
MODULE_ALIAS("md-level-10");