// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * raid1.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
 *
 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 *
 * RAID-1 management functions.
 *
 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
 *
 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
 *
 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
 * bitmapped intelligence in resync:
 *
 *     - bitmap marked during normal i/o
 *     - bitmap used to skip nondirty blocks during sync
 *
 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
 * - persistent bitmap code
 */

#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/interval_tree_generic.h>

#include <trace/events/block.h>

#include "md.h"
#include "raid1.h"
388c2ecf20Sopenharmony_ci#include "md-bitmap.h" 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_ci#define UNSUPPORTED_MDDEV_FLAGS \ 418c2ecf20Sopenharmony_ci ((1L << MD_HAS_JOURNAL) | \ 428c2ecf20Sopenharmony_ci (1L << MD_JOURNAL_CLEAN) | \ 438c2ecf20Sopenharmony_ci (1L << MD_HAS_PPL) | \ 448c2ecf20Sopenharmony_ci (1L << MD_HAS_MULTIPLE_PPLS)) 458c2ecf20Sopenharmony_ci 468c2ecf20Sopenharmony_cistatic void allow_barrier(struct r1conf *conf, sector_t sector_nr); 478c2ecf20Sopenharmony_cistatic void lower_barrier(struct r1conf *conf, sector_t sector_nr); 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_ci#define raid1_log(md, fmt, args...) \ 508c2ecf20Sopenharmony_ci do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) 518c2ecf20Sopenharmony_ci 528c2ecf20Sopenharmony_ci#include "raid1-10.c" 538c2ecf20Sopenharmony_ci 548c2ecf20Sopenharmony_ci#define START(node) ((node)->start) 558c2ecf20Sopenharmony_ci#define LAST(node) ((node)->last) 568c2ecf20Sopenharmony_ciINTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, 578c2ecf20Sopenharmony_ci START, LAST, static inline, raid1_rb); 588c2ecf20Sopenharmony_ci 598c2ecf20Sopenharmony_cistatic int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, 608c2ecf20Sopenharmony_ci struct serial_info *si, int idx) 618c2ecf20Sopenharmony_ci{ 628c2ecf20Sopenharmony_ci unsigned long flags; 638c2ecf20Sopenharmony_ci int ret = 0; 648c2ecf20Sopenharmony_ci sector_t lo = r1_bio->sector; 658c2ecf20Sopenharmony_ci sector_t hi = lo + r1_bio->sectors; 668c2ecf20Sopenharmony_ci struct serial_in_rdev *serial = &rdev->serial[idx]; 678c2ecf20Sopenharmony_ci 688c2ecf20Sopenharmony_ci spin_lock_irqsave(&serial->serial_lock, flags); 698c2ecf20Sopenharmony_ci /* collision happened */ 708c2ecf20Sopenharmony_ci if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) 718c2ecf20Sopenharmony_ci ret = -EBUSY; 728c2ecf20Sopenharmony_ci else { 738c2ecf20Sopenharmony_ci si->start = lo; 
748c2ecf20Sopenharmony_ci si->last = hi; 758c2ecf20Sopenharmony_ci raid1_rb_insert(si, &serial->serial_rb); 768c2ecf20Sopenharmony_ci } 778c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&serial->serial_lock, flags); 788c2ecf20Sopenharmony_ci 798c2ecf20Sopenharmony_ci return ret; 808c2ecf20Sopenharmony_ci} 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_cistatic void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) 838c2ecf20Sopenharmony_ci{ 848c2ecf20Sopenharmony_ci struct mddev *mddev = rdev->mddev; 858c2ecf20Sopenharmony_ci struct serial_info *si; 868c2ecf20Sopenharmony_ci int idx = sector_to_idx(r1_bio->sector); 878c2ecf20Sopenharmony_ci struct serial_in_rdev *serial = &rdev->serial[idx]; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci if (WARN_ON(!mddev->serial_info_pool)) 908c2ecf20Sopenharmony_ci return; 918c2ecf20Sopenharmony_ci si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); 928c2ecf20Sopenharmony_ci wait_event(serial->serial_io_wait, 938c2ecf20Sopenharmony_ci check_and_add_serial(rdev, r1_bio, si, idx) == 0); 948c2ecf20Sopenharmony_ci} 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_cistatic void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) 978c2ecf20Sopenharmony_ci{ 988c2ecf20Sopenharmony_ci struct serial_info *si; 998c2ecf20Sopenharmony_ci unsigned long flags; 1008c2ecf20Sopenharmony_ci int found = 0; 1018c2ecf20Sopenharmony_ci struct mddev *mddev = rdev->mddev; 1028c2ecf20Sopenharmony_ci int idx = sector_to_idx(lo); 1038c2ecf20Sopenharmony_ci struct serial_in_rdev *serial = &rdev->serial[idx]; 1048c2ecf20Sopenharmony_ci 1058c2ecf20Sopenharmony_ci spin_lock_irqsave(&serial->serial_lock, flags); 1068c2ecf20Sopenharmony_ci for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); 1078c2ecf20Sopenharmony_ci si; si = raid1_rb_iter_next(si, lo, hi)) { 1088c2ecf20Sopenharmony_ci if (si->start == lo && si->last == hi) { 1098c2ecf20Sopenharmony_ci raid1_rb_remove(si, &serial->serial_rb); 1108c2ecf20Sopenharmony_ci 
mempool_free(si, mddev->serial_info_pool); 1118c2ecf20Sopenharmony_ci found = 1; 1128c2ecf20Sopenharmony_ci break; 1138c2ecf20Sopenharmony_ci } 1148c2ecf20Sopenharmony_ci } 1158c2ecf20Sopenharmony_ci if (!found) 1168c2ecf20Sopenharmony_ci WARN(1, "The write IO is not recorded for serialization\n"); 1178c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&serial->serial_lock, flags); 1188c2ecf20Sopenharmony_ci wake_up(&serial->serial_io_wait); 1198c2ecf20Sopenharmony_ci} 1208c2ecf20Sopenharmony_ci 1218c2ecf20Sopenharmony_ci/* 1228c2ecf20Sopenharmony_ci * for resync bio, r1bio pointer can be retrieved from the per-bio 1238c2ecf20Sopenharmony_ci * 'struct resync_pages'. 1248c2ecf20Sopenharmony_ci */ 1258c2ecf20Sopenharmony_cistatic inline struct r1bio *get_resync_r1bio(struct bio *bio) 1268c2ecf20Sopenharmony_ci{ 1278c2ecf20Sopenharmony_ci return get_resync_pages(bio)->raid_bio; 1288c2ecf20Sopenharmony_ci} 1298c2ecf20Sopenharmony_ci 1308c2ecf20Sopenharmony_cistatic void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 1318c2ecf20Sopenharmony_ci{ 1328c2ecf20Sopenharmony_ci struct pool_info *pi = data; 1338c2ecf20Sopenharmony_ci int size = offsetof(struct r1bio, bios[pi->raid_disks]); 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_ci /* allocate a r1bio with room for raid_disks entries in the bios array */ 1368c2ecf20Sopenharmony_ci return kzalloc(size, gfp_flags); 1378c2ecf20Sopenharmony_ci} 1388c2ecf20Sopenharmony_ci 1398c2ecf20Sopenharmony_ci#define RESYNC_DEPTH 32 1408c2ecf20Sopenharmony_ci#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 1418c2ecf20Sopenharmony_ci#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) 1428c2ecf20Sopenharmony_ci#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) 1438c2ecf20Sopenharmony_ci#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) 1448c2ecf20Sopenharmony_ci#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_cistatic void * r1buf_pool_alloc(gfp_t gfp_flags, void 
*data) 1478c2ecf20Sopenharmony_ci{ 1488c2ecf20Sopenharmony_ci struct pool_info *pi = data; 1498c2ecf20Sopenharmony_ci struct r1bio *r1_bio; 1508c2ecf20Sopenharmony_ci struct bio *bio; 1518c2ecf20Sopenharmony_ci int need_pages; 1528c2ecf20Sopenharmony_ci int j; 1538c2ecf20Sopenharmony_ci struct resync_pages *rps; 1548c2ecf20Sopenharmony_ci 1558c2ecf20Sopenharmony_ci r1_bio = r1bio_pool_alloc(gfp_flags, pi); 1568c2ecf20Sopenharmony_ci if (!r1_bio) 1578c2ecf20Sopenharmony_ci return NULL; 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci rps = kmalloc_array(pi->raid_disks, sizeof(struct resync_pages), 1608c2ecf20Sopenharmony_ci gfp_flags); 1618c2ecf20Sopenharmony_ci if (!rps) 1628c2ecf20Sopenharmony_ci goto out_free_r1bio; 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_ci /* 1658c2ecf20Sopenharmony_ci * Allocate bios : 1 for reading, n-1 for writing 1668c2ecf20Sopenharmony_ci */ 1678c2ecf20Sopenharmony_ci for (j = pi->raid_disks ; j-- ; ) { 1688c2ecf20Sopenharmony_ci bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); 1698c2ecf20Sopenharmony_ci if (!bio) 1708c2ecf20Sopenharmony_ci goto out_free_bio; 1718c2ecf20Sopenharmony_ci r1_bio->bios[j] = bio; 1728c2ecf20Sopenharmony_ci } 1738c2ecf20Sopenharmony_ci /* 1748c2ecf20Sopenharmony_ci * Allocate RESYNC_PAGES data pages and attach them to 1758c2ecf20Sopenharmony_ci * the first bio. 1768c2ecf20Sopenharmony_ci * If this is a user-requested check/repair, allocate 1778c2ecf20Sopenharmony_ci * RESYNC_PAGES for each bio. 
1788c2ecf20Sopenharmony_ci */ 1798c2ecf20Sopenharmony_ci if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) 1808c2ecf20Sopenharmony_ci need_pages = pi->raid_disks; 1818c2ecf20Sopenharmony_ci else 1828c2ecf20Sopenharmony_ci need_pages = 1; 1838c2ecf20Sopenharmony_ci for (j = 0; j < pi->raid_disks; j++) { 1848c2ecf20Sopenharmony_ci struct resync_pages *rp = &rps[j]; 1858c2ecf20Sopenharmony_ci 1868c2ecf20Sopenharmony_ci bio = r1_bio->bios[j]; 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci if (j < need_pages) { 1898c2ecf20Sopenharmony_ci if (resync_alloc_pages(rp, gfp_flags)) 1908c2ecf20Sopenharmony_ci goto out_free_pages; 1918c2ecf20Sopenharmony_ci } else { 1928c2ecf20Sopenharmony_ci memcpy(rp, &rps[0], sizeof(*rp)); 1938c2ecf20Sopenharmony_ci resync_get_all_pages(rp); 1948c2ecf20Sopenharmony_ci } 1958c2ecf20Sopenharmony_ci 1968c2ecf20Sopenharmony_ci rp->raid_bio = r1_bio; 1978c2ecf20Sopenharmony_ci bio->bi_private = rp; 1988c2ecf20Sopenharmony_ci } 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci r1_bio->master_bio = NULL; 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci return r1_bio; 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_ciout_free_pages: 2058c2ecf20Sopenharmony_ci while (--j >= 0) 2068c2ecf20Sopenharmony_ci resync_free_pages(&rps[j]); 2078c2ecf20Sopenharmony_ci 2088c2ecf20Sopenharmony_ciout_free_bio: 2098c2ecf20Sopenharmony_ci while (++j < pi->raid_disks) 2108c2ecf20Sopenharmony_ci bio_put(r1_bio->bios[j]); 2118c2ecf20Sopenharmony_ci kfree(rps); 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ciout_free_r1bio: 2148c2ecf20Sopenharmony_ci rbio_pool_free(r1_bio, data); 2158c2ecf20Sopenharmony_ci return NULL; 2168c2ecf20Sopenharmony_ci} 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_cistatic void r1buf_pool_free(void *__r1_bio, void *data) 2198c2ecf20Sopenharmony_ci{ 2208c2ecf20Sopenharmony_ci struct pool_info *pi = data; 2218c2ecf20Sopenharmony_ci int i; 2228c2ecf20Sopenharmony_ci struct r1bio *r1bio = __r1_bio; 
2238c2ecf20Sopenharmony_ci struct resync_pages *rp = NULL; 2248c2ecf20Sopenharmony_ci 2258c2ecf20Sopenharmony_ci for (i = pi->raid_disks; i--; ) { 2268c2ecf20Sopenharmony_ci rp = get_resync_pages(r1bio->bios[i]); 2278c2ecf20Sopenharmony_ci resync_free_pages(rp); 2288c2ecf20Sopenharmony_ci bio_put(r1bio->bios[i]); 2298c2ecf20Sopenharmony_ci } 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci /* resync pages array stored in the 1st bio's .bi_private */ 2328c2ecf20Sopenharmony_ci kfree(rp); 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_ci rbio_pool_free(r1bio, data); 2358c2ecf20Sopenharmony_ci} 2368c2ecf20Sopenharmony_ci 2378c2ecf20Sopenharmony_cistatic void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) 2388c2ecf20Sopenharmony_ci{ 2398c2ecf20Sopenharmony_ci int i; 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 2428c2ecf20Sopenharmony_ci struct bio **bio = r1_bio->bios + i; 2438c2ecf20Sopenharmony_ci if (!BIO_SPECIAL(*bio)) 2448c2ecf20Sopenharmony_ci bio_put(*bio); 2458c2ecf20Sopenharmony_ci *bio = NULL; 2468c2ecf20Sopenharmony_ci } 2478c2ecf20Sopenharmony_ci} 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_cistatic void free_r1bio(struct r1bio *r1_bio) 2508c2ecf20Sopenharmony_ci{ 2518c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 2528c2ecf20Sopenharmony_ci 2538c2ecf20Sopenharmony_ci put_all_bios(conf, r1_bio); 2548c2ecf20Sopenharmony_ci mempool_free(r1_bio, &conf->r1bio_pool); 2558c2ecf20Sopenharmony_ci} 2568c2ecf20Sopenharmony_ci 2578c2ecf20Sopenharmony_cistatic void put_buf(struct r1bio *r1_bio) 2588c2ecf20Sopenharmony_ci{ 2598c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 2608c2ecf20Sopenharmony_ci sector_t sect = r1_bio->sector; 2618c2ecf20Sopenharmony_ci int i; 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 2648c2ecf20Sopenharmony_ci struct bio *bio = r1_bio->bios[i]; 2658c2ecf20Sopenharmony_ci 
if (bio->bi_end_io) 2668c2ecf20Sopenharmony_ci rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); 2678c2ecf20Sopenharmony_ci } 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_ci mempool_free(r1_bio, &conf->r1buf_pool); 2708c2ecf20Sopenharmony_ci 2718c2ecf20Sopenharmony_ci lower_barrier(conf, sect); 2728c2ecf20Sopenharmony_ci} 2738c2ecf20Sopenharmony_ci 2748c2ecf20Sopenharmony_cistatic void reschedule_retry(struct r1bio *r1_bio) 2758c2ecf20Sopenharmony_ci{ 2768c2ecf20Sopenharmony_ci unsigned long flags; 2778c2ecf20Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 2788c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 2798c2ecf20Sopenharmony_ci int idx; 2808c2ecf20Sopenharmony_ci 2818c2ecf20Sopenharmony_ci idx = sector_to_idx(r1_bio->sector); 2828c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 2838c2ecf20Sopenharmony_ci list_add(&r1_bio->retry_list, &conf->retry_list); 2848c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_queued[idx]); 2858c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 2868c2ecf20Sopenharmony_ci 2878c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 2888c2ecf20Sopenharmony_ci md_wakeup_thread(mddev->thread); 2898c2ecf20Sopenharmony_ci} 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci/* 2928c2ecf20Sopenharmony_ci * raid_end_bio_io() is called when we have finished servicing a mirrored 2938c2ecf20Sopenharmony_ci * operation and are ready to return a success/failure code to the buffer 2948c2ecf20Sopenharmony_ci * cache layer. 
2958c2ecf20Sopenharmony_ci */ 2968c2ecf20Sopenharmony_cistatic void call_bio_endio(struct r1bio *r1_bio) 2978c2ecf20Sopenharmony_ci{ 2988c2ecf20Sopenharmony_ci struct bio *bio = r1_bio->master_bio; 2998c2ecf20Sopenharmony_ci 3008c2ecf20Sopenharmony_ci if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) 3018c2ecf20Sopenharmony_ci bio->bi_status = BLK_STS_IOERR; 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci bio_endio(bio); 3048c2ecf20Sopenharmony_ci} 3058c2ecf20Sopenharmony_ci 3068c2ecf20Sopenharmony_cistatic void raid_end_bio_io(struct r1bio *r1_bio) 3078c2ecf20Sopenharmony_ci{ 3088c2ecf20Sopenharmony_ci struct bio *bio = r1_bio->master_bio; 3098c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ci /* if nobody has done the final endio yet, do it now */ 3128c2ecf20Sopenharmony_ci if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 3138c2ecf20Sopenharmony_ci pr_debug("raid1: sync end %s on sectors %llu-%llu\n", 3148c2ecf20Sopenharmony_ci (bio_data_dir(bio) == WRITE) ? "write" : "read", 3158c2ecf20Sopenharmony_ci (unsigned long long) bio->bi_iter.bi_sector, 3168c2ecf20Sopenharmony_ci (unsigned long long) bio_end_sector(bio) - 1); 3178c2ecf20Sopenharmony_ci 3188c2ecf20Sopenharmony_ci call_bio_endio(r1_bio); 3198c2ecf20Sopenharmony_ci } 3208c2ecf20Sopenharmony_ci /* 3218c2ecf20Sopenharmony_ci * Wake up any possible resync thread that waits for the device 3228c2ecf20Sopenharmony_ci * to go idle. All I/Os, even write-behind writes, are done. 3238c2ecf20Sopenharmony_ci */ 3248c2ecf20Sopenharmony_ci allow_barrier(conf, r1_bio->sector); 3258c2ecf20Sopenharmony_ci 3268c2ecf20Sopenharmony_ci free_r1bio(r1_bio); 3278c2ecf20Sopenharmony_ci} 3288c2ecf20Sopenharmony_ci 3298c2ecf20Sopenharmony_ci/* 3308c2ecf20Sopenharmony_ci * Update disk head position estimator based on IRQ completion info. 
3318c2ecf20Sopenharmony_ci */ 3328c2ecf20Sopenharmony_cistatic inline void update_head_pos(int disk, struct r1bio *r1_bio) 3338c2ecf20Sopenharmony_ci{ 3348c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 3358c2ecf20Sopenharmony_ci 3368c2ecf20Sopenharmony_ci conf->mirrors[disk].head_position = 3378c2ecf20Sopenharmony_ci r1_bio->sector + (r1_bio->sectors); 3388c2ecf20Sopenharmony_ci} 3398c2ecf20Sopenharmony_ci 3408c2ecf20Sopenharmony_ci/* 3418c2ecf20Sopenharmony_ci * Find the disk number which triggered given bio 3428c2ecf20Sopenharmony_ci */ 3438c2ecf20Sopenharmony_cistatic int find_bio_disk(struct r1bio *r1_bio, struct bio *bio) 3448c2ecf20Sopenharmony_ci{ 3458c2ecf20Sopenharmony_ci int mirror; 3468c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 3478c2ecf20Sopenharmony_ci int raid_disks = conf->raid_disks; 3488c2ecf20Sopenharmony_ci 3498c2ecf20Sopenharmony_ci for (mirror = 0; mirror < raid_disks * 2; mirror++) 3508c2ecf20Sopenharmony_ci if (r1_bio->bios[mirror] == bio) 3518c2ecf20Sopenharmony_ci break; 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci BUG_ON(mirror == raid_disks * 2); 3548c2ecf20Sopenharmony_ci update_head_pos(mirror, r1_bio); 3558c2ecf20Sopenharmony_ci 3568c2ecf20Sopenharmony_ci return mirror; 3578c2ecf20Sopenharmony_ci} 3588c2ecf20Sopenharmony_ci 3598c2ecf20Sopenharmony_cistatic void raid1_end_read_request(struct bio *bio) 3608c2ecf20Sopenharmony_ci{ 3618c2ecf20Sopenharmony_ci int uptodate = !bio->bi_status; 3628c2ecf20Sopenharmony_ci struct r1bio *r1_bio = bio->bi_private; 3638c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 3648c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci /* 3678c2ecf20Sopenharmony_ci * this branch is our 'one mirror IO has finished' event handler: 3688c2ecf20Sopenharmony_ci */ 3698c2ecf20Sopenharmony_ci update_head_pos(r1_bio->read_disk, r1_bio); 
3708c2ecf20Sopenharmony_ci 3718c2ecf20Sopenharmony_ci if (uptodate) 3728c2ecf20Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 3738c2ecf20Sopenharmony_ci else if (test_bit(FailFast, &rdev->flags) && 3748c2ecf20Sopenharmony_ci test_bit(R1BIO_FailFast, &r1_bio->state)) 3758c2ecf20Sopenharmony_ci /* This was a fail-fast read so we definitely 3768c2ecf20Sopenharmony_ci * want to retry */ 3778c2ecf20Sopenharmony_ci ; 3788c2ecf20Sopenharmony_ci else { 3798c2ecf20Sopenharmony_ci /* If all other devices have failed, we want to return 3808c2ecf20Sopenharmony_ci * the error upwards rather than fail the last device. 3818c2ecf20Sopenharmony_ci * Here we redefine "uptodate" to mean "Don't want to retry" 3828c2ecf20Sopenharmony_ci */ 3838c2ecf20Sopenharmony_ci unsigned long flags; 3848c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 3858c2ecf20Sopenharmony_ci if (r1_bio->mddev->degraded == conf->raid_disks || 3868c2ecf20Sopenharmony_ci (r1_bio->mddev->degraded == conf->raid_disks-1 && 3878c2ecf20Sopenharmony_ci test_bit(In_sync, &rdev->flags))) 3888c2ecf20Sopenharmony_ci uptodate = 1; 3898c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 3908c2ecf20Sopenharmony_ci } 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci if (uptodate) { 3938c2ecf20Sopenharmony_ci raid_end_bio_io(r1_bio); 3948c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 3958c2ecf20Sopenharmony_ci } else { 3968c2ecf20Sopenharmony_ci /* 3978c2ecf20Sopenharmony_ci * oops, read error: 3988c2ecf20Sopenharmony_ci */ 3998c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 4008c2ecf20Sopenharmony_ci pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n", 4018c2ecf20Sopenharmony_ci mdname(conf->mddev), 4028c2ecf20Sopenharmony_ci bdevname(rdev->bdev, b), 4038c2ecf20Sopenharmony_ci (unsigned long long)r1_bio->sector); 4048c2ecf20Sopenharmony_ci set_bit(R1BIO_ReadError, &r1_bio->state); 4058c2ecf20Sopenharmony_ci reschedule_retry(r1_bio); 
4068c2ecf20Sopenharmony_ci /* don't drop the reference on read_disk yet */ 4078c2ecf20Sopenharmony_ci } 4088c2ecf20Sopenharmony_ci} 4098c2ecf20Sopenharmony_ci 4108c2ecf20Sopenharmony_cistatic void close_write(struct r1bio *r1_bio) 4118c2ecf20Sopenharmony_ci{ 4128c2ecf20Sopenharmony_ci /* it really is the end of this request */ 4138c2ecf20Sopenharmony_ci if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 4148c2ecf20Sopenharmony_ci bio_free_pages(r1_bio->behind_master_bio); 4158c2ecf20Sopenharmony_ci bio_put(r1_bio->behind_master_bio); 4168c2ecf20Sopenharmony_ci r1_bio->behind_master_bio = NULL; 4178c2ecf20Sopenharmony_ci } 4188c2ecf20Sopenharmony_ci /* clear the bitmap if all writes complete successfully */ 4198c2ecf20Sopenharmony_ci md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 4208c2ecf20Sopenharmony_ci r1_bio->sectors, 4218c2ecf20Sopenharmony_ci !test_bit(R1BIO_Degraded, &r1_bio->state), 4228c2ecf20Sopenharmony_ci test_bit(R1BIO_BehindIO, &r1_bio->state)); 4238c2ecf20Sopenharmony_ci md_write_end(r1_bio->mddev); 4248c2ecf20Sopenharmony_ci} 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_cistatic void r1_bio_write_done(struct r1bio *r1_bio) 4278c2ecf20Sopenharmony_ci{ 4288c2ecf20Sopenharmony_ci if (!atomic_dec_and_test(&r1_bio->remaining)) 4298c2ecf20Sopenharmony_ci return; 4308c2ecf20Sopenharmony_ci 4318c2ecf20Sopenharmony_ci if (test_bit(R1BIO_WriteError, &r1_bio->state)) 4328c2ecf20Sopenharmony_ci reschedule_retry(r1_bio); 4338c2ecf20Sopenharmony_ci else { 4348c2ecf20Sopenharmony_ci close_write(r1_bio); 4358c2ecf20Sopenharmony_ci if (test_bit(R1BIO_MadeGood, &r1_bio->state)) 4368c2ecf20Sopenharmony_ci reschedule_retry(r1_bio); 4378c2ecf20Sopenharmony_ci else 4388c2ecf20Sopenharmony_ci raid_end_bio_io(r1_bio); 4398c2ecf20Sopenharmony_ci } 4408c2ecf20Sopenharmony_ci} 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_cistatic void raid1_end_write_request(struct bio *bio) 4438c2ecf20Sopenharmony_ci{ 4448c2ecf20Sopenharmony_ci struct r1bio *r1_bio = 
bio->bi_private; 4458c2ecf20Sopenharmony_ci int behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 4468c2ecf20Sopenharmony_ci struct r1conf *conf = r1_bio->mddev->private; 4478c2ecf20Sopenharmony_ci struct bio *to_put = NULL; 4488c2ecf20Sopenharmony_ci int mirror = find_bio_disk(r1_bio, bio); 4498c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[mirror].rdev; 4508c2ecf20Sopenharmony_ci bool discard_error; 4518c2ecf20Sopenharmony_ci sector_t lo = r1_bio->sector; 4528c2ecf20Sopenharmony_ci sector_t hi = r1_bio->sector + r1_bio->sectors; 4538c2ecf20Sopenharmony_ci 4548c2ecf20Sopenharmony_ci discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci /* 4578c2ecf20Sopenharmony_ci * 'one mirror IO has finished' event handler: 4588c2ecf20Sopenharmony_ci */ 4598c2ecf20Sopenharmony_ci if (bio->bi_status && !discard_error) { 4608c2ecf20Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 4618c2ecf20Sopenharmony_ci if (!test_and_set_bit(WantReplacement, &rdev->flags)) 4628c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, & 4638c2ecf20Sopenharmony_ci conf->mddev->recovery); 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci if (test_bit(FailFast, &rdev->flags) && 4668c2ecf20Sopenharmony_ci (bio->bi_opf & MD_FAILFAST) && 4678c2ecf20Sopenharmony_ci /* We never try FailFast to WriteMostly devices */ 4688c2ecf20Sopenharmony_ci !test_bit(WriteMostly, &rdev->flags)) { 4698c2ecf20Sopenharmony_ci md_error(r1_bio->mddev, rdev); 4708c2ecf20Sopenharmony_ci } 4718c2ecf20Sopenharmony_ci 4728c2ecf20Sopenharmony_ci /* 4738c2ecf20Sopenharmony_ci * When the device is faulty, it is not necessary to 4748c2ecf20Sopenharmony_ci * handle write error. 
4758c2ecf20Sopenharmony_ci */ 4768c2ecf20Sopenharmony_ci if (!test_bit(Faulty, &rdev->flags)) 4778c2ecf20Sopenharmony_ci set_bit(R1BIO_WriteError, &r1_bio->state); 4788c2ecf20Sopenharmony_ci else { 4798c2ecf20Sopenharmony_ci /* Fail the request */ 4808c2ecf20Sopenharmony_ci set_bit(R1BIO_Degraded, &r1_bio->state); 4818c2ecf20Sopenharmony_ci /* Finished with this branch */ 4828c2ecf20Sopenharmony_ci r1_bio->bios[mirror] = NULL; 4838c2ecf20Sopenharmony_ci to_put = bio; 4848c2ecf20Sopenharmony_ci } 4858c2ecf20Sopenharmony_ci } else { 4868c2ecf20Sopenharmony_ci /* 4878c2ecf20Sopenharmony_ci * Set R1BIO_Uptodate in our master bio, so that we 4888c2ecf20Sopenharmony_ci * will return a good error code for to the higher 4898c2ecf20Sopenharmony_ci * levels even if IO on some other mirrored buffer 4908c2ecf20Sopenharmony_ci * fails. 4918c2ecf20Sopenharmony_ci * 4928c2ecf20Sopenharmony_ci * The 'master' represents the composite IO operation 4938c2ecf20Sopenharmony_ci * to user-side. So if something waits for IO, then it 4948c2ecf20Sopenharmony_ci * will wait for the 'master' bio. 4958c2ecf20Sopenharmony_ci */ 4968c2ecf20Sopenharmony_ci sector_t first_bad; 4978c2ecf20Sopenharmony_ci int bad_sectors; 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_ci r1_bio->bios[mirror] = NULL; 5008c2ecf20Sopenharmony_ci to_put = bio; 5018c2ecf20Sopenharmony_ci /* 5028c2ecf20Sopenharmony_ci * Do not set R1BIO_Uptodate if the current device is 5038c2ecf20Sopenharmony_ci * rebuilding or Faulty. This is because we cannot use 5048c2ecf20Sopenharmony_ci * such device for properly reading the data back (we could 5058c2ecf20Sopenharmony_ci * potentially use it, if the current write would have felt 5068c2ecf20Sopenharmony_ci * before rdev->recovery_offset, but for simplicity we don't 5078c2ecf20Sopenharmony_ci * check this here. 
5088c2ecf20Sopenharmony_ci */ 5098c2ecf20Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) && 5108c2ecf20Sopenharmony_ci !test_bit(Faulty, &rdev->flags)) 5118c2ecf20Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 5128c2ecf20Sopenharmony_ci 5138c2ecf20Sopenharmony_ci /* Maybe we can clear some bad blocks. */ 5148c2ecf20Sopenharmony_ci if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, 5158c2ecf20Sopenharmony_ci &first_bad, &bad_sectors) && !discard_error) { 5168c2ecf20Sopenharmony_ci r1_bio->bios[mirror] = IO_MADE_GOOD; 5178c2ecf20Sopenharmony_ci set_bit(R1BIO_MadeGood, &r1_bio->state); 5188c2ecf20Sopenharmony_ci } 5198c2ecf20Sopenharmony_ci } 5208c2ecf20Sopenharmony_ci 5218c2ecf20Sopenharmony_ci if (behind) { 5228c2ecf20Sopenharmony_ci if (test_bit(CollisionCheck, &rdev->flags)) 5238c2ecf20Sopenharmony_ci remove_serial(rdev, lo, hi); 5248c2ecf20Sopenharmony_ci if (test_bit(WriteMostly, &rdev->flags)) 5258c2ecf20Sopenharmony_ci atomic_dec(&r1_bio->behind_remaining); 5268c2ecf20Sopenharmony_ci 5278c2ecf20Sopenharmony_ci /* 5288c2ecf20Sopenharmony_ci * In behind mode, we ACK the master bio once the I/O 5298c2ecf20Sopenharmony_ci * has safely reached all non-writemostly 5308c2ecf20Sopenharmony_ci * disks. 
Setting the Returned bit ensures that this 5318c2ecf20Sopenharmony_ci * gets done only once -- we don't ever want to return 5328c2ecf20Sopenharmony_ci * -EIO here, instead we'll wait 5338c2ecf20Sopenharmony_ci */ 5348c2ecf20Sopenharmony_ci if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 5358c2ecf20Sopenharmony_ci test_bit(R1BIO_Uptodate, &r1_bio->state)) { 5368c2ecf20Sopenharmony_ci /* Maybe we can return now */ 5378c2ecf20Sopenharmony_ci if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 5388c2ecf20Sopenharmony_ci struct bio *mbio = r1_bio->master_bio; 5398c2ecf20Sopenharmony_ci pr_debug("raid1: behind end write sectors" 5408c2ecf20Sopenharmony_ci " %llu-%llu\n", 5418c2ecf20Sopenharmony_ci (unsigned long long) mbio->bi_iter.bi_sector, 5428c2ecf20Sopenharmony_ci (unsigned long long) bio_end_sector(mbio) - 1); 5438c2ecf20Sopenharmony_ci call_bio_endio(r1_bio); 5448c2ecf20Sopenharmony_ci } 5458c2ecf20Sopenharmony_ci } 5468c2ecf20Sopenharmony_ci } else if (rdev->mddev->serialize_policy) 5478c2ecf20Sopenharmony_ci remove_serial(rdev, lo, hi); 5488c2ecf20Sopenharmony_ci if (r1_bio->bios[mirror] == NULL) 5498c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 5508c2ecf20Sopenharmony_ci 5518c2ecf20Sopenharmony_ci /* 5528c2ecf20Sopenharmony_ci * Let's see if all mirrored write operations have finished 5538c2ecf20Sopenharmony_ci * already. 
5548c2ecf20Sopenharmony_ci */ 5558c2ecf20Sopenharmony_ci r1_bio_write_done(r1_bio); 5568c2ecf20Sopenharmony_ci 5578c2ecf20Sopenharmony_ci if (to_put) 5588c2ecf20Sopenharmony_ci bio_put(to_put); 5598c2ecf20Sopenharmony_ci} 5608c2ecf20Sopenharmony_ci 5618c2ecf20Sopenharmony_cistatic sector_t align_to_barrier_unit_end(sector_t start_sector, 5628c2ecf20Sopenharmony_ci sector_t sectors) 5638c2ecf20Sopenharmony_ci{ 5648c2ecf20Sopenharmony_ci sector_t len; 5658c2ecf20Sopenharmony_ci 5668c2ecf20Sopenharmony_ci WARN_ON(sectors == 0); 5678c2ecf20Sopenharmony_ci /* 5688c2ecf20Sopenharmony_ci * len is the number of sectors from start_sector to end of the 5698c2ecf20Sopenharmony_ci * barrier unit which start_sector belongs to. 5708c2ecf20Sopenharmony_ci */ 5718c2ecf20Sopenharmony_ci len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) - 5728c2ecf20Sopenharmony_ci start_sector; 5738c2ecf20Sopenharmony_ci 5748c2ecf20Sopenharmony_ci if (len > sectors) 5758c2ecf20Sopenharmony_ci len = sectors; 5768c2ecf20Sopenharmony_ci 5778c2ecf20Sopenharmony_ci return len; 5788c2ecf20Sopenharmony_ci} 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_ci/* 5818c2ecf20Sopenharmony_ci * This routine returns the disk from which the requested read should 5828c2ecf20Sopenharmony_ci * be done. There is a per-array 'next expected sequential IO' sector 5838c2ecf20Sopenharmony_ci * number - if this matches on the next IO then we use the last disk. 5848c2ecf20Sopenharmony_ci * There is also a per-disk 'last know head position' sector that is 5858c2ecf20Sopenharmony_ci * maintained from IRQ contexts, both the normal and the resync IO 5868c2ecf20Sopenharmony_ci * completion handlers update this position correctly. If there is no 5878c2ecf20Sopenharmony_ci * perfect sequential match then we pick the disk whose head is closest. 
5888c2ecf20Sopenharmony_ci * 5898c2ecf20Sopenharmony_ci * If there are 2 mirrors in the same 2 devices, performance degrades 5908c2ecf20Sopenharmony_ci * because position is mirror, not device based. 5918c2ecf20Sopenharmony_ci * 5928c2ecf20Sopenharmony_ci * The rdev for the device selected will have nr_pending incremented. 5938c2ecf20Sopenharmony_ci */ 5948c2ecf20Sopenharmony_cistatic int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) 5958c2ecf20Sopenharmony_ci{ 5968c2ecf20Sopenharmony_ci const sector_t this_sector = r1_bio->sector; 5978c2ecf20Sopenharmony_ci int sectors; 5988c2ecf20Sopenharmony_ci int best_good_sectors; 5998c2ecf20Sopenharmony_ci int best_disk, best_dist_disk, best_pending_disk; 6008c2ecf20Sopenharmony_ci int has_nonrot_disk; 6018c2ecf20Sopenharmony_ci int disk; 6028c2ecf20Sopenharmony_ci sector_t best_dist; 6038c2ecf20Sopenharmony_ci unsigned int min_pending; 6048c2ecf20Sopenharmony_ci struct md_rdev *rdev; 6058c2ecf20Sopenharmony_ci int choose_first; 6068c2ecf20Sopenharmony_ci int choose_next_idle; 6078c2ecf20Sopenharmony_ci 6088c2ecf20Sopenharmony_ci rcu_read_lock(); 6098c2ecf20Sopenharmony_ci /* 6108c2ecf20Sopenharmony_ci * Check if we can balance. We can balance on the whole 6118c2ecf20Sopenharmony_ci * device if no resync is going on, or below the resync window. 6128c2ecf20Sopenharmony_ci * We take the first readable disk when above the resync window. 
6138c2ecf20Sopenharmony_ci */ 6148c2ecf20Sopenharmony_ci retry: 6158c2ecf20Sopenharmony_ci sectors = r1_bio->sectors; 6168c2ecf20Sopenharmony_ci best_disk = -1; 6178c2ecf20Sopenharmony_ci best_dist_disk = -1; 6188c2ecf20Sopenharmony_ci best_dist = MaxSector; 6198c2ecf20Sopenharmony_ci best_pending_disk = -1; 6208c2ecf20Sopenharmony_ci min_pending = UINT_MAX; 6218c2ecf20Sopenharmony_ci best_good_sectors = 0; 6228c2ecf20Sopenharmony_ci has_nonrot_disk = 0; 6238c2ecf20Sopenharmony_ci choose_next_idle = 0; 6248c2ecf20Sopenharmony_ci clear_bit(R1BIO_FailFast, &r1_bio->state); 6258c2ecf20Sopenharmony_ci 6268c2ecf20Sopenharmony_ci if ((conf->mddev->recovery_cp < this_sector + sectors) || 6278c2ecf20Sopenharmony_ci (mddev_is_clustered(conf->mddev) && 6288c2ecf20Sopenharmony_ci md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, 6298c2ecf20Sopenharmony_ci this_sector + sectors))) 6308c2ecf20Sopenharmony_ci choose_first = 1; 6318c2ecf20Sopenharmony_ci else 6328c2ecf20Sopenharmony_ci choose_first = 0; 6338c2ecf20Sopenharmony_ci 6348c2ecf20Sopenharmony_ci for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { 6358c2ecf20Sopenharmony_ci sector_t dist; 6368c2ecf20Sopenharmony_ci sector_t first_bad; 6378c2ecf20Sopenharmony_ci int bad_sectors; 6388c2ecf20Sopenharmony_ci unsigned int pending; 6398c2ecf20Sopenharmony_ci bool nonrot; 6408c2ecf20Sopenharmony_ci 6418c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[disk].rdev); 6428c2ecf20Sopenharmony_ci if (r1_bio->bios[disk] == IO_BLOCKED 6438c2ecf20Sopenharmony_ci || rdev == NULL 6448c2ecf20Sopenharmony_ci || test_bit(Faulty, &rdev->flags)) 6458c2ecf20Sopenharmony_ci continue; 6468c2ecf20Sopenharmony_ci if (!test_bit(In_sync, &rdev->flags) && 6478c2ecf20Sopenharmony_ci rdev->recovery_offset < this_sector + sectors) 6488c2ecf20Sopenharmony_ci continue; 6498c2ecf20Sopenharmony_ci if (test_bit(WriteMostly, &rdev->flags)) { 6508c2ecf20Sopenharmony_ci /* Don't balance among write-mostly, just 
6518c2ecf20Sopenharmony_ci * use the first as a last resort */ 6528c2ecf20Sopenharmony_ci if (best_dist_disk < 0) { 6538c2ecf20Sopenharmony_ci if (is_badblock(rdev, this_sector, sectors, 6548c2ecf20Sopenharmony_ci &first_bad, &bad_sectors)) { 6558c2ecf20Sopenharmony_ci if (first_bad <= this_sector) 6568c2ecf20Sopenharmony_ci /* Cannot use this */ 6578c2ecf20Sopenharmony_ci continue; 6588c2ecf20Sopenharmony_ci best_good_sectors = first_bad - this_sector; 6598c2ecf20Sopenharmony_ci } else 6608c2ecf20Sopenharmony_ci best_good_sectors = sectors; 6618c2ecf20Sopenharmony_ci best_dist_disk = disk; 6628c2ecf20Sopenharmony_ci best_pending_disk = disk; 6638c2ecf20Sopenharmony_ci } 6648c2ecf20Sopenharmony_ci continue; 6658c2ecf20Sopenharmony_ci } 6668c2ecf20Sopenharmony_ci /* This is a reasonable device to use. It might 6678c2ecf20Sopenharmony_ci * even be best. 6688c2ecf20Sopenharmony_ci */ 6698c2ecf20Sopenharmony_ci if (is_badblock(rdev, this_sector, sectors, 6708c2ecf20Sopenharmony_ci &first_bad, &bad_sectors)) { 6718c2ecf20Sopenharmony_ci if (best_dist < MaxSector) 6728c2ecf20Sopenharmony_ci /* already have a better device */ 6738c2ecf20Sopenharmony_ci continue; 6748c2ecf20Sopenharmony_ci if (first_bad <= this_sector) { 6758c2ecf20Sopenharmony_ci /* cannot read here. If this is the 'primary' 6768c2ecf20Sopenharmony_ci * device, then we must not read beyond 6778c2ecf20Sopenharmony_ci * bad_sectors from another device.. 
6788c2ecf20Sopenharmony_ci */ 6798c2ecf20Sopenharmony_ci bad_sectors -= (this_sector - first_bad); 6808c2ecf20Sopenharmony_ci if (choose_first && sectors > bad_sectors) 6818c2ecf20Sopenharmony_ci sectors = bad_sectors; 6828c2ecf20Sopenharmony_ci if (best_good_sectors > sectors) 6838c2ecf20Sopenharmony_ci best_good_sectors = sectors; 6848c2ecf20Sopenharmony_ci 6858c2ecf20Sopenharmony_ci } else { 6868c2ecf20Sopenharmony_ci sector_t good_sectors = first_bad - this_sector; 6878c2ecf20Sopenharmony_ci if (good_sectors > best_good_sectors) { 6888c2ecf20Sopenharmony_ci best_good_sectors = good_sectors; 6898c2ecf20Sopenharmony_ci best_disk = disk; 6908c2ecf20Sopenharmony_ci } 6918c2ecf20Sopenharmony_ci if (choose_first) 6928c2ecf20Sopenharmony_ci break; 6938c2ecf20Sopenharmony_ci } 6948c2ecf20Sopenharmony_ci continue; 6958c2ecf20Sopenharmony_ci } else { 6968c2ecf20Sopenharmony_ci if ((sectors > best_good_sectors) && (best_disk >= 0)) 6978c2ecf20Sopenharmony_ci best_disk = -1; 6988c2ecf20Sopenharmony_ci best_good_sectors = sectors; 6998c2ecf20Sopenharmony_ci } 7008c2ecf20Sopenharmony_ci 7018c2ecf20Sopenharmony_ci if (best_disk >= 0) 7028c2ecf20Sopenharmony_ci /* At least two disks to choose from so failfast is OK */ 7038c2ecf20Sopenharmony_ci set_bit(R1BIO_FailFast, &r1_bio->state); 7048c2ecf20Sopenharmony_ci 7058c2ecf20Sopenharmony_ci nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); 7068c2ecf20Sopenharmony_ci has_nonrot_disk |= nonrot; 7078c2ecf20Sopenharmony_ci pending = atomic_read(&rdev->nr_pending); 7088c2ecf20Sopenharmony_ci dist = abs(this_sector - conf->mirrors[disk].head_position); 7098c2ecf20Sopenharmony_ci if (choose_first) { 7108c2ecf20Sopenharmony_ci best_disk = disk; 7118c2ecf20Sopenharmony_ci break; 7128c2ecf20Sopenharmony_ci } 7138c2ecf20Sopenharmony_ci /* Don't change to another disk for sequential reads */ 7148c2ecf20Sopenharmony_ci if (conf->mirrors[disk].next_seq_sect == this_sector 7158c2ecf20Sopenharmony_ci || dist == 0) { 
7168c2ecf20Sopenharmony_ci int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; 7178c2ecf20Sopenharmony_ci struct raid1_info *mirror = &conf->mirrors[disk]; 7188c2ecf20Sopenharmony_ci 7198c2ecf20Sopenharmony_ci best_disk = disk; 7208c2ecf20Sopenharmony_ci /* 7218c2ecf20Sopenharmony_ci * If buffered sequential IO size exceeds optimal 7228c2ecf20Sopenharmony_ci * iosize, check if there is idle disk. If yes, choose 7238c2ecf20Sopenharmony_ci * the idle disk. read_balance could already choose an 7248c2ecf20Sopenharmony_ci * idle disk before noticing it's a sequential IO in 7258c2ecf20Sopenharmony_ci * this disk. This doesn't matter because this disk 7268c2ecf20Sopenharmony_ci * will idle, next time it will be utilized after the 7278c2ecf20Sopenharmony_ci * first disk has IO size exceeds optimal iosize. In 7288c2ecf20Sopenharmony_ci * this way, iosize of the first disk will be optimal 7298c2ecf20Sopenharmony_ci * iosize at least. iosize of the second disk might be 7308c2ecf20Sopenharmony_ci * small, but not a big deal since when the second disk 7318c2ecf20Sopenharmony_ci * starts IO, the first disk is likely still busy. 
7328c2ecf20Sopenharmony_ci */ 7338c2ecf20Sopenharmony_ci if (nonrot && opt_iosize > 0 && 7348c2ecf20Sopenharmony_ci mirror->seq_start != MaxSector && 7358c2ecf20Sopenharmony_ci mirror->next_seq_sect > opt_iosize && 7368c2ecf20Sopenharmony_ci mirror->next_seq_sect - opt_iosize >= 7378c2ecf20Sopenharmony_ci mirror->seq_start) { 7388c2ecf20Sopenharmony_ci choose_next_idle = 1; 7398c2ecf20Sopenharmony_ci continue; 7408c2ecf20Sopenharmony_ci } 7418c2ecf20Sopenharmony_ci break; 7428c2ecf20Sopenharmony_ci } 7438c2ecf20Sopenharmony_ci 7448c2ecf20Sopenharmony_ci if (choose_next_idle) 7458c2ecf20Sopenharmony_ci continue; 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci if (min_pending > pending) { 7488c2ecf20Sopenharmony_ci min_pending = pending; 7498c2ecf20Sopenharmony_ci best_pending_disk = disk; 7508c2ecf20Sopenharmony_ci } 7518c2ecf20Sopenharmony_ci 7528c2ecf20Sopenharmony_ci if (dist < best_dist) { 7538c2ecf20Sopenharmony_ci best_dist = dist; 7548c2ecf20Sopenharmony_ci best_dist_disk = disk; 7558c2ecf20Sopenharmony_ci } 7568c2ecf20Sopenharmony_ci } 7578c2ecf20Sopenharmony_ci 7588c2ecf20Sopenharmony_ci /* 7598c2ecf20Sopenharmony_ci * If all disks are rotational, choose the closest disk. If any disk is 7608c2ecf20Sopenharmony_ci * non-rotational, choose the disk with less pending request even the 7618c2ecf20Sopenharmony_ci * disk is rotational, which might/might not be optimal for raids with 7628c2ecf20Sopenharmony_ci * mixed ratation/non-rotational disks depending on workload. 
7638c2ecf20Sopenharmony_ci */ 7648c2ecf20Sopenharmony_ci if (best_disk == -1) { 7658c2ecf20Sopenharmony_ci if (has_nonrot_disk || min_pending == 0) 7668c2ecf20Sopenharmony_ci best_disk = best_pending_disk; 7678c2ecf20Sopenharmony_ci else 7688c2ecf20Sopenharmony_ci best_disk = best_dist_disk; 7698c2ecf20Sopenharmony_ci } 7708c2ecf20Sopenharmony_ci 7718c2ecf20Sopenharmony_ci if (best_disk >= 0) { 7728c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[best_disk].rdev); 7738c2ecf20Sopenharmony_ci if (!rdev) 7748c2ecf20Sopenharmony_ci goto retry; 7758c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 7768c2ecf20Sopenharmony_ci sectors = best_good_sectors; 7778c2ecf20Sopenharmony_ci 7788c2ecf20Sopenharmony_ci if (conf->mirrors[best_disk].next_seq_sect != this_sector) 7798c2ecf20Sopenharmony_ci conf->mirrors[best_disk].seq_start = this_sector; 7808c2ecf20Sopenharmony_ci 7818c2ecf20Sopenharmony_ci conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; 7828c2ecf20Sopenharmony_ci } 7838c2ecf20Sopenharmony_ci rcu_read_unlock(); 7848c2ecf20Sopenharmony_ci *max_sectors = sectors; 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_ci return best_disk; 7878c2ecf20Sopenharmony_ci} 7888c2ecf20Sopenharmony_ci 7898c2ecf20Sopenharmony_cistatic void flush_bio_list(struct r1conf *conf, struct bio *bio) 7908c2ecf20Sopenharmony_ci{ 7918c2ecf20Sopenharmony_ci /* flush any pending bitmap writes to disk before proceeding w/ I/O */ 7928c2ecf20Sopenharmony_ci md_bitmap_unplug(conf->mddev->bitmap); 7938c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 7948c2ecf20Sopenharmony_ci 7958c2ecf20Sopenharmony_ci while (bio) { /* submit pending writes */ 7968c2ecf20Sopenharmony_ci struct bio *next = bio->bi_next; 7978c2ecf20Sopenharmony_ci struct md_rdev *rdev = (void *)bio->bi_disk; 7988c2ecf20Sopenharmony_ci bio->bi_next = NULL; 7998c2ecf20Sopenharmony_ci bio_set_dev(bio, rdev->bdev); 8008c2ecf20Sopenharmony_ci if (test_bit(Faulty, &rdev->flags)) { 8018c2ecf20Sopenharmony_ci 
bio_io_error(bio); 8028c2ecf20Sopenharmony_ci } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && 8038c2ecf20Sopenharmony_ci !blk_queue_discard(bio->bi_disk->queue))) 8048c2ecf20Sopenharmony_ci /* Just ignore it */ 8058c2ecf20Sopenharmony_ci bio_endio(bio); 8068c2ecf20Sopenharmony_ci else 8078c2ecf20Sopenharmony_ci submit_bio_noacct(bio); 8088c2ecf20Sopenharmony_ci bio = next; 8098c2ecf20Sopenharmony_ci cond_resched(); 8108c2ecf20Sopenharmony_ci } 8118c2ecf20Sopenharmony_ci} 8128c2ecf20Sopenharmony_ci 8138c2ecf20Sopenharmony_cistatic void flush_pending_writes(struct r1conf *conf) 8148c2ecf20Sopenharmony_ci{ 8158c2ecf20Sopenharmony_ci /* Any writes that have been queued but are awaiting 8168c2ecf20Sopenharmony_ci * bitmap updates get flushed here. 8178c2ecf20Sopenharmony_ci */ 8188c2ecf20Sopenharmony_ci spin_lock_irq(&conf->device_lock); 8198c2ecf20Sopenharmony_ci 8208c2ecf20Sopenharmony_ci if (conf->pending_bio_list.head) { 8218c2ecf20Sopenharmony_ci struct blk_plug plug; 8228c2ecf20Sopenharmony_ci struct bio *bio; 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_ci bio = bio_list_get(&conf->pending_bio_list); 8258c2ecf20Sopenharmony_ci conf->pending_count = 0; 8268c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->device_lock); 8278c2ecf20Sopenharmony_ci 8288c2ecf20Sopenharmony_ci /* 8298c2ecf20Sopenharmony_ci * As this is called in a wait_event() loop (see freeze_array), 8308c2ecf20Sopenharmony_ci * current->state might be TASK_UNINTERRUPTIBLE which will 8318c2ecf20Sopenharmony_ci * cause a warning when we prepare to wait again. As it is 8328c2ecf20Sopenharmony_ci * rare that this path is taken, it is perfectly safe to force 8338c2ecf20Sopenharmony_ci * us to go around the wait_event() loop again, so the warning 8348c2ecf20Sopenharmony_ci * is a false-positive. 
Silence the warning by resetting 8358c2ecf20Sopenharmony_ci * thread state 8368c2ecf20Sopenharmony_ci */ 8378c2ecf20Sopenharmony_ci __set_current_state(TASK_RUNNING); 8388c2ecf20Sopenharmony_ci blk_start_plug(&plug); 8398c2ecf20Sopenharmony_ci flush_bio_list(conf, bio); 8408c2ecf20Sopenharmony_ci blk_finish_plug(&plug); 8418c2ecf20Sopenharmony_ci } else 8428c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->device_lock); 8438c2ecf20Sopenharmony_ci} 8448c2ecf20Sopenharmony_ci 8458c2ecf20Sopenharmony_ci/* Barriers.... 8468c2ecf20Sopenharmony_ci * Sometimes we need to suspend IO while we do something else, 8478c2ecf20Sopenharmony_ci * either some resync/recovery, or reconfigure the array. 8488c2ecf20Sopenharmony_ci * To do this we raise a 'barrier'. 8498c2ecf20Sopenharmony_ci * The 'barrier' is a counter that can be raised multiple times 8508c2ecf20Sopenharmony_ci * to count how many activities are happening which preclude 8518c2ecf20Sopenharmony_ci * normal IO. 8528c2ecf20Sopenharmony_ci * We can only raise the barrier if there is no pending IO. 8538c2ecf20Sopenharmony_ci * i.e. if nr_pending == 0. 8548c2ecf20Sopenharmony_ci * We choose only to raise the barrier if no-one is waiting for the 8558c2ecf20Sopenharmony_ci * barrier to go down. This means that as soon as an IO request 8568c2ecf20Sopenharmony_ci * is ready, no other operations which require a barrier will start 8578c2ecf20Sopenharmony_ci * until the IO request has had a chance. 8588c2ecf20Sopenharmony_ci * 8598c2ecf20Sopenharmony_ci * So: regular IO calls 'wait_barrier'. When that returns there 8608c2ecf20Sopenharmony_ci * is no backgroup IO happening, It must arrange to call 8618c2ecf20Sopenharmony_ci * allow_barrier when it has finished its IO. 8628c2ecf20Sopenharmony_ci * backgroup IO calls must call raise_barrier. Once that returns 8638c2ecf20Sopenharmony_ci * there is no normal IO happeing. It must arrange to call 8648c2ecf20Sopenharmony_ci * lower_barrier when the particular background IO completes. 
8658c2ecf20Sopenharmony_ci * 8668c2ecf20Sopenharmony_ci * If resync/recovery is interrupted, returns -EINTR; 8678c2ecf20Sopenharmony_ci * Otherwise, returns 0. 8688c2ecf20Sopenharmony_ci */ 8698c2ecf20Sopenharmony_cistatic int raise_barrier(struct r1conf *conf, sector_t sector_nr) 8708c2ecf20Sopenharmony_ci{ 8718c2ecf20Sopenharmony_ci int idx = sector_to_idx(sector_nr); 8728c2ecf20Sopenharmony_ci 8738c2ecf20Sopenharmony_ci spin_lock_irq(&conf->resync_lock); 8748c2ecf20Sopenharmony_ci 8758c2ecf20Sopenharmony_ci /* Wait until no block IO is waiting */ 8768c2ecf20Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier, 8778c2ecf20Sopenharmony_ci !atomic_read(&conf->nr_waiting[idx]), 8788c2ecf20Sopenharmony_ci conf->resync_lock); 8798c2ecf20Sopenharmony_ci 8808c2ecf20Sopenharmony_ci /* block any new IO from starting */ 8818c2ecf20Sopenharmony_ci atomic_inc(&conf->barrier[idx]); 8828c2ecf20Sopenharmony_ci /* 8838c2ecf20Sopenharmony_ci * In raise_barrier() we firstly increase conf->barrier[idx] then 8848c2ecf20Sopenharmony_ci * check conf->nr_pending[idx]. In _wait_barrier() we firstly 8858c2ecf20Sopenharmony_ci * increase conf->nr_pending[idx] then check conf->barrier[idx]. 8868c2ecf20Sopenharmony_ci * A memory barrier here to make sure conf->nr_pending[idx] won't 8878c2ecf20Sopenharmony_ci * be fetched before conf->barrier[idx] is increased. Otherwise 8888c2ecf20Sopenharmony_ci * there will be a race between raise_barrier() and _wait_barrier(). 8898c2ecf20Sopenharmony_ci */ 8908c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 8918c2ecf20Sopenharmony_ci 8928c2ecf20Sopenharmony_ci /* For these conditions we must wait: 8938c2ecf20Sopenharmony_ci * A: while the array is in frozen state 8948c2ecf20Sopenharmony_ci * B: while conf->nr_pending[idx] is not 0, meaning regular I/O 8958c2ecf20Sopenharmony_ci * existing in corresponding I/O barrier bucket. 
8968c2ecf20Sopenharmony_ci * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches 8978c2ecf20Sopenharmony_ci * max resync count which allowed on current I/O barrier bucket. 8988c2ecf20Sopenharmony_ci */ 8998c2ecf20Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier, 9008c2ecf20Sopenharmony_ci (!conf->array_frozen && 9018c2ecf20Sopenharmony_ci !atomic_read(&conf->nr_pending[idx]) && 9028c2ecf20Sopenharmony_ci atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) || 9038c2ecf20Sopenharmony_ci test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery), 9048c2ecf20Sopenharmony_ci conf->resync_lock); 9058c2ecf20Sopenharmony_ci 9068c2ecf20Sopenharmony_ci if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 9078c2ecf20Sopenharmony_ci atomic_dec(&conf->barrier[idx]); 9088c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->resync_lock); 9098c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 9108c2ecf20Sopenharmony_ci return -EINTR; 9118c2ecf20Sopenharmony_ci } 9128c2ecf20Sopenharmony_ci 9138c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_sync_pending); 9148c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->resync_lock); 9158c2ecf20Sopenharmony_ci 9168c2ecf20Sopenharmony_ci return 0; 9178c2ecf20Sopenharmony_ci} 9188c2ecf20Sopenharmony_ci 9198c2ecf20Sopenharmony_cistatic void lower_barrier(struct r1conf *conf, sector_t sector_nr) 9208c2ecf20Sopenharmony_ci{ 9218c2ecf20Sopenharmony_ci int idx = sector_to_idx(sector_nr); 9228c2ecf20Sopenharmony_ci 9238c2ecf20Sopenharmony_ci BUG_ON(atomic_read(&conf->barrier[idx]) <= 0); 9248c2ecf20Sopenharmony_ci 9258c2ecf20Sopenharmony_ci atomic_dec(&conf->barrier[idx]); 9268c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_sync_pending); 9278c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 9288c2ecf20Sopenharmony_ci} 9298c2ecf20Sopenharmony_ci 9308c2ecf20Sopenharmony_cistatic void _wait_barrier(struct r1conf *conf, int idx) 9318c2ecf20Sopenharmony_ci{ 9328c2ecf20Sopenharmony_ci /* 9338c2ecf20Sopenharmony_ci * We need to increase conf->nr_pending[idx] 
very early here, 9348c2ecf20Sopenharmony_ci * then raise_barrier() can be blocked when it waits for 9358c2ecf20Sopenharmony_ci * conf->nr_pending[idx] to be 0. Then we can avoid holding 9368c2ecf20Sopenharmony_ci * conf->resync_lock when there is no barrier raised in same 9378c2ecf20Sopenharmony_ci * barrier unit bucket. Also if the array is frozen, I/O 9388c2ecf20Sopenharmony_ci * should be blocked until array is unfrozen. 9398c2ecf20Sopenharmony_ci */ 9408c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]); 9418c2ecf20Sopenharmony_ci /* 9428c2ecf20Sopenharmony_ci * In _wait_barrier() we firstly increase conf->nr_pending[idx], then 9438c2ecf20Sopenharmony_ci * check conf->barrier[idx]. In raise_barrier() we firstly increase 9448c2ecf20Sopenharmony_ci * conf->barrier[idx], then check conf->nr_pending[idx]. A memory 9458c2ecf20Sopenharmony_ci * barrier is necessary here to make sure conf->barrier[idx] won't be 9468c2ecf20Sopenharmony_ci * fetched before conf->nr_pending[idx] is increased. Otherwise there 9478c2ecf20Sopenharmony_ci * will be a race between _wait_barrier() and raise_barrier(). 9488c2ecf20Sopenharmony_ci */ 9498c2ecf20Sopenharmony_ci smp_mb__after_atomic(); 9508c2ecf20Sopenharmony_ci 9518c2ecf20Sopenharmony_ci /* 9528c2ecf20Sopenharmony_ci * Don't worry about checking two atomic_t variables at same time 9538c2ecf20Sopenharmony_ci * here. If during we check conf->barrier[idx], the array is 9548c2ecf20Sopenharmony_ci * frozen (conf->array_frozen is 1), and chonf->barrier[idx] is 9558c2ecf20Sopenharmony_ci * 0, it is safe to return and make the I/O continue. Because the 9568c2ecf20Sopenharmony_ci * array is frozen, all I/O returned here will eventually complete 9578c2ecf20Sopenharmony_ci * or be queued, no race will happen. See code comment in 9588c2ecf20Sopenharmony_ci * frozen_array(). 
9598c2ecf20Sopenharmony_ci */ 9608c2ecf20Sopenharmony_ci if (!READ_ONCE(conf->array_frozen) && 9618c2ecf20Sopenharmony_ci !atomic_read(&conf->barrier[idx])) 9628c2ecf20Sopenharmony_ci return; 9638c2ecf20Sopenharmony_ci 9648c2ecf20Sopenharmony_ci /* 9658c2ecf20Sopenharmony_ci * After holding conf->resync_lock, conf->nr_pending[idx] 9668c2ecf20Sopenharmony_ci * should be decreased before waiting for barrier to drop. 9678c2ecf20Sopenharmony_ci * Otherwise, we may encounter a race condition because 9688c2ecf20Sopenharmony_ci * raise_barrer() might be waiting for conf->nr_pending[idx] 9698c2ecf20Sopenharmony_ci * to be 0 at same time. 9708c2ecf20Sopenharmony_ci */ 9718c2ecf20Sopenharmony_ci spin_lock_irq(&conf->resync_lock); 9728c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_waiting[idx]); 9738c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_pending[idx]); 9748c2ecf20Sopenharmony_ci /* 9758c2ecf20Sopenharmony_ci * In case freeze_array() is waiting for 9768c2ecf20Sopenharmony_ci * get_unqueued_pending() == extra 9778c2ecf20Sopenharmony_ci */ 9788c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 9798c2ecf20Sopenharmony_ci /* Wait for the barrier in same barrier unit bucket to drop. */ 9808c2ecf20Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier, 9818c2ecf20Sopenharmony_ci !conf->array_frozen && 9828c2ecf20Sopenharmony_ci !atomic_read(&conf->barrier[idx]), 9838c2ecf20Sopenharmony_ci conf->resync_lock); 9848c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]); 9858c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_waiting[idx]); 9868c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->resync_lock); 9878c2ecf20Sopenharmony_ci} 9888c2ecf20Sopenharmony_ci 9898c2ecf20Sopenharmony_cistatic void wait_read_barrier(struct r1conf *conf, sector_t sector_nr) 9908c2ecf20Sopenharmony_ci{ 9918c2ecf20Sopenharmony_ci int idx = sector_to_idx(sector_nr); 9928c2ecf20Sopenharmony_ci 9938c2ecf20Sopenharmony_ci /* 9948c2ecf20Sopenharmony_ci * Very similar to _wait_barrier(). 
The difference is, for read 9958c2ecf20Sopenharmony_ci * I/O we don't need wait for sync I/O, but if the whole array 9968c2ecf20Sopenharmony_ci * is frozen, the read I/O still has to wait until the array is 9978c2ecf20Sopenharmony_ci * unfrozen. Since there is no ordering requirement with 9988c2ecf20Sopenharmony_ci * conf->barrier[idx] here, memory barrier is unnecessary as well. 9998c2ecf20Sopenharmony_ci */ 10008c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]); 10018c2ecf20Sopenharmony_ci 10028c2ecf20Sopenharmony_ci if (!READ_ONCE(conf->array_frozen)) 10038c2ecf20Sopenharmony_ci return; 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci spin_lock_irq(&conf->resync_lock); 10068c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_waiting[idx]); 10078c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_pending[idx]); 10088c2ecf20Sopenharmony_ci /* 10098c2ecf20Sopenharmony_ci * In case freeze_array() is waiting for 10108c2ecf20Sopenharmony_ci * get_unqueued_pending() == extra 10118c2ecf20Sopenharmony_ci */ 10128c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 10138c2ecf20Sopenharmony_ci /* Wait for array to be unfrozen */ 10148c2ecf20Sopenharmony_ci wait_event_lock_irq(conf->wait_barrier, 10158c2ecf20Sopenharmony_ci !conf->array_frozen, 10168c2ecf20Sopenharmony_ci conf->resync_lock); 10178c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_pending[idx]); 10188c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_waiting[idx]); 10198c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->resync_lock); 10208c2ecf20Sopenharmony_ci} 10218c2ecf20Sopenharmony_ci 10228c2ecf20Sopenharmony_cistatic void wait_barrier(struct r1conf *conf, sector_t sector_nr) 10238c2ecf20Sopenharmony_ci{ 10248c2ecf20Sopenharmony_ci int idx = sector_to_idx(sector_nr); 10258c2ecf20Sopenharmony_ci 10268c2ecf20Sopenharmony_ci _wait_barrier(conf, idx); 10278c2ecf20Sopenharmony_ci} 10288c2ecf20Sopenharmony_ci 10298c2ecf20Sopenharmony_cistatic void _allow_barrier(struct r1conf *conf, int idx) 10308c2ecf20Sopenharmony_ci{ 
10318c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_pending[idx]); 10328c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 10338c2ecf20Sopenharmony_ci} 10348c2ecf20Sopenharmony_ci 10358c2ecf20Sopenharmony_cistatic void allow_barrier(struct r1conf *conf, sector_t sector_nr) 10368c2ecf20Sopenharmony_ci{ 10378c2ecf20Sopenharmony_ci int idx = sector_to_idx(sector_nr); 10388c2ecf20Sopenharmony_ci 10398c2ecf20Sopenharmony_ci _allow_barrier(conf, idx); 10408c2ecf20Sopenharmony_ci} 10418c2ecf20Sopenharmony_ci 10428c2ecf20Sopenharmony_ci/* conf->resync_lock should be held */ 10438c2ecf20Sopenharmony_cistatic int get_unqueued_pending(struct r1conf *conf) 10448c2ecf20Sopenharmony_ci{ 10458c2ecf20Sopenharmony_ci int idx, ret; 10468c2ecf20Sopenharmony_ci 10478c2ecf20Sopenharmony_ci ret = atomic_read(&conf->nr_sync_pending); 10488c2ecf20Sopenharmony_ci for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) 10498c2ecf20Sopenharmony_ci ret += atomic_read(&conf->nr_pending[idx]) - 10508c2ecf20Sopenharmony_ci atomic_read(&conf->nr_queued[idx]); 10518c2ecf20Sopenharmony_ci 10528c2ecf20Sopenharmony_ci return ret; 10538c2ecf20Sopenharmony_ci} 10548c2ecf20Sopenharmony_ci 10558c2ecf20Sopenharmony_cistatic void freeze_array(struct r1conf *conf, int extra) 10568c2ecf20Sopenharmony_ci{ 10578c2ecf20Sopenharmony_ci /* Stop sync I/O and normal I/O and wait for everything to 10588c2ecf20Sopenharmony_ci * go quiet. 10598c2ecf20Sopenharmony_ci * This is called in two situations: 10608c2ecf20Sopenharmony_ci * 1) management command handlers (reshape, remove disk, quiesce). 10618c2ecf20Sopenharmony_ci * 2) one normal I/O request failed. 10628c2ecf20Sopenharmony_ci 10638c2ecf20Sopenharmony_ci * After array_frozen is set to 1, new sync IO will be blocked at 10648c2ecf20Sopenharmony_ci * raise_barrier(), and new normal I/O will blocked at _wait_barrier() 10658c2ecf20Sopenharmony_ci * or wait_read_barrier(). The flying I/Os will either complete or be 10668c2ecf20Sopenharmony_ci * queued. 
When everything goes quite, there are only queued I/Os left. 10678c2ecf20Sopenharmony_ci 10688c2ecf20Sopenharmony_ci * Every flying I/O contributes to a conf->nr_pending[idx], idx is the 10698c2ecf20Sopenharmony_ci * barrier bucket index which this I/O request hits. When all sync and 10708c2ecf20Sopenharmony_ci * normal I/O are queued, sum of all conf->nr_pending[] will match sum 10718c2ecf20Sopenharmony_ci * of all conf->nr_queued[]. But normal I/O failure is an exception, 10728c2ecf20Sopenharmony_ci * in handle_read_error(), we may call freeze_array() before trying to 10738c2ecf20Sopenharmony_ci * fix the read error. In this case, the error read I/O is not queued, 10748c2ecf20Sopenharmony_ci * so get_unqueued_pending() == 1. 10758c2ecf20Sopenharmony_ci * 10768c2ecf20Sopenharmony_ci * Therefore before this function returns, we need to wait until 10778c2ecf20Sopenharmony_ci * get_unqueued_pendings(conf) gets equal to extra. For 10788c2ecf20Sopenharmony_ci * normal I/O context, extra is 1, in rested situations extra is 0. 
10798c2ecf20Sopenharmony_ci */ 10808c2ecf20Sopenharmony_ci spin_lock_irq(&conf->resync_lock); 10818c2ecf20Sopenharmony_ci conf->array_frozen = 1; 10828c2ecf20Sopenharmony_ci raid1_log(conf->mddev, "wait freeze"); 10838c2ecf20Sopenharmony_ci wait_event_lock_irq_cmd( 10848c2ecf20Sopenharmony_ci conf->wait_barrier, 10858c2ecf20Sopenharmony_ci get_unqueued_pending(conf) == extra, 10868c2ecf20Sopenharmony_ci conf->resync_lock, 10878c2ecf20Sopenharmony_ci flush_pending_writes(conf)); 10888c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->resync_lock); 10898c2ecf20Sopenharmony_ci} 10908c2ecf20Sopenharmony_cistatic void unfreeze_array(struct r1conf *conf) 10918c2ecf20Sopenharmony_ci{ 10928c2ecf20Sopenharmony_ci /* reverse the effect of the freeze */ 10938c2ecf20Sopenharmony_ci spin_lock_irq(&conf->resync_lock); 10948c2ecf20Sopenharmony_ci conf->array_frozen = 0; 10958c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->resync_lock); 10968c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 10978c2ecf20Sopenharmony_ci} 10988c2ecf20Sopenharmony_ci 10998c2ecf20Sopenharmony_cistatic void alloc_behind_master_bio(struct r1bio *r1_bio, 11008c2ecf20Sopenharmony_ci struct bio *bio) 11018c2ecf20Sopenharmony_ci{ 11028c2ecf20Sopenharmony_ci int size = bio->bi_iter.bi_size; 11038c2ecf20Sopenharmony_ci unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 11048c2ecf20Sopenharmony_ci int i = 0; 11058c2ecf20Sopenharmony_ci struct bio *behind_bio = NULL; 11068c2ecf20Sopenharmony_ci 11078c2ecf20Sopenharmony_ci behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev); 11088c2ecf20Sopenharmony_ci if (!behind_bio) 11098c2ecf20Sopenharmony_ci return; 11108c2ecf20Sopenharmony_ci 11118c2ecf20Sopenharmony_ci /* discard op, we don't support writezero/writesame yet */ 11128c2ecf20Sopenharmony_ci if (!bio_has_data(bio)) { 11138c2ecf20Sopenharmony_ci behind_bio->bi_iter.bi_size = size; 11148c2ecf20Sopenharmony_ci goto skip_copy; 11158c2ecf20Sopenharmony_ci } 11168c2ecf20Sopenharmony_ci 
11178c2ecf20Sopenharmony_ci behind_bio->bi_write_hint = bio->bi_write_hint; 11188c2ecf20Sopenharmony_ci 11198c2ecf20Sopenharmony_ci while (i < vcnt && size) { 11208c2ecf20Sopenharmony_ci struct page *page; 11218c2ecf20Sopenharmony_ci int len = min_t(int, PAGE_SIZE, size); 11228c2ecf20Sopenharmony_ci 11238c2ecf20Sopenharmony_ci page = alloc_page(GFP_NOIO); 11248c2ecf20Sopenharmony_ci if (unlikely(!page)) 11258c2ecf20Sopenharmony_ci goto free_pages; 11268c2ecf20Sopenharmony_ci 11278c2ecf20Sopenharmony_ci bio_add_page(behind_bio, page, len, 0); 11288c2ecf20Sopenharmony_ci 11298c2ecf20Sopenharmony_ci size -= len; 11308c2ecf20Sopenharmony_ci i++; 11318c2ecf20Sopenharmony_ci } 11328c2ecf20Sopenharmony_ci 11338c2ecf20Sopenharmony_ci bio_copy_data(behind_bio, bio); 11348c2ecf20Sopenharmony_ciskip_copy: 11358c2ecf20Sopenharmony_ci r1_bio->behind_master_bio = behind_bio; 11368c2ecf20Sopenharmony_ci set_bit(R1BIO_BehindIO, &r1_bio->state); 11378c2ecf20Sopenharmony_ci 11388c2ecf20Sopenharmony_ci return; 11398c2ecf20Sopenharmony_ci 11408c2ecf20Sopenharmony_cifree_pages: 11418c2ecf20Sopenharmony_ci pr_debug("%dB behind alloc failed, doing sync I/O\n", 11428c2ecf20Sopenharmony_ci bio->bi_iter.bi_size); 11438c2ecf20Sopenharmony_ci bio_free_pages(behind_bio); 11448c2ecf20Sopenharmony_ci bio_put(behind_bio); 11458c2ecf20Sopenharmony_ci} 11468c2ecf20Sopenharmony_ci 11478c2ecf20Sopenharmony_cistruct raid1_plug_cb { 11488c2ecf20Sopenharmony_ci struct blk_plug_cb cb; 11498c2ecf20Sopenharmony_ci struct bio_list pending; 11508c2ecf20Sopenharmony_ci int pending_cnt; 11518c2ecf20Sopenharmony_ci}; 11528c2ecf20Sopenharmony_ci 11538c2ecf20Sopenharmony_cistatic void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) 11548c2ecf20Sopenharmony_ci{ 11558c2ecf20Sopenharmony_ci struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb, 11568c2ecf20Sopenharmony_ci cb); 11578c2ecf20Sopenharmony_ci struct mddev *mddev = plug->cb.data; 11588c2ecf20Sopenharmony_ci struct r1conf *conf = 
mddev->private; 11598c2ecf20Sopenharmony_ci struct bio *bio; 11608c2ecf20Sopenharmony_ci 11618c2ecf20Sopenharmony_ci if (from_schedule || current->bio_list) { 11628c2ecf20Sopenharmony_ci spin_lock_irq(&conf->device_lock); 11638c2ecf20Sopenharmony_ci bio_list_merge(&conf->pending_bio_list, &plug->pending); 11648c2ecf20Sopenharmony_ci conf->pending_count += plug->pending_cnt; 11658c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->device_lock); 11668c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 11678c2ecf20Sopenharmony_ci md_wakeup_thread(mddev->thread); 11688c2ecf20Sopenharmony_ci kfree(plug); 11698c2ecf20Sopenharmony_ci return; 11708c2ecf20Sopenharmony_ci } 11718c2ecf20Sopenharmony_ci 11728c2ecf20Sopenharmony_ci /* we aren't scheduling, so we can do the write-out directly. */ 11738c2ecf20Sopenharmony_ci bio = bio_list_get(&plug->pending); 11748c2ecf20Sopenharmony_ci flush_bio_list(conf, bio); 11758c2ecf20Sopenharmony_ci kfree(plug); 11768c2ecf20Sopenharmony_ci} 11778c2ecf20Sopenharmony_ci 11788c2ecf20Sopenharmony_cistatic void init_r1bio(struct r1bio *r1_bio, struct mddev *mddev, struct bio *bio) 11798c2ecf20Sopenharmony_ci{ 11808c2ecf20Sopenharmony_ci r1_bio->master_bio = bio; 11818c2ecf20Sopenharmony_ci r1_bio->sectors = bio_sectors(bio); 11828c2ecf20Sopenharmony_ci r1_bio->state = 0; 11838c2ecf20Sopenharmony_ci r1_bio->mddev = mddev; 11848c2ecf20Sopenharmony_ci r1_bio->sector = bio->bi_iter.bi_sector; 11858c2ecf20Sopenharmony_ci} 11868c2ecf20Sopenharmony_ci 11878c2ecf20Sopenharmony_cistatic inline struct r1bio * 11888c2ecf20Sopenharmony_cialloc_r1bio(struct mddev *mddev, struct bio *bio) 11898c2ecf20Sopenharmony_ci{ 11908c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 11918c2ecf20Sopenharmony_ci struct r1bio *r1_bio; 11928c2ecf20Sopenharmony_ci 11938c2ecf20Sopenharmony_ci r1_bio = mempool_alloc(&conf->r1bio_pool, GFP_NOIO); 11948c2ecf20Sopenharmony_ci /* Ensure no bio records IO_BLOCKED */ 11958c2ecf20Sopenharmony_ci memset(r1_bio->bios, 0, 
conf->raid_disks * sizeof(r1_bio->bios[0])); 11968c2ecf20Sopenharmony_ci init_r1bio(r1_bio, mddev, bio); 11978c2ecf20Sopenharmony_ci return r1_bio; 11988c2ecf20Sopenharmony_ci} 11998c2ecf20Sopenharmony_ci 12008c2ecf20Sopenharmony_cistatic void raid1_read_request(struct mddev *mddev, struct bio *bio, 12018c2ecf20Sopenharmony_ci int max_read_sectors, struct r1bio *r1_bio) 12028c2ecf20Sopenharmony_ci{ 12038c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 12048c2ecf20Sopenharmony_ci struct raid1_info *mirror; 12058c2ecf20Sopenharmony_ci struct bio *read_bio; 12068c2ecf20Sopenharmony_ci struct bitmap *bitmap = mddev->bitmap; 12078c2ecf20Sopenharmony_ci const int op = bio_op(bio); 12088c2ecf20Sopenharmony_ci const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); 12098c2ecf20Sopenharmony_ci int max_sectors; 12108c2ecf20Sopenharmony_ci int rdisk; 12118c2ecf20Sopenharmony_ci bool print_msg = !!r1_bio; 12128c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 12138c2ecf20Sopenharmony_ci 12148c2ecf20Sopenharmony_ci /* 12158c2ecf20Sopenharmony_ci * If r1_bio is set, we are blocking the raid1d thread 12168c2ecf20Sopenharmony_ci * so there is a tiny risk of deadlock. So ask for 12178c2ecf20Sopenharmony_ci * emergency memory if needed. 12188c2ecf20Sopenharmony_ci */ 12198c2ecf20Sopenharmony_ci gfp_t gfp = r1_bio ? 
(GFP_NOIO | __GFP_HIGH) : GFP_NOIO; 12208c2ecf20Sopenharmony_ci 12218c2ecf20Sopenharmony_ci if (print_msg) { 12228c2ecf20Sopenharmony_ci /* Need to get the block device name carefully */ 12238c2ecf20Sopenharmony_ci struct md_rdev *rdev; 12248c2ecf20Sopenharmony_ci rcu_read_lock(); 12258c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev); 12268c2ecf20Sopenharmony_ci if (rdev) 12278c2ecf20Sopenharmony_ci bdevname(rdev->bdev, b); 12288c2ecf20Sopenharmony_ci else 12298c2ecf20Sopenharmony_ci strcpy(b, "???"); 12308c2ecf20Sopenharmony_ci rcu_read_unlock(); 12318c2ecf20Sopenharmony_ci } 12328c2ecf20Sopenharmony_ci 12338c2ecf20Sopenharmony_ci /* 12348c2ecf20Sopenharmony_ci * Still need barrier for READ in case that whole 12358c2ecf20Sopenharmony_ci * array is frozen. 12368c2ecf20Sopenharmony_ci */ 12378c2ecf20Sopenharmony_ci wait_read_barrier(conf, bio->bi_iter.bi_sector); 12388c2ecf20Sopenharmony_ci 12398c2ecf20Sopenharmony_ci if (!r1_bio) 12408c2ecf20Sopenharmony_ci r1_bio = alloc_r1bio(mddev, bio); 12418c2ecf20Sopenharmony_ci else 12428c2ecf20Sopenharmony_ci init_r1bio(r1_bio, mddev, bio); 12438c2ecf20Sopenharmony_ci r1_bio->sectors = max_read_sectors; 12448c2ecf20Sopenharmony_ci 12458c2ecf20Sopenharmony_ci /* 12468c2ecf20Sopenharmony_ci * make_request() can abort the operation when read-ahead is being 12478c2ecf20Sopenharmony_ci * used and no empty request is available. 
12488c2ecf20Sopenharmony_ci */ 12498c2ecf20Sopenharmony_ci rdisk = read_balance(conf, r1_bio, &max_sectors); 12508c2ecf20Sopenharmony_ci 12518c2ecf20Sopenharmony_ci if (rdisk < 0) { 12528c2ecf20Sopenharmony_ci /* couldn't find anywhere to read from */ 12538c2ecf20Sopenharmony_ci if (print_msg) { 12548c2ecf20Sopenharmony_ci pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", 12558c2ecf20Sopenharmony_ci mdname(mddev), 12568c2ecf20Sopenharmony_ci b, 12578c2ecf20Sopenharmony_ci (unsigned long long)r1_bio->sector); 12588c2ecf20Sopenharmony_ci } 12598c2ecf20Sopenharmony_ci raid_end_bio_io(r1_bio); 12608c2ecf20Sopenharmony_ci return; 12618c2ecf20Sopenharmony_ci } 12628c2ecf20Sopenharmony_ci mirror = conf->mirrors + rdisk; 12638c2ecf20Sopenharmony_ci 12648c2ecf20Sopenharmony_ci if (print_msg) 12658c2ecf20Sopenharmony_ci pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", 12668c2ecf20Sopenharmony_ci mdname(mddev), 12678c2ecf20Sopenharmony_ci (unsigned long long)r1_bio->sector, 12688c2ecf20Sopenharmony_ci bdevname(mirror->rdev->bdev, b)); 12698c2ecf20Sopenharmony_ci 12708c2ecf20Sopenharmony_ci if (test_bit(WriteMostly, &mirror->rdev->flags) && 12718c2ecf20Sopenharmony_ci bitmap) { 12728c2ecf20Sopenharmony_ci /* 12738c2ecf20Sopenharmony_ci * Reading from a write-mostly device must take care not to 12748c2ecf20Sopenharmony_ci * over-take any writes that are 'behind' 12758c2ecf20Sopenharmony_ci */ 12768c2ecf20Sopenharmony_ci raid1_log(mddev, "wait behind writes"); 12778c2ecf20Sopenharmony_ci wait_event(bitmap->behind_wait, 12788c2ecf20Sopenharmony_ci atomic_read(&bitmap->behind_writes) == 0); 12798c2ecf20Sopenharmony_ci } 12808c2ecf20Sopenharmony_ci 12818c2ecf20Sopenharmony_ci if (max_sectors < bio_sectors(bio)) { 12828c2ecf20Sopenharmony_ci struct bio *split = bio_split(bio, max_sectors, 12838c2ecf20Sopenharmony_ci gfp, &conf->bio_split); 12848c2ecf20Sopenharmony_ci bio_chain(split, bio); 
12858c2ecf20Sopenharmony_ci submit_bio_noacct(bio); 12868c2ecf20Sopenharmony_ci bio = split; 12878c2ecf20Sopenharmony_ci r1_bio->master_bio = bio; 12888c2ecf20Sopenharmony_ci r1_bio->sectors = max_sectors; 12898c2ecf20Sopenharmony_ci } 12908c2ecf20Sopenharmony_ci 12918c2ecf20Sopenharmony_ci r1_bio->read_disk = rdisk; 12928c2ecf20Sopenharmony_ci 12938c2ecf20Sopenharmony_ci read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); 12948c2ecf20Sopenharmony_ci 12958c2ecf20Sopenharmony_ci r1_bio->bios[rdisk] = read_bio; 12968c2ecf20Sopenharmony_ci 12978c2ecf20Sopenharmony_ci read_bio->bi_iter.bi_sector = r1_bio->sector + 12988c2ecf20Sopenharmony_ci mirror->rdev->data_offset; 12998c2ecf20Sopenharmony_ci bio_set_dev(read_bio, mirror->rdev->bdev); 13008c2ecf20Sopenharmony_ci read_bio->bi_end_io = raid1_end_read_request; 13018c2ecf20Sopenharmony_ci bio_set_op_attrs(read_bio, op, do_sync); 13028c2ecf20Sopenharmony_ci if (test_bit(FailFast, &mirror->rdev->flags) && 13038c2ecf20Sopenharmony_ci test_bit(R1BIO_FailFast, &r1_bio->state)) 13048c2ecf20Sopenharmony_ci read_bio->bi_opf |= MD_FAILFAST; 13058c2ecf20Sopenharmony_ci read_bio->bi_private = r1_bio; 13068c2ecf20Sopenharmony_ci 13078c2ecf20Sopenharmony_ci if (mddev->gendisk) 13088c2ecf20Sopenharmony_ci trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, 13098c2ecf20Sopenharmony_ci disk_devt(mddev->gendisk), r1_bio->sector); 13108c2ecf20Sopenharmony_ci 13118c2ecf20Sopenharmony_ci submit_bio_noacct(read_bio); 13128c2ecf20Sopenharmony_ci} 13138c2ecf20Sopenharmony_ci 13148c2ecf20Sopenharmony_cistatic void raid1_write_request(struct mddev *mddev, struct bio *bio, 13158c2ecf20Sopenharmony_ci int max_write_sectors) 13168c2ecf20Sopenharmony_ci{ 13178c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 13188c2ecf20Sopenharmony_ci struct r1bio *r1_bio; 13198c2ecf20Sopenharmony_ci int i, disks; 13208c2ecf20Sopenharmony_ci struct bitmap *bitmap = mddev->bitmap; 13218c2ecf20Sopenharmony_ci unsigned long flags; 
13228c2ecf20Sopenharmony_ci struct md_rdev *blocked_rdev; 13238c2ecf20Sopenharmony_ci struct blk_plug_cb *cb; 13248c2ecf20Sopenharmony_ci struct raid1_plug_cb *plug = NULL; 13258c2ecf20Sopenharmony_ci int first_clone; 13268c2ecf20Sopenharmony_ci int max_sectors; 13278c2ecf20Sopenharmony_ci 13288c2ecf20Sopenharmony_ci if (mddev_is_clustered(mddev) && 13298c2ecf20Sopenharmony_ci md_cluster_ops->area_resyncing(mddev, WRITE, 13308c2ecf20Sopenharmony_ci bio->bi_iter.bi_sector, bio_end_sector(bio))) { 13318c2ecf20Sopenharmony_ci 13328c2ecf20Sopenharmony_ci DEFINE_WAIT(w); 13338c2ecf20Sopenharmony_ci for (;;) { 13348c2ecf20Sopenharmony_ci prepare_to_wait(&conf->wait_barrier, 13358c2ecf20Sopenharmony_ci &w, TASK_IDLE); 13368c2ecf20Sopenharmony_ci if (!md_cluster_ops->area_resyncing(mddev, WRITE, 13378c2ecf20Sopenharmony_ci bio->bi_iter.bi_sector, 13388c2ecf20Sopenharmony_ci bio_end_sector(bio))) 13398c2ecf20Sopenharmony_ci break; 13408c2ecf20Sopenharmony_ci schedule(); 13418c2ecf20Sopenharmony_ci } 13428c2ecf20Sopenharmony_ci finish_wait(&conf->wait_barrier, &w); 13438c2ecf20Sopenharmony_ci } 13448c2ecf20Sopenharmony_ci 13458c2ecf20Sopenharmony_ci /* 13468c2ecf20Sopenharmony_ci * Register the new request and wait if the reconstruction 13478c2ecf20Sopenharmony_ci * thread has put up a bar for new requests. 13488c2ecf20Sopenharmony_ci * Continue immediately if no resync is active currently. 
13498c2ecf20Sopenharmony_ci */ 13508c2ecf20Sopenharmony_ci wait_barrier(conf, bio->bi_iter.bi_sector); 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci r1_bio = alloc_r1bio(mddev, bio); 13538c2ecf20Sopenharmony_ci r1_bio->sectors = max_write_sectors; 13548c2ecf20Sopenharmony_ci 13558c2ecf20Sopenharmony_ci if (conf->pending_count >= max_queued_requests) { 13568c2ecf20Sopenharmony_ci md_wakeup_thread(mddev->thread); 13578c2ecf20Sopenharmony_ci raid1_log(mddev, "wait queued"); 13588c2ecf20Sopenharmony_ci wait_event(conf->wait_barrier, 13598c2ecf20Sopenharmony_ci conf->pending_count < max_queued_requests); 13608c2ecf20Sopenharmony_ci } 13618c2ecf20Sopenharmony_ci /* first select target devices under rcu_lock and 13628c2ecf20Sopenharmony_ci * inc refcount on their rdev. Record them by setting 13638c2ecf20Sopenharmony_ci * bios[x] to bio 13648c2ecf20Sopenharmony_ci * If there are known/acknowledged bad blocks on any device on 13658c2ecf20Sopenharmony_ci * which we have seen a write error, we want to avoid writing those 13668c2ecf20Sopenharmony_ci * blocks. 13678c2ecf20Sopenharmony_ci * This potentially requires several writes to write around 13688c2ecf20Sopenharmony_ci * the bad blocks. Each set of writes gets it's own r1bio 13698c2ecf20Sopenharmony_ci * with a set of bios attached. 
13708c2ecf20Sopenharmony_ci */ 13718c2ecf20Sopenharmony_ci 13728c2ecf20Sopenharmony_ci disks = conf->raid_disks * 2; 13738c2ecf20Sopenharmony_ci retry_write: 13748c2ecf20Sopenharmony_ci blocked_rdev = NULL; 13758c2ecf20Sopenharmony_ci rcu_read_lock(); 13768c2ecf20Sopenharmony_ci max_sectors = r1_bio->sectors; 13778c2ecf20Sopenharmony_ci for (i = 0; i < disks; i++) { 13788c2ecf20Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 13798c2ecf20Sopenharmony_ci if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 13808c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 13818c2ecf20Sopenharmony_ci blocked_rdev = rdev; 13828c2ecf20Sopenharmony_ci break; 13838c2ecf20Sopenharmony_ci } 13848c2ecf20Sopenharmony_ci r1_bio->bios[i] = NULL; 13858c2ecf20Sopenharmony_ci if (!rdev || test_bit(Faulty, &rdev->flags)) { 13868c2ecf20Sopenharmony_ci if (i < conf->raid_disks) 13878c2ecf20Sopenharmony_ci set_bit(R1BIO_Degraded, &r1_bio->state); 13888c2ecf20Sopenharmony_ci continue; 13898c2ecf20Sopenharmony_ci } 13908c2ecf20Sopenharmony_ci 13918c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 13928c2ecf20Sopenharmony_ci if (test_bit(WriteErrorSeen, &rdev->flags)) { 13938c2ecf20Sopenharmony_ci sector_t first_bad; 13948c2ecf20Sopenharmony_ci int bad_sectors; 13958c2ecf20Sopenharmony_ci int is_bad; 13968c2ecf20Sopenharmony_ci 13978c2ecf20Sopenharmony_ci is_bad = is_badblock(rdev, r1_bio->sector, max_sectors, 13988c2ecf20Sopenharmony_ci &first_bad, &bad_sectors); 13998c2ecf20Sopenharmony_ci if (is_bad < 0) { 14008c2ecf20Sopenharmony_ci /* mustn't write here until the bad block is 14018c2ecf20Sopenharmony_ci * acknowledged*/ 14028c2ecf20Sopenharmony_ci set_bit(BlockedBadBlocks, &rdev->flags); 14038c2ecf20Sopenharmony_ci blocked_rdev = rdev; 14048c2ecf20Sopenharmony_ci break; 14058c2ecf20Sopenharmony_ci } 14068c2ecf20Sopenharmony_ci if (is_bad && first_bad <= r1_bio->sector) { 14078c2ecf20Sopenharmony_ci /* Cannot write here at all */ 
14088c2ecf20Sopenharmony_ci bad_sectors -= (r1_bio->sector - first_bad); 14098c2ecf20Sopenharmony_ci if (bad_sectors < max_sectors) 14108c2ecf20Sopenharmony_ci /* mustn't write more than bad_sectors 14118c2ecf20Sopenharmony_ci * to other devices yet 14128c2ecf20Sopenharmony_ci */ 14138c2ecf20Sopenharmony_ci max_sectors = bad_sectors; 14148c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, mddev); 14158c2ecf20Sopenharmony_ci /* We don't set R1BIO_Degraded as that 14168c2ecf20Sopenharmony_ci * only applies if the disk is 14178c2ecf20Sopenharmony_ci * missing, so it might be re-added, 14188c2ecf20Sopenharmony_ci * and we want to know to recover this 14198c2ecf20Sopenharmony_ci * chunk. 14208c2ecf20Sopenharmony_ci * In this case the device is here, 14218c2ecf20Sopenharmony_ci * and the fact that this chunk is not 14228c2ecf20Sopenharmony_ci * in-sync is recorded in the bad 14238c2ecf20Sopenharmony_ci * block log 14248c2ecf20Sopenharmony_ci */ 14258c2ecf20Sopenharmony_ci continue; 14268c2ecf20Sopenharmony_ci } 14278c2ecf20Sopenharmony_ci if (is_bad) { 14288c2ecf20Sopenharmony_ci int good_sectors = first_bad - r1_bio->sector; 14298c2ecf20Sopenharmony_ci if (good_sectors < max_sectors) 14308c2ecf20Sopenharmony_ci max_sectors = good_sectors; 14318c2ecf20Sopenharmony_ci } 14328c2ecf20Sopenharmony_ci } 14338c2ecf20Sopenharmony_ci r1_bio->bios[i] = bio; 14348c2ecf20Sopenharmony_ci } 14358c2ecf20Sopenharmony_ci rcu_read_unlock(); 14368c2ecf20Sopenharmony_ci 14378c2ecf20Sopenharmony_ci if (unlikely(blocked_rdev)) { 14388c2ecf20Sopenharmony_ci /* Wait for this device to become unblocked */ 14398c2ecf20Sopenharmony_ci int j; 14408c2ecf20Sopenharmony_ci 14418c2ecf20Sopenharmony_ci for (j = 0; j < i; j++) 14428c2ecf20Sopenharmony_ci if (r1_bio->bios[j]) 14438c2ecf20Sopenharmony_ci rdev_dec_pending(conf->mirrors[j].rdev, mddev); 14448c2ecf20Sopenharmony_ci r1_bio->state = 0; 14458c2ecf20Sopenharmony_ci allow_barrier(conf, bio->bi_iter.bi_sector); 14468c2ecf20Sopenharmony_ci 
raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); 14478c2ecf20Sopenharmony_ci md_wait_for_blocked_rdev(blocked_rdev, mddev); 14488c2ecf20Sopenharmony_ci wait_barrier(conf, bio->bi_iter.bi_sector); 14498c2ecf20Sopenharmony_ci goto retry_write; 14508c2ecf20Sopenharmony_ci } 14518c2ecf20Sopenharmony_ci 14528c2ecf20Sopenharmony_ci if (max_sectors < bio_sectors(bio)) { 14538c2ecf20Sopenharmony_ci struct bio *split = bio_split(bio, max_sectors, 14548c2ecf20Sopenharmony_ci GFP_NOIO, &conf->bio_split); 14558c2ecf20Sopenharmony_ci bio_chain(split, bio); 14568c2ecf20Sopenharmony_ci submit_bio_noacct(bio); 14578c2ecf20Sopenharmony_ci bio = split; 14588c2ecf20Sopenharmony_ci r1_bio->master_bio = bio; 14598c2ecf20Sopenharmony_ci r1_bio->sectors = max_sectors; 14608c2ecf20Sopenharmony_ci } 14618c2ecf20Sopenharmony_ci 14628c2ecf20Sopenharmony_ci atomic_set(&r1_bio->remaining, 1); 14638c2ecf20Sopenharmony_ci atomic_set(&r1_bio->behind_remaining, 0); 14648c2ecf20Sopenharmony_ci 14658c2ecf20Sopenharmony_ci first_clone = 1; 14668c2ecf20Sopenharmony_ci 14678c2ecf20Sopenharmony_ci for (i = 0; i < disks; i++) { 14688c2ecf20Sopenharmony_ci struct bio *mbio = NULL; 14698c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[i].rdev; 14708c2ecf20Sopenharmony_ci if (!r1_bio->bios[i]) 14718c2ecf20Sopenharmony_ci continue; 14728c2ecf20Sopenharmony_ci 14738c2ecf20Sopenharmony_ci if (first_clone) { 14748c2ecf20Sopenharmony_ci /* do behind I/O ? 
14758c2ecf20Sopenharmony_ci * Not if there are too many, or cannot 14768c2ecf20Sopenharmony_ci * allocate memory, or a reader on WriteMostly 14778c2ecf20Sopenharmony_ci * is waiting for behind writes to flush */ 14788c2ecf20Sopenharmony_ci if (bitmap && 14798c2ecf20Sopenharmony_ci (atomic_read(&bitmap->behind_writes) 14808c2ecf20Sopenharmony_ci < mddev->bitmap_info.max_write_behind) && 14818c2ecf20Sopenharmony_ci !waitqueue_active(&bitmap->behind_wait)) { 14828c2ecf20Sopenharmony_ci alloc_behind_master_bio(r1_bio, bio); 14838c2ecf20Sopenharmony_ci } 14848c2ecf20Sopenharmony_ci 14858c2ecf20Sopenharmony_ci md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, 14868c2ecf20Sopenharmony_ci test_bit(R1BIO_BehindIO, &r1_bio->state)); 14878c2ecf20Sopenharmony_ci first_clone = 0; 14888c2ecf20Sopenharmony_ci } 14898c2ecf20Sopenharmony_ci 14908c2ecf20Sopenharmony_ci if (r1_bio->behind_master_bio) 14918c2ecf20Sopenharmony_ci mbio = bio_clone_fast(r1_bio->behind_master_bio, 14928c2ecf20Sopenharmony_ci GFP_NOIO, &mddev->bio_set); 14938c2ecf20Sopenharmony_ci else 14948c2ecf20Sopenharmony_ci mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); 14958c2ecf20Sopenharmony_ci 14968c2ecf20Sopenharmony_ci if (r1_bio->behind_master_bio) { 14978c2ecf20Sopenharmony_ci if (test_bit(CollisionCheck, &rdev->flags)) 14988c2ecf20Sopenharmony_ci wait_for_serialization(rdev, r1_bio); 14998c2ecf20Sopenharmony_ci if (test_bit(WriteMostly, &rdev->flags)) 15008c2ecf20Sopenharmony_ci atomic_inc(&r1_bio->behind_remaining); 15018c2ecf20Sopenharmony_ci } else if (mddev->serialize_policy) 15028c2ecf20Sopenharmony_ci wait_for_serialization(rdev, r1_bio); 15038c2ecf20Sopenharmony_ci 15048c2ecf20Sopenharmony_ci r1_bio->bios[i] = mbio; 15058c2ecf20Sopenharmony_ci 15068c2ecf20Sopenharmony_ci mbio->bi_iter.bi_sector = (r1_bio->sector + 15078c2ecf20Sopenharmony_ci conf->mirrors[i].rdev->data_offset); 15088c2ecf20Sopenharmony_ci bio_set_dev(mbio, conf->mirrors[i].rdev->bdev); 15098c2ecf20Sopenharmony_ci 
mbio->bi_end_io = raid1_end_write_request; 15108c2ecf20Sopenharmony_ci mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); 15118c2ecf20Sopenharmony_ci if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) && 15128c2ecf20Sopenharmony_ci !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) && 15138c2ecf20Sopenharmony_ci conf->raid_disks - mddev->degraded > 1) 15148c2ecf20Sopenharmony_ci mbio->bi_opf |= MD_FAILFAST; 15158c2ecf20Sopenharmony_ci mbio->bi_private = r1_bio; 15168c2ecf20Sopenharmony_ci 15178c2ecf20Sopenharmony_ci atomic_inc(&r1_bio->remaining); 15188c2ecf20Sopenharmony_ci 15198c2ecf20Sopenharmony_ci if (mddev->gendisk) 15208c2ecf20Sopenharmony_ci trace_block_bio_remap(mbio->bi_disk->queue, 15218c2ecf20Sopenharmony_ci mbio, disk_devt(mddev->gendisk), 15228c2ecf20Sopenharmony_ci r1_bio->sector); 15238c2ecf20Sopenharmony_ci /* flush_pending_writes() needs access to the rdev so...*/ 15248c2ecf20Sopenharmony_ci mbio->bi_disk = (void *)conf->mirrors[i].rdev; 15258c2ecf20Sopenharmony_ci 15268c2ecf20Sopenharmony_ci cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); 15278c2ecf20Sopenharmony_ci if (cb) 15288c2ecf20Sopenharmony_ci plug = container_of(cb, struct raid1_plug_cb, cb); 15298c2ecf20Sopenharmony_ci else 15308c2ecf20Sopenharmony_ci plug = NULL; 15318c2ecf20Sopenharmony_ci if (plug) { 15328c2ecf20Sopenharmony_ci bio_list_add(&plug->pending, mbio); 15338c2ecf20Sopenharmony_ci plug->pending_cnt++; 15348c2ecf20Sopenharmony_ci } else { 15358c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 15368c2ecf20Sopenharmony_ci bio_list_add(&conf->pending_bio_list, mbio); 15378c2ecf20Sopenharmony_ci conf->pending_count++; 15388c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 15398c2ecf20Sopenharmony_ci md_wakeup_thread(mddev->thread); 15408c2ecf20Sopenharmony_ci } 15418c2ecf20Sopenharmony_ci } 15428c2ecf20Sopenharmony_ci 15438c2ecf20Sopenharmony_ci r1_bio_write_done(r1_bio); 15448c2ecf20Sopenharmony_ci 
15458c2ecf20Sopenharmony_ci /* In case raid1d snuck in to freeze_array */ 15468c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 15478c2ecf20Sopenharmony_ci} 15488c2ecf20Sopenharmony_ci 15498c2ecf20Sopenharmony_cistatic bool raid1_make_request(struct mddev *mddev, struct bio *bio) 15508c2ecf20Sopenharmony_ci{ 15518c2ecf20Sopenharmony_ci sector_t sectors; 15528c2ecf20Sopenharmony_ci 15538c2ecf20Sopenharmony_ci if (unlikely(bio->bi_opf & REQ_PREFLUSH) 15548c2ecf20Sopenharmony_ci && md_flush_request(mddev, bio)) 15558c2ecf20Sopenharmony_ci return true; 15568c2ecf20Sopenharmony_ci 15578c2ecf20Sopenharmony_ci /* 15588c2ecf20Sopenharmony_ci * There is a limit to the maximum size, but 15598c2ecf20Sopenharmony_ci * the read/write handler might find a lower limit 15608c2ecf20Sopenharmony_ci * due to bad blocks. To avoid multiple splits, 15618c2ecf20Sopenharmony_ci * we pass the maximum number of sectors down 15628c2ecf20Sopenharmony_ci * and let the lower level perform the split. 15638c2ecf20Sopenharmony_ci */ 15648c2ecf20Sopenharmony_ci sectors = align_to_barrier_unit_end( 15658c2ecf20Sopenharmony_ci bio->bi_iter.bi_sector, bio_sectors(bio)); 15668c2ecf20Sopenharmony_ci 15678c2ecf20Sopenharmony_ci if (bio_data_dir(bio) == READ) 15688c2ecf20Sopenharmony_ci raid1_read_request(mddev, bio, sectors, NULL); 15698c2ecf20Sopenharmony_ci else { 15708c2ecf20Sopenharmony_ci if (!md_write_start(mddev,bio)) 15718c2ecf20Sopenharmony_ci return false; 15728c2ecf20Sopenharmony_ci raid1_write_request(mddev, bio, sectors); 15738c2ecf20Sopenharmony_ci } 15748c2ecf20Sopenharmony_ci return true; 15758c2ecf20Sopenharmony_ci} 15768c2ecf20Sopenharmony_ci 15778c2ecf20Sopenharmony_cistatic void raid1_status(struct seq_file *seq, struct mddev *mddev) 15788c2ecf20Sopenharmony_ci{ 15798c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 15808c2ecf20Sopenharmony_ci int i; 15818c2ecf20Sopenharmony_ci 15828c2ecf20Sopenharmony_ci seq_printf(seq, " [%d/%d] [", conf->raid_disks, 
15838c2ecf20Sopenharmony_ci conf->raid_disks - mddev->degraded); 15848c2ecf20Sopenharmony_ci rcu_read_lock(); 15858c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) { 15868c2ecf20Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 15878c2ecf20Sopenharmony_ci seq_printf(seq, "%s", 15888c2ecf20Sopenharmony_ci rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); 15898c2ecf20Sopenharmony_ci } 15908c2ecf20Sopenharmony_ci rcu_read_unlock(); 15918c2ecf20Sopenharmony_ci seq_printf(seq, "]"); 15928c2ecf20Sopenharmony_ci} 15938c2ecf20Sopenharmony_ci 15948c2ecf20Sopenharmony_cistatic void raid1_error(struct mddev *mddev, struct md_rdev *rdev) 15958c2ecf20Sopenharmony_ci{ 15968c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 15978c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 15988c2ecf20Sopenharmony_ci unsigned long flags; 15998c2ecf20Sopenharmony_ci 16008c2ecf20Sopenharmony_ci /* 16018c2ecf20Sopenharmony_ci * If it is not operational, then we have already marked it as dead 16028c2ecf20Sopenharmony_ci * else if it is the last working disks with "fail_last_dev == false", 16038c2ecf20Sopenharmony_ci * ignore the error, let the next level up know. 16048c2ecf20Sopenharmony_ci * else mark the drive as failed 16058c2ecf20Sopenharmony_ci */ 16068c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 16078c2ecf20Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev 16088c2ecf20Sopenharmony_ci && (conf->raid_disks - mddev->degraded) == 1) { 16098c2ecf20Sopenharmony_ci /* 16108c2ecf20Sopenharmony_ci * Don't fail the drive, act as though we were just a 16118c2ecf20Sopenharmony_ci * normal single drive. 16128c2ecf20Sopenharmony_ci * However don't try a recovery from this drive as 16138c2ecf20Sopenharmony_ci * it is very likely to fail. 
16148c2ecf20Sopenharmony_ci */ 16158c2ecf20Sopenharmony_ci conf->recovery_disabled = mddev->recovery_disabled; 16168c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 16178c2ecf20Sopenharmony_ci return; 16188c2ecf20Sopenharmony_ci } 16198c2ecf20Sopenharmony_ci set_bit(Blocked, &rdev->flags); 16208c2ecf20Sopenharmony_ci if (test_and_clear_bit(In_sync, &rdev->flags)) 16218c2ecf20Sopenharmony_ci mddev->degraded++; 16228c2ecf20Sopenharmony_ci set_bit(Faulty, &rdev->flags); 16238c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 16248c2ecf20Sopenharmony_ci /* 16258c2ecf20Sopenharmony_ci * if recovery is running, make sure it aborts. 16268c2ecf20Sopenharmony_ci */ 16278c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_INTR, &mddev->recovery); 16288c2ecf20Sopenharmony_ci set_mask_bits(&mddev->sb_flags, 0, 16298c2ecf20Sopenharmony_ci BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 16308c2ecf20Sopenharmony_ci pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n" 16318c2ecf20Sopenharmony_ci "md/raid1:%s: Operation continuing on %d devices.\n", 16328c2ecf20Sopenharmony_ci mdname(mddev), bdevname(rdev->bdev, b), 16338c2ecf20Sopenharmony_ci mdname(mddev), conf->raid_disks - mddev->degraded); 16348c2ecf20Sopenharmony_ci} 16358c2ecf20Sopenharmony_ci 16368c2ecf20Sopenharmony_cistatic void print_conf(struct r1conf *conf) 16378c2ecf20Sopenharmony_ci{ 16388c2ecf20Sopenharmony_ci int i; 16398c2ecf20Sopenharmony_ci 16408c2ecf20Sopenharmony_ci pr_debug("RAID1 conf printout:\n"); 16418c2ecf20Sopenharmony_ci if (!conf) { 16428c2ecf20Sopenharmony_ci pr_debug("(!conf)\n"); 16438c2ecf20Sopenharmony_ci return; 16448c2ecf20Sopenharmony_ci } 16458c2ecf20Sopenharmony_ci pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, 16468c2ecf20Sopenharmony_ci conf->raid_disks); 16478c2ecf20Sopenharmony_ci 16488c2ecf20Sopenharmony_ci rcu_read_lock(); 16498c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) { 
16508c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 16518c2ecf20Sopenharmony_ci struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 16528c2ecf20Sopenharmony_ci if (rdev) 16538c2ecf20Sopenharmony_ci pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n", 16548c2ecf20Sopenharmony_ci i, !test_bit(In_sync, &rdev->flags), 16558c2ecf20Sopenharmony_ci !test_bit(Faulty, &rdev->flags), 16568c2ecf20Sopenharmony_ci bdevname(rdev->bdev,b)); 16578c2ecf20Sopenharmony_ci } 16588c2ecf20Sopenharmony_ci rcu_read_unlock(); 16598c2ecf20Sopenharmony_ci} 16608c2ecf20Sopenharmony_ci 16618c2ecf20Sopenharmony_cistatic void close_sync(struct r1conf *conf) 16628c2ecf20Sopenharmony_ci{ 16638c2ecf20Sopenharmony_ci int idx; 16648c2ecf20Sopenharmony_ci 16658c2ecf20Sopenharmony_ci for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { 16668c2ecf20Sopenharmony_ci _wait_barrier(conf, idx); 16678c2ecf20Sopenharmony_ci _allow_barrier(conf, idx); 16688c2ecf20Sopenharmony_ci } 16698c2ecf20Sopenharmony_ci 16708c2ecf20Sopenharmony_ci mempool_exit(&conf->r1buf_pool); 16718c2ecf20Sopenharmony_ci} 16728c2ecf20Sopenharmony_ci 16738c2ecf20Sopenharmony_cistatic int raid1_spare_active(struct mddev *mddev) 16748c2ecf20Sopenharmony_ci{ 16758c2ecf20Sopenharmony_ci int i; 16768c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 16778c2ecf20Sopenharmony_ci int count = 0; 16788c2ecf20Sopenharmony_ci unsigned long flags; 16798c2ecf20Sopenharmony_ci 16808c2ecf20Sopenharmony_ci /* 16818c2ecf20Sopenharmony_ci * Find all failed disks within the RAID1 configuration 16828c2ecf20Sopenharmony_ci * and mark them readable. 16838c2ecf20Sopenharmony_ci * Called under mddev lock, so rcu protection not needed. 16848c2ecf20Sopenharmony_ci * device_lock used to avoid races with raid1_end_read_request 16858c2ecf20Sopenharmony_ci * which expects 'In_sync' flags and ->degraded to be consistent. 
16868c2ecf20Sopenharmony_ci */ 16878c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 16888c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) { 16898c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[i].rdev; 16908c2ecf20Sopenharmony_ci struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; 16918c2ecf20Sopenharmony_ci if (repl 16928c2ecf20Sopenharmony_ci && !test_bit(Candidate, &repl->flags) 16938c2ecf20Sopenharmony_ci && repl->recovery_offset == MaxSector 16948c2ecf20Sopenharmony_ci && !test_bit(Faulty, &repl->flags) 16958c2ecf20Sopenharmony_ci && !test_and_set_bit(In_sync, &repl->flags)) { 16968c2ecf20Sopenharmony_ci /* replacement has just become active */ 16978c2ecf20Sopenharmony_ci if (!rdev || 16988c2ecf20Sopenharmony_ci !test_and_clear_bit(In_sync, &rdev->flags)) 16998c2ecf20Sopenharmony_ci count++; 17008c2ecf20Sopenharmony_ci if (rdev) { 17018c2ecf20Sopenharmony_ci /* Replaced device not technically 17028c2ecf20Sopenharmony_ci * faulty, but we need to be sure 17038c2ecf20Sopenharmony_ci * it gets removed and never re-added 17048c2ecf20Sopenharmony_ci */ 17058c2ecf20Sopenharmony_ci set_bit(Faulty, &rdev->flags); 17068c2ecf20Sopenharmony_ci sysfs_notify_dirent_safe( 17078c2ecf20Sopenharmony_ci rdev->sysfs_state); 17088c2ecf20Sopenharmony_ci } 17098c2ecf20Sopenharmony_ci } 17108c2ecf20Sopenharmony_ci if (rdev 17118c2ecf20Sopenharmony_ci && rdev->recovery_offset == MaxSector 17128c2ecf20Sopenharmony_ci && !test_bit(Faulty, &rdev->flags) 17138c2ecf20Sopenharmony_ci && !test_and_set_bit(In_sync, &rdev->flags)) { 17148c2ecf20Sopenharmony_ci count++; 17158c2ecf20Sopenharmony_ci sysfs_notify_dirent_safe(rdev->sysfs_state); 17168c2ecf20Sopenharmony_ci } 17178c2ecf20Sopenharmony_ci } 17188c2ecf20Sopenharmony_ci mddev->degraded -= count; 17198c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 17208c2ecf20Sopenharmony_ci 17218c2ecf20Sopenharmony_ci print_conf(conf); 17228c2ecf20Sopenharmony_ci 
return count; 17238c2ecf20Sopenharmony_ci} 17248c2ecf20Sopenharmony_ci 17258c2ecf20Sopenharmony_cistatic int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) 17268c2ecf20Sopenharmony_ci{ 17278c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 17288c2ecf20Sopenharmony_ci int err = -EEXIST; 17298c2ecf20Sopenharmony_ci int mirror = 0; 17308c2ecf20Sopenharmony_ci struct raid1_info *p; 17318c2ecf20Sopenharmony_ci int first = 0; 17328c2ecf20Sopenharmony_ci int last = conf->raid_disks - 1; 17338c2ecf20Sopenharmony_ci 17348c2ecf20Sopenharmony_ci if (mddev->recovery_disabled == conf->recovery_disabled) 17358c2ecf20Sopenharmony_ci return -EBUSY; 17368c2ecf20Sopenharmony_ci 17378c2ecf20Sopenharmony_ci if (md_integrity_add_rdev(rdev, mddev)) 17388c2ecf20Sopenharmony_ci return -ENXIO; 17398c2ecf20Sopenharmony_ci 17408c2ecf20Sopenharmony_ci if (rdev->raid_disk >= 0) 17418c2ecf20Sopenharmony_ci first = last = rdev->raid_disk; 17428c2ecf20Sopenharmony_ci 17438c2ecf20Sopenharmony_ci /* 17448c2ecf20Sopenharmony_ci * find the disk ... but prefer rdev->saved_raid_disk 17458c2ecf20Sopenharmony_ci * if possible. 
17468c2ecf20Sopenharmony_ci */ 17478c2ecf20Sopenharmony_ci if (rdev->saved_raid_disk >= 0 && 17488c2ecf20Sopenharmony_ci rdev->saved_raid_disk >= first && 17498c2ecf20Sopenharmony_ci rdev->saved_raid_disk < conf->raid_disks && 17508c2ecf20Sopenharmony_ci conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 17518c2ecf20Sopenharmony_ci first = last = rdev->saved_raid_disk; 17528c2ecf20Sopenharmony_ci 17538c2ecf20Sopenharmony_ci for (mirror = first; mirror <= last; mirror++) { 17548c2ecf20Sopenharmony_ci p = conf->mirrors + mirror; 17558c2ecf20Sopenharmony_ci if (!p->rdev) { 17568c2ecf20Sopenharmony_ci if (mddev->gendisk) 17578c2ecf20Sopenharmony_ci disk_stack_limits(mddev->gendisk, rdev->bdev, 17588c2ecf20Sopenharmony_ci rdev->data_offset << 9); 17598c2ecf20Sopenharmony_ci 17608c2ecf20Sopenharmony_ci p->head_position = 0; 17618c2ecf20Sopenharmony_ci rdev->raid_disk = mirror; 17628c2ecf20Sopenharmony_ci err = 0; 17638c2ecf20Sopenharmony_ci /* As all devices are equivalent, we don't need a full recovery 17648c2ecf20Sopenharmony_ci * if this was recently any drive of the array 17658c2ecf20Sopenharmony_ci */ 17668c2ecf20Sopenharmony_ci if (rdev->saved_raid_disk < 0) 17678c2ecf20Sopenharmony_ci conf->fullsync = 1; 17688c2ecf20Sopenharmony_ci rcu_assign_pointer(p->rdev, rdev); 17698c2ecf20Sopenharmony_ci break; 17708c2ecf20Sopenharmony_ci } 17718c2ecf20Sopenharmony_ci if (test_bit(WantReplacement, &p->rdev->flags) && 17728c2ecf20Sopenharmony_ci p[conf->raid_disks].rdev == NULL) { 17738c2ecf20Sopenharmony_ci /* Add this device as a replacement */ 17748c2ecf20Sopenharmony_ci clear_bit(In_sync, &rdev->flags); 17758c2ecf20Sopenharmony_ci set_bit(Replacement, &rdev->flags); 17768c2ecf20Sopenharmony_ci rdev->raid_disk = mirror; 17778c2ecf20Sopenharmony_ci err = 0; 17788c2ecf20Sopenharmony_ci conf->fullsync = 1; 17798c2ecf20Sopenharmony_ci rcu_assign_pointer(p[conf->raid_disks].rdev, rdev); 17808c2ecf20Sopenharmony_ci break; 17818c2ecf20Sopenharmony_ci } 17828c2ecf20Sopenharmony_ci 
} 17838c2ecf20Sopenharmony_ci if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) 17848c2ecf20Sopenharmony_ci blk_queue_flag_set(QUEUE_FLAG_DISCARD, mddev->queue); 17858c2ecf20Sopenharmony_ci print_conf(conf); 17868c2ecf20Sopenharmony_ci return err; 17878c2ecf20Sopenharmony_ci} 17888c2ecf20Sopenharmony_ci 17898c2ecf20Sopenharmony_cistatic int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 17908c2ecf20Sopenharmony_ci{ 17918c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 17928c2ecf20Sopenharmony_ci int err = 0; 17938c2ecf20Sopenharmony_ci int number = rdev->raid_disk; 17948c2ecf20Sopenharmony_ci struct raid1_info *p = conf->mirrors + number; 17958c2ecf20Sopenharmony_ci 17968c2ecf20Sopenharmony_ci if (unlikely(number >= conf->raid_disks)) 17978c2ecf20Sopenharmony_ci goto abort; 17988c2ecf20Sopenharmony_ci 17998c2ecf20Sopenharmony_ci if (rdev != p->rdev) 18008c2ecf20Sopenharmony_ci p = conf->mirrors + conf->raid_disks + number; 18018c2ecf20Sopenharmony_ci 18028c2ecf20Sopenharmony_ci print_conf(conf); 18038c2ecf20Sopenharmony_ci if (rdev == p->rdev) { 18048c2ecf20Sopenharmony_ci if (test_bit(In_sync, &rdev->flags) || 18058c2ecf20Sopenharmony_ci atomic_read(&rdev->nr_pending)) { 18068c2ecf20Sopenharmony_ci err = -EBUSY; 18078c2ecf20Sopenharmony_ci goto abort; 18088c2ecf20Sopenharmony_ci } 18098c2ecf20Sopenharmony_ci /* Only remove non-faulty devices if recovery 18108c2ecf20Sopenharmony_ci * is not possible. 
18118c2ecf20Sopenharmony_ci */ 18128c2ecf20Sopenharmony_ci if (!test_bit(Faulty, &rdev->flags) && 18138c2ecf20Sopenharmony_ci mddev->recovery_disabled != conf->recovery_disabled && 18148c2ecf20Sopenharmony_ci mddev->degraded < conf->raid_disks) { 18158c2ecf20Sopenharmony_ci err = -EBUSY; 18168c2ecf20Sopenharmony_ci goto abort; 18178c2ecf20Sopenharmony_ci } 18188c2ecf20Sopenharmony_ci p->rdev = NULL; 18198c2ecf20Sopenharmony_ci if (!test_bit(RemoveSynchronized, &rdev->flags)) { 18208c2ecf20Sopenharmony_ci synchronize_rcu(); 18218c2ecf20Sopenharmony_ci if (atomic_read(&rdev->nr_pending)) { 18228c2ecf20Sopenharmony_ci /* lost the race, try later */ 18238c2ecf20Sopenharmony_ci err = -EBUSY; 18248c2ecf20Sopenharmony_ci p->rdev = rdev; 18258c2ecf20Sopenharmony_ci goto abort; 18268c2ecf20Sopenharmony_ci } 18278c2ecf20Sopenharmony_ci } 18288c2ecf20Sopenharmony_ci if (conf->mirrors[conf->raid_disks + number].rdev) { 18298c2ecf20Sopenharmony_ci /* We just removed a device that is being replaced. 18308c2ecf20Sopenharmony_ci * Move down the replacement. We drain all IO before 18318c2ecf20Sopenharmony_ci * doing this to avoid confusion. 18328c2ecf20Sopenharmony_ci */ 18338c2ecf20Sopenharmony_ci struct md_rdev *repl = 18348c2ecf20Sopenharmony_ci conf->mirrors[conf->raid_disks + number].rdev; 18358c2ecf20Sopenharmony_ci freeze_array(conf, 0); 18368c2ecf20Sopenharmony_ci if (atomic_read(&repl->nr_pending)) { 18378c2ecf20Sopenharmony_ci /* It means that some queued IO of retry_list 18388c2ecf20Sopenharmony_ci * hold repl. Thus, we cannot set replacement 18398c2ecf20Sopenharmony_ci * as NULL, avoiding rdev NULL pointer 18408c2ecf20Sopenharmony_ci * dereference in sync_request_write and 18418c2ecf20Sopenharmony_ci * handle_write_finished. 
18428c2ecf20Sopenharmony_ci */ 18438c2ecf20Sopenharmony_ci err = -EBUSY; 18448c2ecf20Sopenharmony_ci unfreeze_array(conf); 18458c2ecf20Sopenharmony_ci goto abort; 18468c2ecf20Sopenharmony_ci } 18478c2ecf20Sopenharmony_ci clear_bit(Replacement, &repl->flags); 18488c2ecf20Sopenharmony_ci p->rdev = repl; 18498c2ecf20Sopenharmony_ci conf->mirrors[conf->raid_disks + number].rdev = NULL; 18508c2ecf20Sopenharmony_ci unfreeze_array(conf); 18518c2ecf20Sopenharmony_ci } 18528c2ecf20Sopenharmony_ci 18538c2ecf20Sopenharmony_ci clear_bit(WantReplacement, &rdev->flags); 18548c2ecf20Sopenharmony_ci err = md_integrity_register(mddev); 18558c2ecf20Sopenharmony_ci } 18568c2ecf20Sopenharmony_ciabort: 18578c2ecf20Sopenharmony_ci 18588c2ecf20Sopenharmony_ci print_conf(conf); 18598c2ecf20Sopenharmony_ci return err; 18608c2ecf20Sopenharmony_ci} 18618c2ecf20Sopenharmony_ci 18628c2ecf20Sopenharmony_cistatic void end_sync_read(struct bio *bio) 18638c2ecf20Sopenharmony_ci{ 18648c2ecf20Sopenharmony_ci struct r1bio *r1_bio = get_resync_r1bio(bio); 18658c2ecf20Sopenharmony_ci 18668c2ecf20Sopenharmony_ci update_head_pos(r1_bio->read_disk, r1_bio); 18678c2ecf20Sopenharmony_ci 18688c2ecf20Sopenharmony_ci /* 18698c2ecf20Sopenharmony_ci * we have read a block, now it needs to be re-written, 18708c2ecf20Sopenharmony_ci * or re-read if the read failed. 
18718c2ecf20Sopenharmony_ci * We don't do much here, just schedule handling by raid1d 18728c2ecf20Sopenharmony_ci */ 18738c2ecf20Sopenharmony_ci if (!bio->bi_status) 18748c2ecf20Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 18758c2ecf20Sopenharmony_ci 18768c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&r1_bio->remaining)) 18778c2ecf20Sopenharmony_ci reschedule_retry(r1_bio); 18788c2ecf20Sopenharmony_ci} 18798c2ecf20Sopenharmony_ci 18808c2ecf20Sopenharmony_cistatic void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) 18818c2ecf20Sopenharmony_ci{ 18828c2ecf20Sopenharmony_ci sector_t sync_blocks = 0; 18838c2ecf20Sopenharmony_ci sector_t s = r1_bio->sector; 18848c2ecf20Sopenharmony_ci long sectors_to_go = r1_bio->sectors; 18858c2ecf20Sopenharmony_ci 18868c2ecf20Sopenharmony_ci /* make sure these bits don't get cleared. */ 18878c2ecf20Sopenharmony_ci do { 18888c2ecf20Sopenharmony_ci md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); 18898c2ecf20Sopenharmony_ci s += sync_blocks; 18908c2ecf20Sopenharmony_ci sectors_to_go -= sync_blocks; 18918c2ecf20Sopenharmony_ci } while (sectors_to_go > 0); 18928c2ecf20Sopenharmony_ci} 18938c2ecf20Sopenharmony_ci 18948c2ecf20Sopenharmony_cistatic void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) 18958c2ecf20Sopenharmony_ci{ 18968c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&r1_bio->remaining)) { 18978c2ecf20Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 18988c2ecf20Sopenharmony_ci int s = r1_bio->sectors; 18998c2ecf20Sopenharmony_ci 19008c2ecf20Sopenharmony_ci if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 19018c2ecf20Sopenharmony_ci test_bit(R1BIO_WriteError, &r1_bio->state)) 19028c2ecf20Sopenharmony_ci reschedule_retry(r1_bio); 19038c2ecf20Sopenharmony_ci else { 19048c2ecf20Sopenharmony_ci put_buf(r1_bio); 19058c2ecf20Sopenharmony_ci md_done_sync(mddev, s, uptodate); 19068c2ecf20Sopenharmony_ci } 19078c2ecf20Sopenharmony_ci } 19088c2ecf20Sopenharmony_ci} 19098c2ecf20Sopenharmony_ci 
19108c2ecf20Sopenharmony_cistatic void end_sync_write(struct bio *bio) 19118c2ecf20Sopenharmony_ci{ 19128c2ecf20Sopenharmony_ci int uptodate = !bio->bi_status; 19138c2ecf20Sopenharmony_ci struct r1bio *r1_bio = get_resync_r1bio(bio); 19148c2ecf20Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 19158c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 19168c2ecf20Sopenharmony_ci sector_t first_bad; 19178c2ecf20Sopenharmony_ci int bad_sectors; 19188c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; 19198c2ecf20Sopenharmony_ci 19208c2ecf20Sopenharmony_ci if (!uptodate) { 19218c2ecf20Sopenharmony_ci abort_sync_write(mddev, r1_bio); 19228c2ecf20Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 19238c2ecf20Sopenharmony_ci if (!test_and_set_bit(WantReplacement, &rdev->flags)) 19248c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, & 19258c2ecf20Sopenharmony_ci mddev->recovery); 19268c2ecf20Sopenharmony_ci set_bit(R1BIO_WriteError, &r1_bio->state); 19278c2ecf20Sopenharmony_ci } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, 19288c2ecf20Sopenharmony_ci &first_bad, &bad_sectors) && 19298c2ecf20Sopenharmony_ci !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, 19308c2ecf20Sopenharmony_ci r1_bio->sector, 19318c2ecf20Sopenharmony_ci r1_bio->sectors, 19328c2ecf20Sopenharmony_ci &first_bad, &bad_sectors) 19338c2ecf20Sopenharmony_ci ) 19348c2ecf20Sopenharmony_ci set_bit(R1BIO_MadeGood, &r1_bio->state); 19358c2ecf20Sopenharmony_ci 19368c2ecf20Sopenharmony_ci put_sync_write_buf(r1_bio, uptodate); 19378c2ecf20Sopenharmony_ci} 19388c2ecf20Sopenharmony_ci 19398c2ecf20Sopenharmony_cistatic int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, 19408c2ecf20Sopenharmony_ci int sectors, struct page *page, int rw) 19418c2ecf20Sopenharmony_ci{ 19428c2ecf20Sopenharmony_ci if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false)) 19438c2ecf20Sopenharmony_ci /* success */ 19448c2ecf20Sopenharmony_ci return 1; 
19458c2ecf20Sopenharmony_ci if (rw == WRITE) { 19468c2ecf20Sopenharmony_ci set_bit(WriteErrorSeen, &rdev->flags); 19478c2ecf20Sopenharmony_ci if (!test_and_set_bit(WantReplacement, 19488c2ecf20Sopenharmony_ci &rdev->flags)) 19498c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, & 19508c2ecf20Sopenharmony_ci rdev->mddev->recovery); 19518c2ecf20Sopenharmony_ci } 19528c2ecf20Sopenharmony_ci /* need to record an error - either for the block or the device */ 19538c2ecf20Sopenharmony_ci if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 19548c2ecf20Sopenharmony_ci md_error(rdev->mddev, rdev); 19558c2ecf20Sopenharmony_ci return 0; 19568c2ecf20Sopenharmony_ci} 19578c2ecf20Sopenharmony_ci 19588c2ecf20Sopenharmony_cistatic int fix_sync_read_error(struct r1bio *r1_bio) 19598c2ecf20Sopenharmony_ci{ 19608c2ecf20Sopenharmony_ci /* Try some synchronous reads of other devices to get 19618c2ecf20Sopenharmony_ci * good data, much like with normal read errors. Only 19628c2ecf20Sopenharmony_ci * read into the pages we already have so we don't 19638c2ecf20Sopenharmony_ci * need to re-issue the read request. 19648c2ecf20Sopenharmony_ci * We don't need to freeze the array, because being in an 19658c2ecf20Sopenharmony_ci * active sync request, there is no normal IO, and 19668c2ecf20Sopenharmony_ci * no overlapping syncs. 19678c2ecf20Sopenharmony_ci * We don't need to check is_badblock() again as we 19688c2ecf20Sopenharmony_ci * made sure that anything with a bad block in range 19698c2ecf20Sopenharmony_ci * will have bi_end_io clear. 
19708c2ecf20Sopenharmony_ci */ 19718c2ecf20Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 19728c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 19738c2ecf20Sopenharmony_ci struct bio *bio = r1_bio->bios[r1_bio->read_disk]; 19748c2ecf20Sopenharmony_ci struct page **pages = get_resync_pages(bio)->pages; 19758c2ecf20Sopenharmony_ci sector_t sect = r1_bio->sector; 19768c2ecf20Sopenharmony_ci int sectors = r1_bio->sectors; 19778c2ecf20Sopenharmony_ci int idx = 0; 19788c2ecf20Sopenharmony_ci struct md_rdev *rdev; 19798c2ecf20Sopenharmony_ci 19808c2ecf20Sopenharmony_ci rdev = conf->mirrors[r1_bio->read_disk].rdev; 19818c2ecf20Sopenharmony_ci if (test_bit(FailFast, &rdev->flags)) { 19828c2ecf20Sopenharmony_ci /* Don't try recovering from here - just fail it 19838c2ecf20Sopenharmony_ci * ... unless it is the last working device of course */ 19848c2ecf20Sopenharmony_ci md_error(mddev, rdev); 19858c2ecf20Sopenharmony_ci if (test_bit(Faulty, &rdev->flags)) 19868c2ecf20Sopenharmony_ci /* Don't try to read from here, but make sure 19878c2ecf20Sopenharmony_ci * put_buf does it's thing 19888c2ecf20Sopenharmony_ci */ 19898c2ecf20Sopenharmony_ci bio->bi_end_io = end_sync_write; 19908c2ecf20Sopenharmony_ci } 19918c2ecf20Sopenharmony_ci 19928c2ecf20Sopenharmony_ci while(sectors) { 19938c2ecf20Sopenharmony_ci int s = sectors; 19948c2ecf20Sopenharmony_ci int d = r1_bio->read_disk; 19958c2ecf20Sopenharmony_ci int success = 0; 19968c2ecf20Sopenharmony_ci int start; 19978c2ecf20Sopenharmony_ci 19988c2ecf20Sopenharmony_ci if (s > (PAGE_SIZE>>9)) 19998c2ecf20Sopenharmony_ci s = PAGE_SIZE >> 9; 20008c2ecf20Sopenharmony_ci do { 20018c2ecf20Sopenharmony_ci if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 20028c2ecf20Sopenharmony_ci /* No rcu protection needed here devices 20038c2ecf20Sopenharmony_ci * can only be removed when no resync is 20048c2ecf20Sopenharmony_ci * active, and resync is currently active 20058c2ecf20Sopenharmony_ci */ 20068c2ecf20Sopenharmony_ci rdev = 
conf->mirrors[d].rdev; 20078c2ecf20Sopenharmony_ci if (sync_page_io(rdev, sect, s<<9, 20088c2ecf20Sopenharmony_ci pages[idx], 20098c2ecf20Sopenharmony_ci REQ_OP_READ, 0, false)) { 20108c2ecf20Sopenharmony_ci success = 1; 20118c2ecf20Sopenharmony_ci break; 20128c2ecf20Sopenharmony_ci } 20138c2ecf20Sopenharmony_ci } 20148c2ecf20Sopenharmony_ci d++; 20158c2ecf20Sopenharmony_ci if (d == conf->raid_disks * 2) 20168c2ecf20Sopenharmony_ci d = 0; 20178c2ecf20Sopenharmony_ci } while (!success && d != r1_bio->read_disk); 20188c2ecf20Sopenharmony_ci 20198c2ecf20Sopenharmony_ci if (!success) { 20208c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 20218c2ecf20Sopenharmony_ci int abort = 0; 20228c2ecf20Sopenharmony_ci /* Cannot read from anywhere, this block is lost. 20238c2ecf20Sopenharmony_ci * Record a bad block on each device. If that doesn't 20248c2ecf20Sopenharmony_ci * work just disable and interrupt the recovery. 20258c2ecf20Sopenharmony_ci * Don't fail devices as that won't really help. 20268c2ecf20Sopenharmony_ci */ 20278c2ecf20Sopenharmony_ci pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", 20288c2ecf20Sopenharmony_ci mdname(mddev), bio_devname(bio, b), 20298c2ecf20Sopenharmony_ci (unsigned long long)r1_bio->sector); 20308c2ecf20Sopenharmony_ci for (d = 0; d < conf->raid_disks * 2; d++) { 20318c2ecf20Sopenharmony_ci rdev = conf->mirrors[d].rdev; 20328c2ecf20Sopenharmony_ci if (!rdev || test_bit(Faulty, &rdev->flags)) 20338c2ecf20Sopenharmony_ci continue; 20348c2ecf20Sopenharmony_ci if (!rdev_set_badblocks(rdev, sect, s, 0)) 20358c2ecf20Sopenharmony_ci abort = 1; 20368c2ecf20Sopenharmony_ci } 20378c2ecf20Sopenharmony_ci if (abort) { 20388c2ecf20Sopenharmony_ci conf->recovery_disabled = 20398c2ecf20Sopenharmony_ci mddev->recovery_disabled; 20408c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_INTR, &mddev->recovery); 20418c2ecf20Sopenharmony_ci md_done_sync(mddev, r1_bio->sectors, 0); 20428c2ecf20Sopenharmony_ci put_buf(r1_bio); 
20438c2ecf20Sopenharmony_ci return 0; 20448c2ecf20Sopenharmony_ci } 20458c2ecf20Sopenharmony_ci /* Try next page */ 20468c2ecf20Sopenharmony_ci sectors -= s; 20478c2ecf20Sopenharmony_ci sect += s; 20488c2ecf20Sopenharmony_ci idx++; 20498c2ecf20Sopenharmony_ci continue; 20508c2ecf20Sopenharmony_ci } 20518c2ecf20Sopenharmony_ci 20528c2ecf20Sopenharmony_ci start = d; 20538c2ecf20Sopenharmony_ci /* write it back and re-read */ 20548c2ecf20Sopenharmony_ci while (d != r1_bio->read_disk) { 20558c2ecf20Sopenharmony_ci if (d == 0) 20568c2ecf20Sopenharmony_ci d = conf->raid_disks * 2; 20578c2ecf20Sopenharmony_ci d--; 20588c2ecf20Sopenharmony_ci if (r1_bio->bios[d]->bi_end_io != end_sync_read) 20598c2ecf20Sopenharmony_ci continue; 20608c2ecf20Sopenharmony_ci rdev = conf->mirrors[d].rdev; 20618c2ecf20Sopenharmony_ci if (r1_sync_page_io(rdev, sect, s, 20628c2ecf20Sopenharmony_ci pages[idx], 20638c2ecf20Sopenharmony_ci WRITE) == 0) { 20648c2ecf20Sopenharmony_ci r1_bio->bios[d]->bi_end_io = NULL; 20658c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, mddev); 20668c2ecf20Sopenharmony_ci } 20678c2ecf20Sopenharmony_ci } 20688c2ecf20Sopenharmony_ci d = start; 20698c2ecf20Sopenharmony_ci while (d != r1_bio->read_disk) { 20708c2ecf20Sopenharmony_ci if (d == 0) 20718c2ecf20Sopenharmony_ci d = conf->raid_disks * 2; 20728c2ecf20Sopenharmony_ci d--; 20738c2ecf20Sopenharmony_ci if (r1_bio->bios[d]->bi_end_io != end_sync_read) 20748c2ecf20Sopenharmony_ci continue; 20758c2ecf20Sopenharmony_ci rdev = conf->mirrors[d].rdev; 20768c2ecf20Sopenharmony_ci if (r1_sync_page_io(rdev, sect, s, 20778c2ecf20Sopenharmony_ci pages[idx], 20788c2ecf20Sopenharmony_ci READ) != 0) 20798c2ecf20Sopenharmony_ci atomic_add(s, &rdev->corrected_errors); 20808c2ecf20Sopenharmony_ci } 20818c2ecf20Sopenharmony_ci sectors -= s; 20828c2ecf20Sopenharmony_ci sect += s; 20838c2ecf20Sopenharmony_ci idx ++; 20848c2ecf20Sopenharmony_ci } 20858c2ecf20Sopenharmony_ci set_bit(R1BIO_Uptodate, &r1_bio->state); 
20868c2ecf20Sopenharmony_ci bio->bi_status = 0; 20878c2ecf20Sopenharmony_ci return 1; 20888c2ecf20Sopenharmony_ci} 20898c2ecf20Sopenharmony_ci 20908c2ecf20Sopenharmony_cistatic void process_checks(struct r1bio *r1_bio) 20918c2ecf20Sopenharmony_ci{ 20928c2ecf20Sopenharmony_ci /* We have read all readable devices. If we haven't 20938c2ecf20Sopenharmony_ci * got the block, then there is no hope left. 20948c2ecf20Sopenharmony_ci * If we have, then we want to do a comparison 20958c2ecf20Sopenharmony_ci * and skip the write if everything is the same. 20968c2ecf20Sopenharmony_ci * If any blocks failed to read, then we need to 20978c2ecf20Sopenharmony_ci * attempt an over-write 20988c2ecf20Sopenharmony_ci */ 20998c2ecf20Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 21008c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 21018c2ecf20Sopenharmony_ci int primary; 21028c2ecf20Sopenharmony_ci int i; 21038c2ecf20Sopenharmony_ci int vcnt; 21048c2ecf20Sopenharmony_ci 21058c2ecf20Sopenharmony_ci /* Fix variable parts of all bios */ 21068c2ecf20Sopenharmony_ci vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9); 21078c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 21088c2ecf20Sopenharmony_ci blk_status_t status; 21098c2ecf20Sopenharmony_ci struct bio *b = r1_bio->bios[i]; 21108c2ecf20Sopenharmony_ci struct resync_pages *rp = get_resync_pages(b); 21118c2ecf20Sopenharmony_ci if (b->bi_end_io != end_sync_read) 21128c2ecf20Sopenharmony_ci continue; 21138c2ecf20Sopenharmony_ci /* fixup the bio for reuse, but preserve errno */ 21148c2ecf20Sopenharmony_ci status = b->bi_status; 21158c2ecf20Sopenharmony_ci bio_reset(b); 21168c2ecf20Sopenharmony_ci b->bi_status = status; 21178c2ecf20Sopenharmony_ci b->bi_iter.bi_sector = r1_bio->sector + 21188c2ecf20Sopenharmony_ci conf->mirrors[i].rdev->data_offset; 21198c2ecf20Sopenharmony_ci bio_set_dev(b, conf->mirrors[i].rdev->bdev); 21208c2ecf20Sopenharmony_ci b->bi_end_io = end_sync_read; 
21218c2ecf20Sopenharmony_ci rp->raid_bio = r1_bio; 21228c2ecf20Sopenharmony_ci b->bi_private = rp; 21238c2ecf20Sopenharmony_ci 21248c2ecf20Sopenharmony_ci /* initialize bvec table again */ 21258c2ecf20Sopenharmony_ci md_bio_reset_resync_pages(b, rp, r1_bio->sectors << 9); 21268c2ecf20Sopenharmony_ci } 21278c2ecf20Sopenharmony_ci for (primary = 0; primary < conf->raid_disks * 2; primary++) 21288c2ecf20Sopenharmony_ci if (r1_bio->bios[primary]->bi_end_io == end_sync_read && 21298c2ecf20Sopenharmony_ci !r1_bio->bios[primary]->bi_status) { 21308c2ecf20Sopenharmony_ci r1_bio->bios[primary]->bi_end_io = NULL; 21318c2ecf20Sopenharmony_ci rdev_dec_pending(conf->mirrors[primary].rdev, mddev); 21328c2ecf20Sopenharmony_ci break; 21338c2ecf20Sopenharmony_ci } 21348c2ecf20Sopenharmony_ci r1_bio->read_disk = primary; 21358c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 21368c2ecf20Sopenharmony_ci int j = 0; 21378c2ecf20Sopenharmony_ci struct bio *pbio = r1_bio->bios[primary]; 21388c2ecf20Sopenharmony_ci struct bio *sbio = r1_bio->bios[i]; 21398c2ecf20Sopenharmony_ci blk_status_t status = sbio->bi_status; 21408c2ecf20Sopenharmony_ci struct page **ppages = get_resync_pages(pbio)->pages; 21418c2ecf20Sopenharmony_ci struct page **spages = get_resync_pages(sbio)->pages; 21428c2ecf20Sopenharmony_ci struct bio_vec *bi; 21438c2ecf20Sopenharmony_ci int page_len[RESYNC_PAGES] = { 0 }; 21448c2ecf20Sopenharmony_ci struct bvec_iter_all iter_all; 21458c2ecf20Sopenharmony_ci 21468c2ecf20Sopenharmony_ci if (sbio->bi_end_io != end_sync_read) 21478c2ecf20Sopenharmony_ci continue; 21488c2ecf20Sopenharmony_ci /* Now we can 'fixup' the error value */ 21498c2ecf20Sopenharmony_ci sbio->bi_status = 0; 21508c2ecf20Sopenharmony_ci 21518c2ecf20Sopenharmony_ci bio_for_each_segment_all(bi, sbio, iter_all) 21528c2ecf20Sopenharmony_ci page_len[j++] = bi->bv_len; 21538c2ecf20Sopenharmony_ci 21548c2ecf20Sopenharmony_ci if (!status) { 21558c2ecf20Sopenharmony_ci for (j = vcnt; j-- ; ) { 
21568c2ecf20Sopenharmony_ci if (memcmp(page_address(ppages[j]), 21578c2ecf20Sopenharmony_ci page_address(spages[j]), 21588c2ecf20Sopenharmony_ci page_len[j])) 21598c2ecf20Sopenharmony_ci break; 21608c2ecf20Sopenharmony_ci } 21618c2ecf20Sopenharmony_ci } else 21628c2ecf20Sopenharmony_ci j = 0; 21638c2ecf20Sopenharmony_ci if (j >= 0) 21648c2ecf20Sopenharmony_ci atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); 21658c2ecf20Sopenharmony_ci if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) 21668c2ecf20Sopenharmony_ci && !status)) { 21678c2ecf20Sopenharmony_ci /* No need to write to this device. */ 21688c2ecf20Sopenharmony_ci sbio->bi_end_io = NULL; 21698c2ecf20Sopenharmony_ci rdev_dec_pending(conf->mirrors[i].rdev, mddev); 21708c2ecf20Sopenharmony_ci continue; 21718c2ecf20Sopenharmony_ci } 21728c2ecf20Sopenharmony_ci 21738c2ecf20Sopenharmony_ci bio_copy_data(sbio, pbio); 21748c2ecf20Sopenharmony_ci } 21758c2ecf20Sopenharmony_ci} 21768c2ecf20Sopenharmony_ci 21778c2ecf20Sopenharmony_cistatic void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) 21788c2ecf20Sopenharmony_ci{ 21798c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 21808c2ecf20Sopenharmony_ci int i; 21818c2ecf20Sopenharmony_ci int disks = conf->raid_disks * 2; 21828c2ecf20Sopenharmony_ci struct bio *wbio; 21838c2ecf20Sopenharmony_ci 21848c2ecf20Sopenharmony_ci if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) 21858c2ecf20Sopenharmony_ci /* ouch - failed to read all of that. 
*/ 21868c2ecf20Sopenharmony_ci if (!fix_sync_read_error(r1_bio)) 21878c2ecf20Sopenharmony_ci return; 21888c2ecf20Sopenharmony_ci 21898c2ecf20Sopenharmony_ci if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 21908c2ecf20Sopenharmony_ci process_checks(r1_bio); 21918c2ecf20Sopenharmony_ci 21928c2ecf20Sopenharmony_ci /* 21938c2ecf20Sopenharmony_ci * schedule writes 21948c2ecf20Sopenharmony_ci */ 21958c2ecf20Sopenharmony_ci atomic_set(&r1_bio->remaining, 1); 21968c2ecf20Sopenharmony_ci for (i = 0; i < disks ; i++) { 21978c2ecf20Sopenharmony_ci wbio = r1_bio->bios[i]; 21988c2ecf20Sopenharmony_ci if (wbio->bi_end_io == NULL || 21998c2ecf20Sopenharmony_ci (wbio->bi_end_io == end_sync_read && 22008c2ecf20Sopenharmony_ci (i == r1_bio->read_disk || 22018c2ecf20Sopenharmony_ci !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) 22028c2ecf20Sopenharmony_ci continue; 22038c2ecf20Sopenharmony_ci if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { 22048c2ecf20Sopenharmony_ci abort_sync_write(mddev, r1_bio); 22058c2ecf20Sopenharmony_ci continue; 22068c2ecf20Sopenharmony_ci } 22078c2ecf20Sopenharmony_ci 22088c2ecf20Sopenharmony_ci bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 22098c2ecf20Sopenharmony_ci if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) 22108c2ecf20Sopenharmony_ci wbio->bi_opf |= MD_FAILFAST; 22118c2ecf20Sopenharmony_ci 22128c2ecf20Sopenharmony_ci wbio->bi_end_io = end_sync_write; 22138c2ecf20Sopenharmony_ci atomic_inc(&r1_bio->remaining); 22148c2ecf20Sopenharmony_ci md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); 22158c2ecf20Sopenharmony_ci 22168c2ecf20Sopenharmony_ci submit_bio_noacct(wbio); 22178c2ecf20Sopenharmony_ci } 22188c2ecf20Sopenharmony_ci 22198c2ecf20Sopenharmony_ci put_sync_write_buf(r1_bio, 1); 22208c2ecf20Sopenharmony_ci} 22218c2ecf20Sopenharmony_ci 22228c2ecf20Sopenharmony_ci/* 22238c2ecf20Sopenharmony_ci * This is a kernel thread which: 22248c2ecf20Sopenharmony_ci * 22258c2ecf20Sopenharmony_ci * 1. 
Retries failed read operations on working mirrors. 22268c2ecf20Sopenharmony_ci * 2. Updates the raid superblock when problems encounter. 22278c2ecf20Sopenharmony_ci * 3. Performs writes following reads for array synchronising. 22288c2ecf20Sopenharmony_ci */ 22298c2ecf20Sopenharmony_ci 22308c2ecf20Sopenharmony_cistatic void fix_read_error(struct r1conf *conf, int read_disk, 22318c2ecf20Sopenharmony_ci sector_t sect, int sectors) 22328c2ecf20Sopenharmony_ci{ 22338c2ecf20Sopenharmony_ci struct mddev *mddev = conf->mddev; 22348c2ecf20Sopenharmony_ci while(sectors) { 22358c2ecf20Sopenharmony_ci int s = sectors; 22368c2ecf20Sopenharmony_ci int d = read_disk; 22378c2ecf20Sopenharmony_ci int success = 0; 22388c2ecf20Sopenharmony_ci int start; 22398c2ecf20Sopenharmony_ci struct md_rdev *rdev; 22408c2ecf20Sopenharmony_ci 22418c2ecf20Sopenharmony_ci if (s > (PAGE_SIZE>>9)) 22428c2ecf20Sopenharmony_ci s = PAGE_SIZE >> 9; 22438c2ecf20Sopenharmony_ci 22448c2ecf20Sopenharmony_ci do { 22458c2ecf20Sopenharmony_ci sector_t first_bad; 22468c2ecf20Sopenharmony_ci int bad_sectors; 22478c2ecf20Sopenharmony_ci 22488c2ecf20Sopenharmony_ci rcu_read_lock(); 22498c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 22508c2ecf20Sopenharmony_ci if (rdev && 22518c2ecf20Sopenharmony_ci (test_bit(In_sync, &rdev->flags) || 22528c2ecf20Sopenharmony_ci (!test_bit(Faulty, &rdev->flags) && 22538c2ecf20Sopenharmony_ci rdev->recovery_offset >= sect + s)) && 22548c2ecf20Sopenharmony_ci is_badblock(rdev, sect, s, 22558c2ecf20Sopenharmony_ci &first_bad, &bad_sectors) == 0) { 22568c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 22578c2ecf20Sopenharmony_ci rcu_read_unlock(); 22588c2ecf20Sopenharmony_ci if (sync_page_io(rdev, sect, s<<9, 22598c2ecf20Sopenharmony_ci conf->tmppage, REQ_OP_READ, 0, false)) 22608c2ecf20Sopenharmony_ci success = 1; 22618c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, mddev); 22628c2ecf20Sopenharmony_ci if (success) 22638c2ecf20Sopenharmony_ci break; 
22648c2ecf20Sopenharmony_ci } else 22658c2ecf20Sopenharmony_ci rcu_read_unlock(); 22668c2ecf20Sopenharmony_ci d++; 22678c2ecf20Sopenharmony_ci if (d == conf->raid_disks * 2) 22688c2ecf20Sopenharmony_ci d = 0; 22698c2ecf20Sopenharmony_ci } while (!success && d != read_disk); 22708c2ecf20Sopenharmony_ci 22718c2ecf20Sopenharmony_ci if (!success) { 22728c2ecf20Sopenharmony_ci /* Cannot read from anywhere - mark it bad */ 22738c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[read_disk].rdev; 22748c2ecf20Sopenharmony_ci if (!rdev_set_badblocks(rdev, sect, s, 0)) 22758c2ecf20Sopenharmony_ci md_error(mddev, rdev); 22768c2ecf20Sopenharmony_ci break; 22778c2ecf20Sopenharmony_ci } 22788c2ecf20Sopenharmony_ci /* write it back and re-read */ 22798c2ecf20Sopenharmony_ci start = d; 22808c2ecf20Sopenharmony_ci while (d != read_disk) { 22818c2ecf20Sopenharmony_ci if (d==0) 22828c2ecf20Sopenharmony_ci d = conf->raid_disks * 2; 22838c2ecf20Sopenharmony_ci d--; 22848c2ecf20Sopenharmony_ci rcu_read_lock(); 22858c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 22868c2ecf20Sopenharmony_ci if (rdev && 22878c2ecf20Sopenharmony_ci !test_bit(Faulty, &rdev->flags)) { 22888c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 22898c2ecf20Sopenharmony_ci rcu_read_unlock(); 22908c2ecf20Sopenharmony_ci r1_sync_page_io(rdev, sect, s, 22918c2ecf20Sopenharmony_ci conf->tmppage, WRITE); 22928c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, mddev); 22938c2ecf20Sopenharmony_ci } else 22948c2ecf20Sopenharmony_ci rcu_read_unlock(); 22958c2ecf20Sopenharmony_ci } 22968c2ecf20Sopenharmony_ci d = start; 22978c2ecf20Sopenharmony_ci while (d != read_disk) { 22988c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 22998c2ecf20Sopenharmony_ci if (d==0) 23008c2ecf20Sopenharmony_ci d = conf->raid_disks * 2; 23018c2ecf20Sopenharmony_ci d--; 23028c2ecf20Sopenharmony_ci rcu_read_lock(); 23038c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->mirrors[d].rdev); 23048c2ecf20Sopenharmony_ci if 
(rdev && 23058c2ecf20Sopenharmony_ci !test_bit(Faulty, &rdev->flags)) { 23068c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 23078c2ecf20Sopenharmony_ci rcu_read_unlock(); 23088c2ecf20Sopenharmony_ci if (r1_sync_page_io(rdev, sect, s, 23098c2ecf20Sopenharmony_ci conf->tmppage, READ)) { 23108c2ecf20Sopenharmony_ci atomic_add(s, &rdev->corrected_errors); 23118c2ecf20Sopenharmony_ci pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n", 23128c2ecf20Sopenharmony_ci mdname(mddev), s, 23138c2ecf20Sopenharmony_ci (unsigned long long)(sect + 23148c2ecf20Sopenharmony_ci rdev->data_offset), 23158c2ecf20Sopenharmony_ci bdevname(rdev->bdev, b)); 23168c2ecf20Sopenharmony_ci } 23178c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, mddev); 23188c2ecf20Sopenharmony_ci } else 23198c2ecf20Sopenharmony_ci rcu_read_unlock(); 23208c2ecf20Sopenharmony_ci } 23218c2ecf20Sopenharmony_ci sectors -= s; 23228c2ecf20Sopenharmony_ci sect += s; 23238c2ecf20Sopenharmony_ci } 23248c2ecf20Sopenharmony_ci} 23258c2ecf20Sopenharmony_ci 23268c2ecf20Sopenharmony_cistatic int narrow_write_error(struct r1bio *r1_bio, int i) 23278c2ecf20Sopenharmony_ci{ 23288c2ecf20Sopenharmony_ci struct mddev *mddev = r1_bio->mddev; 23298c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 23308c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[i].rdev; 23318c2ecf20Sopenharmony_ci 23328c2ecf20Sopenharmony_ci /* bio has the data to be written to device 'i' where 23338c2ecf20Sopenharmony_ci * we just recently had a write error. 23348c2ecf20Sopenharmony_ci * We repeatedly clone the bio and trim down to one block, 23358c2ecf20Sopenharmony_ci * then try the write. Where the write fails we record 23368c2ecf20Sopenharmony_ci * a bad block. 23378c2ecf20Sopenharmony_ci * It is conceivable that the bio doesn't exactly align with 23388c2ecf20Sopenharmony_ci * blocks. We must handle this somehow. 23398c2ecf20Sopenharmony_ci * 23408c2ecf20Sopenharmony_ci * We currently own a reference on the rdev. 
23418c2ecf20Sopenharmony_ci */ 23428c2ecf20Sopenharmony_ci 23438c2ecf20Sopenharmony_ci int block_sectors; 23448c2ecf20Sopenharmony_ci sector_t sector; 23458c2ecf20Sopenharmony_ci int sectors; 23468c2ecf20Sopenharmony_ci int sect_to_write = r1_bio->sectors; 23478c2ecf20Sopenharmony_ci int ok = 1; 23488c2ecf20Sopenharmony_ci 23498c2ecf20Sopenharmony_ci if (rdev->badblocks.shift < 0) 23508c2ecf20Sopenharmony_ci return 0; 23518c2ecf20Sopenharmony_ci 23528c2ecf20Sopenharmony_ci block_sectors = roundup(1 << rdev->badblocks.shift, 23538c2ecf20Sopenharmony_ci bdev_logical_block_size(rdev->bdev) >> 9); 23548c2ecf20Sopenharmony_ci sector = r1_bio->sector; 23558c2ecf20Sopenharmony_ci sectors = ((sector + block_sectors) 23568c2ecf20Sopenharmony_ci & ~(sector_t)(block_sectors - 1)) 23578c2ecf20Sopenharmony_ci - sector; 23588c2ecf20Sopenharmony_ci 23598c2ecf20Sopenharmony_ci while (sect_to_write) { 23608c2ecf20Sopenharmony_ci struct bio *wbio; 23618c2ecf20Sopenharmony_ci if (sectors > sect_to_write) 23628c2ecf20Sopenharmony_ci sectors = sect_to_write; 23638c2ecf20Sopenharmony_ci /* Write at 'sector' for 'sectors'*/ 23648c2ecf20Sopenharmony_ci 23658c2ecf20Sopenharmony_ci if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 23668c2ecf20Sopenharmony_ci wbio = bio_clone_fast(r1_bio->behind_master_bio, 23678c2ecf20Sopenharmony_ci GFP_NOIO, 23688c2ecf20Sopenharmony_ci &mddev->bio_set); 23698c2ecf20Sopenharmony_ci } else { 23708c2ecf20Sopenharmony_ci wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, 23718c2ecf20Sopenharmony_ci &mddev->bio_set); 23728c2ecf20Sopenharmony_ci } 23738c2ecf20Sopenharmony_ci 23748c2ecf20Sopenharmony_ci bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); 23758c2ecf20Sopenharmony_ci wbio->bi_iter.bi_sector = r1_bio->sector; 23768c2ecf20Sopenharmony_ci wbio->bi_iter.bi_size = r1_bio->sectors << 9; 23778c2ecf20Sopenharmony_ci 23788c2ecf20Sopenharmony_ci bio_trim(wbio, sector - r1_bio->sector, sectors); 23798c2ecf20Sopenharmony_ci wbio->bi_iter.bi_sector += rdev->data_offset; 
23808c2ecf20Sopenharmony_ci bio_set_dev(wbio, rdev->bdev); 23818c2ecf20Sopenharmony_ci 23828c2ecf20Sopenharmony_ci if (submit_bio_wait(wbio) < 0) 23838c2ecf20Sopenharmony_ci /* failure! */ 23848c2ecf20Sopenharmony_ci ok = rdev_set_badblocks(rdev, sector, 23858c2ecf20Sopenharmony_ci sectors, 0) 23868c2ecf20Sopenharmony_ci && ok; 23878c2ecf20Sopenharmony_ci 23888c2ecf20Sopenharmony_ci bio_put(wbio); 23898c2ecf20Sopenharmony_ci sect_to_write -= sectors; 23908c2ecf20Sopenharmony_ci sector += sectors; 23918c2ecf20Sopenharmony_ci sectors = block_sectors; 23928c2ecf20Sopenharmony_ci } 23938c2ecf20Sopenharmony_ci return ok; 23948c2ecf20Sopenharmony_ci} 23958c2ecf20Sopenharmony_ci 23968c2ecf20Sopenharmony_cistatic void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio) 23978c2ecf20Sopenharmony_ci{ 23988c2ecf20Sopenharmony_ci int m; 23998c2ecf20Sopenharmony_ci int s = r1_bio->sectors; 24008c2ecf20Sopenharmony_ci for (m = 0; m < conf->raid_disks * 2 ; m++) { 24018c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[m].rdev; 24028c2ecf20Sopenharmony_ci struct bio *bio = r1_bio->bios[m]; 24038c2ecf20Sopenharmony_ci if (bio->bi_end_io == NULL) 24048c2ecf20Sopenharmony_ci continue; 24058c2ecf20Sopenharmony_ci if (!bio->bi_status && 24068c2ecf20Sopenharmony_ci test_bit(R1BIO_MadeGood, &r1_bio->state)) { 24078c2ecf20Sopenharmony_ci rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); 24088c2ecf20Sopenharmony_ci } 24098c2ecf20Sopenharmony_ci if (bio->bi_status && 24108c2ecf20Sopenharmony_ci test_bit(R1BIO_WriteError, &r1_bio->state)) { 24118c2ecf20Sopenharmony_ci if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) 24128c2ecf20Sopenharmony_ci md_error(conf->mddev, rdev); 24138c2ecf20Sopenharmony_ci } 24148c2ecf20Sopenharmony_ci } 24158c2ecf20Sopenharmony_ci put_buf(r1_bio); 24168c2ecf20Sopenharmony_ci md_done_sync(conf->mddev, s, 1); 24178c2ecf20Sopenharmony_ci} 24188c2ecf20Sopenharmony_ci 24198c2ecf20Sopenharmony_cistatic void handle_write_finished(struct 
r1conf *conf, struct r1bio *r1_bio) 24208c2ecf20Sopenharmony_ci{ 24218c2ecf20Sopenharmony_ci int m, idx; 24228c2ecf20Sopenharmony_ci bool fail = false; 24238c2ecf20Sopenharmony_ci 24248c2ecf20Sopenharmony_ci for (m = 0; m < conf->raid_disks * 2 ; m++) 24258c2ecf20Sopenharmony_ci if (r1_bio->bios[m] == IO_MADE_GOOD) { 24268c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[m].rdev; 24278c2ecf20Sopenharmony_ci rdev_clear_badblocks(rdev, 24288c2ecf20Sopenharmony_ci r1_bio->sector, 24298c2ecf20Sopenharmony_ci r1_bio->sectors, 0); 24308c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 24318c2ecf20Sopenharmony_ci } else if (r1_bio->bios[m] != NULL) { 24328c2ecf20Sopenharmony_ci /* This drive got a write error. We need to 24338c2ecf20Sopenharmony_ci * narrow down and record precise write 24348c2ecf20Sopenharmony_ci * errors. 24358c2ecf20Sopenharmony_ci */ 24368c2ecf20Sopenharmony_ci fail = true; 24378c2ecf20Sopenharmony_ci if (!narrow_write_error(r1_bio, m)) { 24388c2ecf20Sopenharmony_ci md_error(conf->mddev, 24398c2ecf20Sopenharmony_ci conf->mirrors[m].rdev); 24408c2ecf20Sopenharmony_ci /* an I/O failed, we can't clear the bitmap */ 24418c2ecf20Sopenharmony_ci set_bit(R1BIO_Degraded, &r1_bio->state); 24428c2ecf20Sopenharmony_ci } 24438c2ecf20Sopenharmony_ci rdev_dec_pending(conf->mirrors[m].rdev, 24448c2ecf20Sopenharmony_ci conf->mddev); 24458c2ecf20Sopenharmony_ci } 24468c2ecf20Sopenharmony_ci if (fail) { 24478c2ecf20Sopenharmony_ci spin_lock_irq(&conf->device_lock); 24488c2ecf20Sopenharmony_ci list_add(&r1_bio->retry_list, &conf->bio_end_io_list); 24498c2ecf20Sopenharmony_ci idx = sector_to_idx(r1_bio->sector); 24508c2ecf20Sopenharmony_ci atomic_inc(&conf->nr_queued[idx]); 24518c2ecf20Sopenharmony_ci spin_unlock_irq(&conf->device_lock); 24528c2ecf20Sopenharmony_ci /* 24538c2ecf20Sopenharmony_ci * In case freeze_array() is waiting for condition 24548c2ecf20Sopenharmony_ci * get_unqueued_pending() == extra to be true. 
24558c2ecf20Sopenharmony_ci */ 24568c2ecf20Sopenharmony_ci wake_up(&conf->wait_barrier); 24578c2ecf20Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 24588c2ecf20Sopenharmony_ci } else { 24598c2ecf20Sopenharmony_ci if (test_bit(R1BIO_WriteError, &r1_bio->state)) 24608c2ecf20Sopenharmony_ci close_write(r1_bio); 24618c2ecf20Sopenharmony_ci raid_end_bio_io(r1_bio); 24628c2ecf20Sopenharmony_ci } 24638c2ecf20Sopenharmony_ci} 24648c2ecf20Sopenharmony_ci 24658c2ecf20Sopenharmony_cistatic void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) 24668c2ecf20Sopenharmony_ci{ 24678c2ecf20Sopenharmony_ci struct mddev *mddev = conf->mddev; 24688c2ecf20Sopenharmony_ci struct bio *bio; 24698c2ecf20Sopenharmony_ci struct md_rdev *rdev; 24708c2ecf20Sopenharmony_ci 24718c2ecf20Sopenharmony_ci clear_bit(R1BIO_ReadError, &r1_bio->state); 24728c2ecf20Sopenharmony_ci /* we got a read error. Maybe the drive is bad. Maybe just 24738c2ecf20Sopenharmony_ci * the block and we can fix it. 24748c2ecf20Sopenharmony_ci * We freeze all other IO, and try reading the block from 24758c2ecf20Sopenharmony_ci * other devices. When we find one, we re-write 24768c2ecf20Sopenharmony_ci * and check it that fixes the read error. 
24778c2ecf20Sopenharmony_ci * This is all done synchronously while the array is 24788c2ecf20Sopenharmony_ci * frozen 24798c2ecf20Sopenharmony_ci */ 24808c2ecf20Sopenharmony_ci 24818c2ecf20Sopenharmony_ci bio = r1_bio->bios[r1_bio->read_disk]; 24828c2ecf20Sopenharmony_ci bio_put(bio); 24838c2ecf20Sopenharmony_ci r1_bio->bios[r1_bio->read_disk] = NULL; 24848c2ecf20Sopenharmony_ci 24858c2ecf20Sopenharmony_ci rdev = conf->mirrors[r1_bio->read_disk].rdev; 24868c2ecf20Sopenharmony_ci if (mddev->ro == 0 24878c2ecf20Sopenharmony_ci && !test_bit(FailFast, &rdev->flags)) { 24888c2ecf20Sopenharmony_ci freeze_array(conf, 1); 24898c2ecf20Sopenharmony_ci fix_read_error(conf, r1_bio->read_disk, 24908c2ecf20Sopenharmony_ci r1_bio->sector, r1_bio->sectors); 24918c2ecf20Sopenharmony_ci unfreeze_array(conf); 24928c2ecf20Sopenharmony_ci } else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) { 24938c2ecf20Sopenharmony_ci md_error(mddev, rdev); 24948c2ecf20Sopenharmony_ci } else { 24958c2ecf20Sopenharmony_ci r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED; 24968c2ecf20Sopenharmony_ci } 24978c2ecf20Sopenharmony_ci 24988c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, conf->mddev); 24998c2ecf20Sopenharmony_ci allow_barrier(conf, r1_bio->sector); 25008c2ecf20Sopenharmony_ci bio = r1_bio->master_bio; 25018c2ecf20Sopenharmony_ci 25028c2ecf20Sopenharmony_ci /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */ 25038c2ecf20Sopenharmony_ci r1_bio->state = 0; 25048c2ecf20Sopenharmony_ci raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio); 25058c2ecf20Sopenharmony_ci} 25068c2ecf20Sopenharmony_ci 25078c2ecf20Sopenharmony_cistatic void raid1d(struct md_thread *thread) 25088c2ecf20Sopenharmony_ci{ 25098c2ecf20Sopenharmony_ci struct mddev *mddev = thread->mddev; 25108c2ecf20Sopenharmony_ci struct r1bio *r1_bio; 25118c2ecf20Sopenharmony_ci unsigned long flags; 25128c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 25138c2ecf20Sopenharmony_ci struct list_head *head = 
&conf->retry_list; 25148c2ecf20Sopenharmony_ci struct blk_plug plug; 25158c2ecf20Sopenharmony_ci int idx; 25168c2ecf20Sopenharmony_ci 25178c2ecf20Sopenharmony_ci md_check_recovery(mddev); 25188c2ecf20Sopenharmony_ci 25198c2ecf20Sopenharmony_ci if (!list_empty_careful(&conf->bio_end_io_list) && 25208c2ecf20Sopenharmony_ci !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 25218c2ecf20Sopenharmony_ci LIST_HEAD(tmp); 25228c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 25238c2ecf20Sopenharmony_ci if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 25248c2ecf20Sopenharmony_ci list_splice_init(&conf->bio_end_io_list, &tmp); 25258c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 25268c2ecf20Sopenharmony_ci while (!list_empty(&tmp)) { 25278c2ecf20Sopenharmony_ci r1_bio = list_first_entry(&tmp, struct r1bio, 25288c2ecf20Sopenharmony_ci retry_list); 25298c2ecf20Sopenharmony_ci list_del(&r1_bio->retry_list); 25308c2ecf20Sopenharmony_ci idx = sector_to_idx(r1_bio->sector); 25318c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_queued[idx]); 25328c2ecf20Sopenharmony_ci if (mddev->degraded) 25338c2ecf20Sopenharmony_ci set_bit(R1BIO_Degraded, &r1_bio->state); 25348c2ecf20Sopenharmony_ci if (test_bit(R1BIO_WriteError, &r1_bio->state)) 25358c2ecf20Sopenharmony_ci close_write(r1_bio); 25368c2ecf20Sopenharmony_ci raid_end_bio_io(r1_bio); 25378c2ecf20Sopenharmony_ci } 25388c2ecf20Sopenharmony_ci } 25398c2ecf20Sopenharmony_ci 25408c2ecf20Sopenharmony_ci blk_start_plug(&plug); 25418c2ecf20Sopenharmony_ci for (;;) { 25428c2ecf20Sopenharmony_ci 25438c2ecf20Sopenharmony_ci flush_pending_writes(conf); 25448c2ecf20Sopenharmony_ci 25458c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 25468c2ecf20Sopenharmony_ci if (list_empty(head)) { 25478c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 25488c2ecf20Sopenharmony_ci break; 25498c2ecf20Sopenharmony_ci } 25508c2ecf20Sopenharmony_ci r1_bio = 
list_entry(head->prev, struct r1bio, retry_list); 25518c2ecf20Sopenharmony_ci list_del(head->prev); 25528c2ecf20Sopenharmony_ci idx = sector_to_idx(r1_bio->sector); 25538c2ecf20Sopenharmony_ci atomic_dec(&conf->nr_queued[idx]); 25548c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 25558c2ecf20Sopenharmony_ci 25568c2ecf20Sopenharmony_ci mddev = r1_bio->mddev; 25578c2ecf20Sopenharmony_ci conf = mddev->private; 25588c2ecf20Sopenharmony_ci if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 25598c2ecf20Sopenharmony_ci if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 25608c2ecf20Sopenharmony_ci test_bit(R1BIO_WriteError, &r1_bio->state)) 25618c2ecf20Sopenharmony_ci handle_sync_write_finished(conf, r1_bio); 25628c2ecf20Sopenharmony_ci else 25638c2ecf20Sopenharmony_ci sync_request_write(mddev, r1_bio); 25648c2ecf20Sopenharmony_ci } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) || 25658c2ecf20Sopenharmony_ci test_bit(R1BIO_WriteError, &r1_bio->state)) 25668c2ecf20Sopenharmony_ci handle_write_finished(conf, r1_bio); 25678c2ecf20Sopenharmony_ci else if (test_bit(R1BIO_ReadError, &r1_bio->state)) 25688c2ecf20Sopenharmony_ci handle_read_error(conf, r1_bio); 25698c2ecf20Sopenharmony_ci else 25708c2ecf20Sopenharmony_ci WARN_ON_ONCE(1); 25718c2ecf20Sopenharmony_ci 25728c2ecf20Sopenharmony_ci cond_resched(); 25738c2ecf20Sopenharmony_ci if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING)) 25748c2ecf20Sopenharmony_ci md_check_recovery(mddev); 25758c2ecf20Sopenharmony_ci } 25768c2ecf20Sopenharmony_ci blk_finish_plug(&plug); 25778c2ecf20Sopenharmony_ci} 25788c2ecf20Sopenharmony_ci 25798c2ecf20Sopenharmony_cistatic int init_resync(struct r1conf *conf) 25808c2ecf20Sopenharmony_ci{ 25818c2ecf20Sopenharmony_ci int buffs; 25828c2ecf20Sopenharmony_ci 25838c2ecf20Sopenharmony_ci buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 25848c2ecf20Sopenharmony_ci BUG_ON(mempool_initialized(&conf->r1buf_pool)); 25858c2ecf20Sopenharmony_ci 25868c2ecf20Sopenharmony_ci return 
mempool_init(&conf->r1buf_pool, buffs, r1buf_pool_alloc, 25878c2ecf20Sopenharmony_ci r1buf_pool_free, conf->poolinfo); 25888c2ecf20Sopenharmony_ci} 25898c2ecf20Sopenharmony_ci 25908c2ecf20Sopenharmony_cistatic struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf) 25918c2ecf20Sopenharmony_ci{ 25928c2ecf20Sopenharmony_ci struct r1bio *r1bio = mempool_alloc(&conf->r1buf_pool, GFP_NOIO); 25938c2ecf20Sopenharmony_ci struct resync_pages *rps; 25948c2ecf20Sopenharmony_ci struct bio *bio; 25958c2ecf20Sopenharmony_ci int i; 25968c2ecf20Sopenharmony_ci 25978c2ecf20Sopenharmony_ci for (i = conf->poolinfo->raid_disks; i--; ) { 25988c2ecf20Sopenharmony_ci bio = r1bio->bios[i]; 25998c2ecf20Sopenharmony_ci rps = bio->bi_private; 26008c2ecf20Sopenharmony_ci bio_reset(bio); 26018c2ecf20Sopenharmony_ci bio->bi_private = rps; 26028c2ecf20Sopenharmony_ci } 26038c2ecf20Sopenharmony_ci r1bio->master_bio = NULL; 26048c2ecf20Sopenharmony_ci return r1bio; 26058c2ecf20Sopenharmony_ci} 26068c2ecf20Sopenharmony_ci 26078c2ecf20Sopenharmony_ci/* 26088c2ecf20Sopenharmony_ci * perform a "sync" on one "block" 26098c2ecf20Sopenharmony_ci * 26108c2ecf20Sopenharmony_ci * We need to make sure that no normal I/O request - particularly write 26118c2ecf20Sopenharmony_ci * requests - conflict with active sync requests. 26128c2ecf20Sopenharmony_ci * 26138c2ecf20Sopenharmony_ci * This is achieved by tracking pending requests and a 'barrier' concept 26148c2ecf20Sopenharmony_ci * that can be installed to exclude normal IO requests. 
/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * @mddev:     array being synchronised
 * @sector_nr: first sector to handle in this call
 * @skipped:   set to 1 on the paths that decide a range needs no I/O
 *
 * Outline of what the code below does:
 *  - creates the resync buffer pool on first use;
 *  - when @sector_nr is at or past the end of the device, performs the
 *    end-of-sync bookkeeping and returns 0;
 *  - returns early (with *skipped = 1) when the bitmap / recovery state
 *    says the range does not need syncing;
 *  - raises the barrier, takes a resync r1bio and classifies every mirror
 *    slot as a read source, a write target, or not taking part;
 *  - attaches up to RESYNC_PAGES pages to every participating bio and
 *    submits the read(s).
 *
 * Returns a sector count: the number of sectors skipped or submitted by
 * this call, or 0 on the bail-out paths (pool setup failure, end of
 * device, raise_barrier() reporting non-zero, bad blocks that could not
 * be recorded).
 */

static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
				   int *skipped)
{
	struct r1conf *conf = mddev->private;
	struct r1bio *r1_bio;
	struct bio *bio;
	sector_t max_sector, nr_sectors;
	int disk = -1;		/* first readable slot that is not WriteMostly */
	int i;
	int wonly = -1;		/* first readable slot that is WriteMostly */
	int write_targets = 0, read_targets = 0;
	sector_t sync_blocks;
	int still_degraded = 0;
	int good_sectors = RESYNC_SECTORS;
	int min_bad = 0; /* number of sectors that are bad in all devices */
	int idx = sector_to_idx(sector_nr);
	int page_idx = 0;

	/* The resync buffer pool is created lazily, on the first call. */
	if (!mempool_initialized(&conf->r1buf_pool))
		if (init_resync(conf))
			return 0;

	max_sector = mddev->dev_sectors;
	if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chunk (there will
		 * only be one in raid1 resync).
		 * We can find the current address in mddev->curr_resync
		 */
		if (mddev->curr_resync < max_sector) /* aborted */
			md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
					   &sync_blocks, 1);
		else /* completed sync */
			conf->fullsync = 0;

		md_bitmap_close_sync(mddev->bitmap);
		close_sync(conf);

		if (mddev_is_clustered(mddev)) {
			conf->cluster_sync_low = 0;
			conf->cluster_sync_high = 0;
		}
		return 0;
	}

	/*
	 * No bitmap, recovery_cp at MaxSector, no user-requested sync and
	 * no full sync needed: skip everything up to the end of the device.
	 */
	if (mddev->bitmap == NULL &&
	    mddev->recovery_cp == MaxSector &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    conf->fullsync == 0) {
		*skipped = 1;
		return max_sector - sector_nr;
	}
	/* before building a request, check if we can skip these blocks..
	 * This call to md_bitmap_start_sync() doesn't actually record anything
	 */
	if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
	    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		/* We can skip this block, and probably several more */
		*skipped = 1;
		return sync_blocks;
	}

	/*
	 * If there is non-resync activity waiting for a turn, then let it
	 * through before starting on this new sync request.
	 */
	if (atomic_read(&conf->nr_waiting[idx]))
		schedule_timeout_uninterruptible(1);

	/* we are incrementing sector_nr below. To be safe, we check against
	 * sector_nr + two times RESYNC_SECTORS
	 */

	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
		mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));


	if (raise_barrier(conf, sector_nr))
		return 0;

	r1_bio = raid1_alloc_init_r1buf(conf);

	rcu_read_lock();
	/*
	 * If we get a correctable read error during resync or recovery,
	 * we might want to read from a different device.  So we
	 * flag all drives that could conceivably be read from for READ,
	 * and any others (which will be non-In_sync devices) for WRITE.
	 * If a read fails, we try reading from something else for which READ
	 * is OK.
	 */

	r1_bio->mddev = mddev;
	r1_bio->sector = sector_nr;
	r1_bio->state = 0;
	set_bit(R1BIO_IsSync, &r1_bio->state);
	/* make sure good_sectors won't go across barrier unit boundary */
	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);

	for (i = 0; i < conf->raid_disks * 2; i++) {
		struct md_rdev *rdev;
		bio = r1_bio->bios[i];

		rdev = rcu_dereference(conf->mirrors[i].rdev);
		if (rdev == NULL ||
		    test_bit(Faulty, &rdev->flags)) {
			/* only the first raid_disks slots count as degraded */
			if (i < conf->raid_disks)
				still_degraded = 1;
		} else if (!test_bit(In_sync, &rdev->flags)) {
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
			bio->bi_end_io = end_sync_write;
			write_targets ++;
		} else {
			/* may need to read from here */
			sector_t first_bad = MaxSector;
			int bad_sectors;

			if (is_badblock(rdev, sector_nr, good_sectors,
					&first_bad, &bad_sectors)) {
				if (first_bad > sector_nr)
					/* stop short of the first bad sector */
					good_sectors = first_bad - sector_nr;
				else {
					/* range starts inside a bad region */
					bad_sectors -= (sector_nr - first_bad);
					if (min_bad == 0 ||
					    min_bad > bad_sectors)
						min_bad = bad_sectors;
				}
			}
			if (sector_nr < first_bad) {
				if (test_bit(WriteMostly, &rdev->flags)) {
					if (wonly < 0)
						wonly = i;
				} else {
					if (disk < 0)
						disk = i;
				}
				bio_set_op_attrs(bio, REQ_OP_READ, 0);
				bio->bi_end_io = end_sync_read;
				read_targets++;
			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
				/*
				 * The device is suitable for reading (InSync),
				 * but has bad block(s) here. Let's try to correct them,
				 * if we are doing resync or repair. Otherwise, leave
				 * this device alone for this sync request.
				 */
				bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
				bio->bi_end_io = end_sync_write;
				write_targets++;
			}
		}
		/* every participating bio pins its rdev and is aimed at it */
		if (rdev && bio->bi_end_io) {
			atomic_inc(&rdev->nr_pending);
			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
			bio_set_dev(bio, rdev->bdev);
			if (test_bit(FailFast, &rdev->flags))
				bio->bi_opf |= MD_FAILFAST;
		}
	}
	rcu_read_unlock();
	/* fall back to a WriteMostly device when no other one is readable */
	if (disk < 0)
		disk = wonly;
	r1_bio->read_disk = disk;

	if (read_targets == 0 && min_bad > 0) {
		/* These sectors are bad on all InSync devices, so we
		 * need to mark them bad on all write targets
		 */
		int ok = 1;
		for (i = 0 ; i < conf->raid_disks * 2 ; i++)
			if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
				struct md_rdev *rdev = conf->mirrors[i].rdev;
				ok = rdev_set_badblocks(rdev, sector_nr,
							min_bad, 0
					) && ok;
			}
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		*skipped = 1;
		put_buf(r1_bio);

		if (!ok) {
			/* Cannot record the badblocks, so need to
			 * abort the resync.
			 * If there are multiple read targets, could just
			 * fail the really bad ones ???
			 */
			conf->recovery_disabled = mddev->recovery_disabled;
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			return 0;
		} else
			return min_bad;

	}
	if (min_bad > 0 && min_bad < good_sectors) {
		/* only resync enough to reach the next bad->good
		 * transition */
		good_sectors = min_bad;
	}

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
		/* extra read targets are also write targets */
		write_targets += read_targets-1;

	if (write_targets == 0 || read_targets == 0) {
		/* There is nowhere to write, so all non-sync
		 * drives must be failed - so we are finished
		 */
		sector_t rv;
		if (min_bad > 0)
			max_sector = sector_nr + min_bad;
		rv = max_sector - sector_nr;
		*skipped = 1;
		put_buf(r1_bio);
		return rv;
	}

	if (max_sector > mddev->resync_max)
		max_sector = mddev->resync_max; /* Don't do IO beyond here */
	if (max_sector > sector_nr + good_sectors)
		max_sector = sector_nr + good_sectors;
	nr_sectors = 0;
	sync_blocks = 0;
	/*
	 * Add one page per iteration to every participating bio, until
	 * max_sector is reached, the bitmap says the rest is clean, or
	 * RESYNC_PAGES pages have been used.
	 */
	do {
		struct page *page;
		int len = PAGE_SIZE;
		if (sector_nr + (len>>9) > max_sector)
			len = (max_sector - sector_nr) << 9;
		if (len == 0)
			break;
		if (sync_blocks == 0) {
			if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
						  &sync_blocks, still_degraded) &&
			    !conf->fullsync &&
			    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
				break;
			if ((len >> 9) > sync_blocks)
				len = sync_blocks<<9;
		}

		for (i = 0 ; i < conf->raid_disks * 2; i++) {
			struct resync_pages *rp;

			bio = r1_bio->bios[i];
			rp = get_resync_pages(bio);
			if (bio->bi_end_io) {
				page = resync_fetch_page(rp, page_idx);

				/*
				 * won't fail because the vec table is big
				 * enough to hold all these pages
				 */
				bio_add_page(bio, page, len, 0);
			}
		}
		nr_sectors += len>>9;
		sector_nr += len>>9;
		sync_blocks -= (len>>9);
	} while (++page_idx < RESYNC_PAGES);

	r1_bio->sectors = nr_sectors;

	if (mddev_is_clustered(mddev) &&
			conf->cluster_sync_high < sector_nr + nr_sectors) {
		conf->cluster_sync_low = mddev->curr_resync_completed;
		conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
		/* Send resync message */
		md_cluster_ops->resync_info_update(mddev,
				conf->cluster_sync_low,
				conf->cluster_sync_high);
	}

	/* For a user-requested sync, we read all readable devices and do a
	 * compare
	 */
	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
		atomic_set(&r1_bio->remaining, read_targets);
		for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
			bio = r1_bio->bios[i];
			if (bio->bi_end_io == end_sync_read) {
				read_targets--;
				md_sync_acct_bio(bio, nr_sectors);
				if (read_targets == 1)
					bio->bi_opf &= ~MD_FAILFAST;
				submit_bio_noacct(bio);
			}
		}
	} else {
		/* otherwise only the chosen read_disk is read */
		atomic_set(&r1_bio->remaining, 1);
		bio = r1_bio->bios[r1_bio->read_disk];
		md_sync_acct_bio(bio, nr_sectors);
		if (read_targets == 1)
			bio->bi_opf &= ~MD_FAILFAST;
		submit_bio_noacct(bio);
	}
	return nr_sectors;
}

/*
 * Report the array size in sectors: @sectors when it is non-zero,
 * otherwise mddev->dev_sectors.  @raid_disks is not used here.
 */
static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	if (sectors)
		return sectors;

	return mddev->dev_sectors;
}
29568c2ecf20Sopenharmony_ci conf->mirrors = kzalloc(array3_size(sizeof(struct raid1_info), 29578c2ecf20Sopenharmony_ci mddev->raid_disks, 2), 29588c2ecf20Sopenharmony_ci GFP_KERNEL); 29598c2ecf20Sopenharmony_ci if (!conf->mirrors) 29608c2ecf20Sopenharmony_ci goto abort; 29618c2ecf20Sopenharmony_ci 29628c2ecf20Sopenharmony_ci conf->tmppage = alloc_page(GFP_KERNEL); 29638c2ecf20Sopenharmony_ci if (!conf->tmppage) 29648c2ecf20Sopenharmony_ci goto abort; 29658c2ecf20Sopenharmony_ci 29668c2ecf20Sopenharmony_ci conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 29678c2ecf20Sopenharmony_ci if (!conf->poolinfo) 29688c2ecf20Sopenharmony_ci goto abort; 29698c2ecf20Sopenharmony_ci conf->poolinfo->raid_disks = mddev->raid_disks * 2; 29708c2ecf20Sopenharmony_ci err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, 29718c2ecf20Sopenharmony_ci rbio_pool_free, conf->poolinfo); 29728c2ecf20Sopenharmony_ci if (err) 29738c2ecf20Sopenharmony_ci goto abort; 29748c2ecf20Sopenharmony_ci 29758c2ecf20Sopenharmony_ci err = bioset_init(&conf->bio_split, BIO_POOL_SIZE, 0, 0); 29768c2ecf20Sopenharmony_ci if (err) 29778c2ecf20Sopenharmony_ci goto abort; 29788c2ecf20Sopenharmony_ci 29798c2ecf20Sopenharmony_ci conf->poolinfo->mddev = mddev; 29808c2ecf20Sopenharmony_ci 29818c2ecf20Sopenharmony_ci err = -EINVAL; 29828c2ecf20Sopenharmony_ci spin_lock_init(&conf->device_lock); 29838c2ecf20Sopenharmony_ci rdev_for_each(rdev, mddev) { 29848c2ecf20Sopenharmony_ci int disk_idx = rdev->raid_disk; 29858c2ecf20Sopenharmony_ci if (disk_idx >= mddev->raid_disks 29868c2ecf20Sopenharmony_ci || disk_idx < 0) 29878c2ecf20Sopenharmony_ci continue; 29888c2ecf20Sopenharmony_ci if (test_bit(Replacement, &rdev->flags)) 29898c2ecf20Sopenharmony_ci disk = conf->mirrors + mddev->raid_disks + disk_idx; 29908c2ecf20Sopenharmony_ci else 29918c2ecf20Sopenharmony_ci disk = conf->mirrors + disk_idx; 29928c2ecf20Sopenharmony_ci 29938c2ecf20Sopenharmony_ci if (disk->rdev) 29948c2ecf20Sopenharmony_ci 
goto abort; 29958c2ecf20Sopenharmony_ci disk->rdev = rdev; 29968c2ecf20Sopenharmony_ci disk->head_position = 0; 29978c2ecf20Sopenharmony_ci disk->seq_start = MaxSector; 29988c2ecf20Sopenharmony_ci } 29998c2ecf20Sopenharmony_ci conf->raid_disks = mddev->raid_disks; 30008c2ecf20Sopenharmony_ci conf->mddev = mddev; 30018c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&conf->retry_list); 30028c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&conf->bio_end_io_list); 30038c2ecf20Sopenharmony_ci 30048c2ecf20Sopenharmony_ci spin_lock_init(&conf->resync_lock); 30058c2ecf20Sopenharmony_ci init_waitqueue_head(&conf->wait_barrier); 30068c2ecf20Sopenharmony_ci 30078c2ecf20Sopenharmony_ci bio_list_init(&conf->pending_bio_list); 30088c2ecf20Sopenharmony_ci conf->pending_count = 0; 30098c2ecf20Sopenharmony_ci conf->recovery_disabled = mddev->recovery_disabled - 1; 30108c2ecf20Sopenharmony_ci 30118c2ecf20Sopenharmony_ci err = -EIO; 30128c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks * 2; i++) { 30138c2ecf20Sopenharmony_ci 30148c2ecf20Sopenharmony_ci disk = conf->mirrors + i; 30158c2ecf20Sopenharmony_ci 30168c2ecf20Sopenharmony_ci if (i < conf->raid_disks && 30178c2ecf20Sopenharmony_ci disk[conf->raid_disks].rdev) { 30188c2ecf20Sopenharmony_ci /* This slot has a replacement. 
*/ 30198c2ecf20Sopenharmony_ci if (!disk->rdev) { 30208c2ecf20Sopenharmony_ci /* No original, just make the replacement 30218c2ecf20Sopenharmony_ci * a recovering spare 30228c2ecf20Sopenharmony_ci */ 30238c2ecf20Sopenharmony_ci disk->rdev = 30248c2ecf20Sopenharmony_ci disk[conf->raid_disks].rdev; 30258c2ecf20Sopenharmony_ci disk[conf->raid_disks].rdev = NULL; 30268c2ecf20Sopenharmony_ci } else if (!test_bit(In_sync, &disk->rdev->flags)) 30278c2ecf20Sopenharmony_ci /* Original is not in_sync - bad */ 30288c2ecf20Sopenharmony_ci goto abort; 30298c2ecf20Sopenharmony_ci } 30308c2ecf20Sopenharmony_ci 30318c2ecf20Sopenharmony_ci if (!disk->rdev || 30328c2ecf20Sopenharmony_ci !test_bit(In_sync, &disk->rdev->flags)) { 30338c2ecf20Sopenharmony_ci disk->head_position = 0; 30348c2ecf20Sopenharmony_ci if (disk->rdev && 30358c2ecf20Sopenharmony_ci (disk->rdev->saved_raid_disk < 0)) 30368c2ecf20Sopenharmony_ci conf->fullsync = 1; 30378c2ecf20Sopenharmony_ci } 30388c2ecf20Sopenharmony_ci } 30398c2ecf20Sopenharmony_ci 30408c2ecf20Sopenharmony_ci err = -ENOMEM; 30418c2ecf20Sopenharmony_ci conf->thread = md_register_thread(raid1d, mddev, "raid1"); 30428c2ecf20Sopenharmony_ci if (!conf->thread) 30438c2ecf20Sopenharmony_ci goto abort; 30448c2ecf20Sopenharmony_ci 30458c2ecf20Sopenharmony_ci return conf; 30468c2ecf20Sopenharmony_ci 30478c2ecf20Sopenharmony_ci abort: 30488c2ecf20Sopenharmony_ci if (conf) { 30498c2ecf20Sopenharmony_ci mempool_exit(&conf->r1bio_pool); 30508c2ecf20Sopenharmony_ci kfree(conf->mirrors); 30518c2ecf20Sopenharmony_ci safe_put_page(conf->tmppage); 30528c2ecf20Sopenharmony_ci kfree(conf->poolinfo); 30538c2ecf20Sopenharmony_ci kfree(conf->nr_pending); 30548c2ecf20Sopenharmony_ci kfree(conf->nr_waiting); 30558c2ecf20Sopenharmony_ci kfree(conf->nr_queued); 30568c2ecf20Sopenharmony_ci kfree(conf->barrier); 30578c2ecf20Sopenharmony_ci bioset_exit(&conf->bio_split); 30588c2ecf20Sopenharmony_ci kfree(conf); 30598c2ecf20Sopenharmony_ci } 30608c2ecf20Sopenharmony_ci return 
ERR_PTR(err); 30618c2ecf20Sopenharmony_ci} 30628c2ecf20Sopenharmony_ci 30638c2ecf20Sopenharmony_cistatic void raid1_free(struct mddev *mddev, void *priv); 30648c2ecf20Sopenharmony_cistatic int raid1_run(struct mddev *mddev) 30658c2ecf20Sopenharmony_ci{ 30668c2ecf20Sopenharmony_ci struct r1conf *conf; 30678c2ecf20Sopenharmony_ci int i; 30688c2ecf20Sopenharmony_ci struct md_rdev *rdev; 30698c2ecf20Sopenharmony_ci int ret; 30708c2ecf20Sopenharmony_ci bool discard_supported = false; 30718c2ecf20Sopenharmony_ci 30728c2ecf20Sopenharmony_ci if (mddev->level != 1) { 30738c2ecf20Sopenharmony_ci pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n", 30748c2ecf20Sopenharmony_ci mdname(mddev), mddev->level); 30758c2ecf20Sopenharmony_ci return -EIO; 30768c2ecf20Sopenharmony_ci } 30778c2ecf20Sopenharmony_ci if (mddev->reshape_position != MaxSector) { 30788c2ecf20Sopenharmony_ci pr_warn("md/raid1:%s: reshape_position set but not supported\n", 30798c2ecf20Sopenharmony_ci mdname(mddev)); 30808c2ecf20Sopenharmony_ci return -EIO; 30818c2ecf20Sopenharmony_ci } 30828c2ecf20Sopenharmony_ci if (mddev_init_writes_pending(mddev) < 0) 30838c2ecf20Sopenharmony_ci return -ENOMEM; 30848c2ecf20Sopenharmony_ci /* 30858c2ecf20Sopenharmony_ci * copy the already verified devices into our private RAID1 30868c2ecf20Sopenharmony_ci * bookkeeping area. 
[whatever we allocate in run(), 30878c2ecf20Sopenharmony_ci * should be freed in raid1_free()] 30888c2ecf20Sopenharmony_ci */ 30898c2ecf20Sopenharmony_ci if (mddev->private == NULL) 30908c2ecf20Sopenharmony_ci conf = setup_conf(mddev); 30918c2ecf20Sopenharmony_ci else 30928c2ecf20Sopenharmony_ci conf = mddev->private; 30938c2ecf20Sopenharmony_ci 30948c2ecf20Sopenharmony_ci if (IS_ERR(conf)) 30958c2ecf20Sopenharmony_ci return PTR_ERR(conf); 30968c2ecf20Sopenharmony_ci 30978c2ecf20Sopenharmony_ci if (mddev->queue) { 30988c2ecf20Sopenharmony_ci blk_queue_max_write_same_sectors(mddev->queue, 0); 30998c2ecf20Sopenharmony_ci blk_queue_max_write_zeroes_sectors(mddev->queue, 0); 31008c2ecf20Sopenharmony_ci } 31018c2ecf20Sopenharmony_ci 31028c2ecf20Sopenharmony_ci rdev_for_each(rdev, mddev) { 31038c2ecf20Sopenharmony_ci if (!mddev->gendisk) 31048c2ecf20Sopenharmony_ci continue; 31058c2ecf20Sopenharmony_ci disk_stack_limits(mddev->gendisk, rdev->bdev, 31068c2ecf20Sopenharmony_ci rdev->data_offset << 9); 31078c2ecf20Sopenharmony_ci if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 31088c2ecf20Sopenharmony_ci discard_supported = true; 31098c2ecf20Sopenharmony_ci } 31108c2ecf20Sopenharmony_ci 31118c2ecf20Sopenharmony_ci mddev->degraded = 0; 31128c2ecf20Sopenharmony_ci for (i = 0; i < conf->raid_disks; i++) 31138c2ecf20Sopenharmony_ci if (conf->mirrors[i].rdev == NULL || 31148c2ecf20Sopenharmony_ci !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || 31158c2ecf20Sopenharmony_ci test_bit(Faulty, &conf->mirrors[i].rdev->flags)) 31168c2ecf20Sopenharmony_ci mddev->degraded++; 31178c2ecf20Sopenharmony_ci /* 31188c2ecf20Sopenharmony_ci * RAID1 needs at least one disk in active 31198c2ecf20Sopenharmony_ci */ 31208c2ecf20Sopenharmony_ci if (conf->raid_disks - mddev->degraded < 1) { 31218c2ecf20Sopenharmony_ci md_unregister_thread(&conf->thread); 31228c2ecf20Sopenharmony_ci ret = -EINVAL; 31238c2ecf20Sopenharmony_ci goto abort; 31248c2ecf20Sopenharmony_ci } 31258c2ecf20Sopenharmony_ci 
31268c2ecf20Sopenharmony_ci if (conf->raid_disks - mddev->degraded == 1) 31278c2ecf20Sopenharmony_ci mddev->recovery_cp = MaxSector; 31288c2ecf20Sopenharmony_ci 31298c2ecf20Sopenharmony_ci if (mddev->recovery_cp != MaxSector) 31308c2ecf20Sopenharmony_ci pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", 31318c2ecf20Sopenharmony_ci mdname(mddev)); 31328c2ecf20Sopenharmony_ci pr_info("md/raid1:%s: active with %d out of %d mirrors\n", 31338c2ecf20Sopenharmony_ci mdname(mddev), mddev->raid_disks - mddev->degraded, 31348c2ecf20Sopenharmony_ci mddev->raid_disks); 31358c2ecf20Sopenharmony_ci 31368c2ecf20Sopenharmony_ci /* 31378c2ecf20Sopenharmony_ci * Ok, everything is just fine now 31388c2ecf20Sopenharmony_ci */ 31398c2ecf20Sopenharmony_ci mddev->thread = conf->thread; 31408c2ecf20Sopenharmony_ci conf->thread = NULL; 31418c2ecf20Sopenharmony_ci mddev->private = conf; 31428c2ecf20Sopenharmony_ci set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); 31438c2ecf20Sopenharmony_ci 31448c2ecf20Sopenharmony_ci md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); 31458c2ecf20Sopenharmony_ci 31468c2ecf20Sopenharmony_ci if (mddev->queue) { 31478c2ecf20Sopenharmony_ci if (discard_supported) 31488c2ecf20Sopenharmony_ci blk_queue_flag_set(QUEUE_FLAG_DISCARD, 31498c2ecf20Sopenharmony_ci mddev->queue); 31508c2ecf20Sopenharmony_ci else 31518c2ecf20Sopenharmony_ci blk_queue_flag_clear(QUEUE_FLAG_DISCARD, 31528c2ecf20Sopenharmony_ci mddev->queue); 31538c2ecf20Sopenharmony_ci } 31548c2ecf20Sopenharmony_ci 31558c2ecf20Sopenharmony_ci ret = md_integrity_register(mddev); 31568c2ecf20Sopenharmony_ci if (ret) { 31578c2ecf20Sopenharmony_ci md_unregister_thread(&mddev->thread); 31588c2ecf20Sopenharmony_ci goto abort; 31598c2ecf20Sopenharmony_ci } 31608c2ecf20Sopenharmony_ci return 0; 31618c2ecf20Sopenharmony_ci 31628c2ecf20Sopenharmony_ciabort: 31638c2ecf20Sopenharmony_ci raid1_free(mddev, conf); 31648c2ecf20Sopenharmony_ci return ret; 31658c2ecf20Sopenharmony_ci} 
31668c2ecf20Sopenharmony_ci 31678c2ecf20Sopenharmony_cistatic void raid1_free(struct mddev *mddev, void *priv) 31688c2ecf20Sopenharmony_ci{ 31698c2ecf20Sopenharmony_ci struct r1conf *conf = priv; 31708c2ecf20Sopenharmony_ci 31718c2ecf20Sopenharmony_ci mempool_exit(&conf->r1bio_pool); 31728c2ecf20Sopenharmony_ci kfree(conf->mirrors); 31738c2ecf20Sopenharmony_ci safe_put_page(conf->tmppage); 31748c2ecf20Sopenharmony_ci kfree(conf->poolinfo); 31758c2ecf20Sopenharmony_ci kfree(conf->nr_pending); 31768c2ecf20Sopenharmony_ci kfree(conf->nr_waiting); 31778c2ecf20Sopenharmony_ci kfree(conf->nr_queued); 31788c2ecf20Sopenharmony_ci kfree(conf->barrier); 31798c2ecf20Sopenharmony_ci bioset_exit(&conf->bio_split); 31808c2ecf20Sopenharmony_ci kfree(conf); 31818c2ecf20Sopenharmony_ci} 31828c2ecf20Sopenharmony_ci 31838c2ecf20Sopenharmony_cistatic int raid1_resize(struct mddev *mddev, sector_t sectors) 31848c2ecf20Sopenharmony_ci{ 31858c2ecf20Sopenharmony_ci /* no resync is happening, and there is enough space 31868c2ecf20Sopenharmony_ci * on all devices, so we can resize. 31878c2ecf20Sopenharmony_ci * We need to make sure resync covers any new space. 31888c2ecf20Sopenharmony_ci * If the array is shrinking we should possibly wait until 31898c2ecf20Sopenharmony_ci * any io in the removed space completes, but it hardly seems 31908c2ecf20Sopenharmony_ci * worth it. 
31918c2ecf20Sopenharmony_ci */ 31928c2ecf20Sopenharmony_ci sector_t newsize = raid1_size(mddev, sectors, 0); 31938c2ecf20Sopenharmony_ci if (mddev->external_size && 31948c2ecf20Sopenharmony_ci mddev->array_sectors > newsize) 31958c2ecf20Sopenharmony_ci return -EINVAL; 31968c2ecf20Sopenharmony_ci if (mddev->bitmap) { 31978c2ecf20Sopenharmony_ci int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 31988c2ecf20Sopenharmony_ci if (ret) 31998c2ecf20Sopenharmony_ci return ret; 32008c2ecf20Sopenharmony_ci } 32018c2ecf20Sopenharmony_ci md_set_array_sectors(mddev, newsize); 32028c2ecf20Sopenharmony_ci if (sectors > mddev->dev_sectors && 32038c2ecf20Sopenharmony_ci mddev->recovery_cp > mddev->dev_sectors) { 32048c2ecf20Sopenharmony_ci mddev->recovery_cp = mddev->dev_sectors; 32058c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 32068c2ecf20Sopenharmony_ci } 32078c2ecf20Sopenharmony_ci mddev->dev_sectors = sectors; 32088c2ecf20Sopenharmony_ci mddev->resync_max_sectors = sectors; 32098c2ecf20Sopenharmony_ci return 0; 32108c2ecf20Sopenharmony_ci} 32118c2ecf20Sopenharmony_ci 32128c2ecf20Sopenharmony_cistatic int raid1_reshape(struct mddev *mddev) 32138c2ecf20Sopenharmony_ci{ 32148c2ecf20Sopenharmony_ci /* We need to: 32158c2ecf20Sopenharmony_ci * 1/ resize the r1bio_pool 32168c2ecf20Sopenharmony_ci * 2/ resize conf->mirrors 32178c2ecf20Sopenharmony_ci * 32188c2ecf20Sopenharmony_ci * We allocate a new r1bio_pool if we can. 32198c2ecf20Sopenharmony_ci * Then raise a device barrier and wait until all IO stops. 32208c2ecf20Sopenharmony_ci * Then resize conf->mirrors and swap in the new r1bio pool. 32218c2ecf20Sopenharmony_ci * 32228c2ecf20Sopenharmony_ci * At the same time, we "pack" the devices so that all the missing 32238c2ecf20Sopenharmony_ci * devices have the higher raid_disk numbers. 
32248c2ecf20Sopenharmony_ci */ 32258c2ecf20Sopenharmony_ci mempool_t newpool, oldpool; 32268c2ecf20Sopenharmony_ci struct pool_info *newpoolinfo; 32278c2ecf20Sopenharmony_ci struct raid1_info *newmirrors; 32288c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 32298c2ecf20Sopenharmony_ci int cnt, raid_disks; 32308c2ecf20Sopenharmony_ci unsigned long flags; 32318c2ecf20Sopenharmony_ci int d, d2; 32328c2ecf20Sopenharmony_ci int ret; 32338c2ecf20Sopenharmony_ci 32348c2ecf20Sopenharmony_ci memset(&newpool, 0, sizeof(newpool)); 32358c2ecf20Sopenharmony_ci memset(&oldpool, 0, sizeof(oldpool)); 32368c2ecf20Sopenharmony_ci 32378c2ecf20Sopenharmony_ci /* Cannot change chunk_size, layout, or level */ 32388c2ecf20Sopenharmony_ci if (mddev->chunk_sectors != mddev->new_chunk_sectors || 32398c2ecf20Sopenharmony_ci mddev->layout != mddev->new_layout || 32408c2ecf20Sopenharmony_ci mddev->level != mddev->new_level) { 32418c2ecf20Sopenharmony_ci mddev->new_chunk_sectors = mddev->chunk_sectors; 32428c2ecf20Sopenharmony_ci mddev->new_layout = mddev->layout; 32438c2ecf20Sopenharmony_ci mddev->new_level = mddev->level; 32448c2ecf20Sopenharmony_ci return -EINVAL; 32458c2ecf20Sopenharmony_ci } 32468c2ecf20Sopenharmony_ci 32478c2ecf20Sopenharmony_ci if (!mddev_is_clustered(mddev)) 32488c2ecf20Sopenharmony_ci md_allow_write(mddev); 32498c2ecf20Sopenharmony_ci 32508c2ecf20Sopenharmony_ci raid_disks = mddev->raid_disks + mddev->delta_disks; 32518c2ecf20Sopenharmony_ci 32528c2ecf20Sopenharmony_ci if (raid_disks < conf->raid_disks) { 32538c2ecf20Sopenharmony_ci cnt=0; 32548c2ecf20Sopenharmony_ci for (d= 0; d < conf->raid_disks; d++) 32558c2ecf20Sopenharmony_ci if (conf->mirrors[d].rdev) 32568c2ecf20Sopenharmony_ci cnt++; 32578c2ecf20Sopenharmony_ci if (cnt > raid_disks) 32588c2ecf20Sopenharmony_ci return -EBUSY; 32598c2ecf20Sopenharmony_ci } 32608c2ecf20Sopenharmony_ci 32618c2ecf20Sopenharmony_ci newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); 32628c2ecf20Sopenharmony_ci if 
(!newpoolinfo) 32638c2ecf20Sopenharmony_ci return -ENOMEM; 32648c2ecf20Sopenharmony_ci newpoolinfo->mddev = mddev; 32658c2ecf20Sopenharmony_ci newpoolinfo->raid_disks = raid_disks * 2; 32668c2ecf20Sopenharmony_ci 32678c2ecf20Sopenharmony_ci ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, 32688c2ecf20Sopenharmony_ci rbio_pool_free, newpoolinfo); 32698c2ecf20Sopenharmony_ci if (ret) { 32708c2ecf20Sopenharmony_ci kfree(newpoolinfo); 32718c2ecf20Sopenharmony_ci return ret; 32728c2ecf20Sopenharmony_ci } 32738c2ecf20Sopenharmony_ci newmirrors = kzalloc(array3_size(sizeof(struct raid1_info), 32748c2ecf20Sopenharmony_ci raid_disks, 2), 32758c2ecf20Sopenharmony_ci GFP_KERNEL); 32768c2ecf20Sopenharmony_ci if (!newmirrors) { 32778c2ecf20Sopenharmony_ci kfree(newpoolinfo); 32788c2ecf20Sopenharmony_ci mempool_exit(&newpool); 32798c2ecf20Sopenharmony_ci return -ENOMEM; 32808c2ecf20Sopenharmony_ci } 32818c2ecf20Sopenharmony_ci 32828c2ecf20Sopenharmony_ci freeze_array(conf, 0); 32838c2ecf20Sopenharmony_ci 32848c2ecf20Sopenharmony_ci /* ok, everything is stopped */ 32858c2ecf20Sopenharmony_ci oldpool = conf->r1bio_pool; 32868c2ecf20Sopenharmony_ci conf->r1bio_pool = newpool; 32878c2ecf20Sopenharmony_ci 32888c2ecf20Sopenharmony_ci for (d = d2 = 0; d < conf->raid_disks; d++) { 32898c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->mirrors[d].rdev; 32908c2ecf20Sopenharmony_ci if (rdev && rdev->raid_disk != d2) { 32918c2ecf20Sopenharmony_ci sysfs_unlink_rdev(mddev, rdev); 32928c2ecf20Sopenharmony_ci rdev->raid_disk = d2; 32938c2ecf20Sopenharmony_ci sysfs_unlink_rdev(mddev, rdev); 32948c2ecf20Sopenharmony_ci if (sysfs_link_rdev(mddev, rdev)) 32958c2ecf20Sopenharmony_ci pr_warn("md/raid1:%s: cannot register rd%d\n", 32968c2ecf20Sopenharmony_ci mdname(mddev), rdev->raid_disk); 32978c2ecf20Sopenharmony_ci } 32988c2ecf20Sopenharmony_ci if (rdev) 32998c2ecf20Sopenharmony_ci newmirrors[d2++].rdev = rdev; 33008c2ecf20Sopenharmony_ci } 33018c2ecf20Sopenharmony_ci 
kfree(conf->mirrors); 33028c2ecf20Sopenharmony_ci conf->mirrors = newmirrors; 33038c2ecf20Sopenharmony_ci kfree(conf->poolinfo); 33048c2ecf20Sopenharmony_ci conf->poolinfo = newpoolinfo; 33058c2ecf20Sopenharmony_ci 33068c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 33078c2ecf20Sopenharmony_ci mddev->degraded += (raid_disks - conf->raid_disks); 33088c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 33098c2ecf20Sopenharmony_ci conf->raid_disks = mddev->raid_disks = raid_disks; 33108c2ecf20Sopenharmony_ci mddev->delta_disks = 0; 33118c2ecf20Sopenharmony_ci 33128c2ecf20Sopenharmony_ci unfreeze_array(conf); 33138c2ecf20Sopenharmony_ci 33148c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 33158c2ecf20Sopenharmony_ci set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 33168c2ecf20Sopenharmony_ci md_wakeup_thread(mddev->thread); 33178c2ecf20Sopenharmony_ci 33188c2ecf20Sopenharmony_ci mempool_exit(&oldpool); 33198c2ecf20Sopenharmony_ci return 0; 33208c2ecf20Sopenharmony_ci} 33218c2ecf20Sopenharmony_ci 33228c2ecf20Sopenharmony_cistatic void raid1_quiesce(struct mddev *mddev, int quiesce) 33238c2ecf20Sopenharmony_ci{ 33248c2ecf20Sopenharmony_ci struct r1conf *conf = mddev->private; 33258c2ecf20Sopenharmony_ci 33268c2ecf20Sopenharmony_ci if (quiesce) 33278c2ecf20Sopenharmony_ci freeze_array(conf, 0); 33288c2ecf20Sopenharmony_ci else 33298c2ecf20Sopenharmony_ci unfreeze_array(conf); 33308c2ecf20Sopenharmony_ci} 33318c2ecf20Sopenharmony_ci 33328c2ecf20Sopenharmony_cistatic void *raid1_takeover(struct mddev *mddev) 33338c2ecf20Sopenharmony_ci{ 33348c2ecf20Sopenharmony_ci /* raid1 can take over: 33358c2ecf20Sopenharmony_ci * raid5 with 2 devices, any layout or chunk size 33368c2ecf20Sopenharmony_ci */ 33378c2ecf20Sopenharmony_ci if (mddev->level == 5 && mddev->raid_disks == 2) { 33388c2ecf20Sopenharmony_ci struct r1conf *conf; 33398c2ecf20Sopenharmony_ci mddev->new_level = 1; 33408c2ecf20Sopenharmony_ci 
mddev->new_layout = 0; 33418c2ecf20Sopenharmony_ci mddev->new_chunk_sectors = 0; 33428c2ecf20Sopenharmony_ci conf = setup_conf(mddev); 33438c2ecf20Sopenharmony_ci if (!IS_ERR(conf)) { 33448c2ecf20Sopenharmony_ci /* Array must appear to be quiesced */ 33458c2ecf20Sopenharmony_ci conf->array_frozen = 1; 33468c2ecf20Sopenharmony_ci mddev_clear_unsupported_flags(mddev, 33478c2ecf20Sopenharmony_ci UNSUPPORTED_MDDEV_FLAGS); 33488c2ecf20Sopenharmony_ci } 33498c2ecf20Sopenharmony_ci return conf; 33508c2ecf20Sopenharmony_ci } 33518c2ecf20Sopenharmony_ci return ERR_PTR(-EINVAL); 33528c2ecf20Sopenharmony_ci} 33538c2ecf20Sopenharmony_ci 33548c2ecf20Sopenharmony_cistatic struct md_personality raid1_personality = 33558c2ecf20Sopenharmony_ci{ 33568c2ecf20Sopenharmony_ci .name = "raid1", 33578c2ecf20Sopenharmony_ci .level = 1, 33588c2ecf20Sopenharmony_ci .owner = THIS_MODULE, 33598c2ecf20Sopenharmony_ci .make_request = raid1_make_request, 33608c2ecf20Sopenharmony_ci .run = raid1_run, 33618c2ecf20Sopenharmony_ci .free = raid1_free, 33628c2ecf20Sopenharmony_ci .status = raid1_status, 33638c2ecf20Sopenharmony_ci .error_handler = raid1_error, 33648c2ecf20Sopenharmony_ci .hot_add_disk = raid1_add_disk, 33658c2ecf20Sopenharmony_ci .hot_remove_disk= raid1_remove_disk, 33668c2ecf20Sopenharmony_ci .spare_active = raid1_spare_active, 33678c2ecf20Sopenharmony_ci .sync_request = raid1_sync_request, 33688c2ecf20Sopenharmony_ci .resize = raid1_resize, 33698c2ecf20Sopenharmony_ci .size = raid1_size, 33708c2ecf20Sopenharmony_ci .check_reshape = raid1_reshape, 33718c2ecf20Sopenharmony_ci .quiesce = raid1_quiesce, 33728c2ecf20Sopenharmony_ci .takeover = raid1_takeover, 33738c2ecf20Sopenharmony_ci}; 33748c2ecf20Sopenharmony_ci 33758c2ecf20Sopenharmony_cistatic int __init raid_init(void) 33768c2ecf20Sopenharmony_ci{ 33778c2ecf20Sopenharmony_ci return register_md_personality(&raid1_personality); 33788c2ecf20Sopenharmony_ci} 33798c2ecf20Sopenharmony_ci 33808c2ecf20Sopenharmony_cistatic void 
raid_exit(void) 33818c2ecf20Sopenharmony_ci{ 33828c2ecf20Sopenharmony_ci unregister_md_personality(&raid1_personality); 33838c2ecf20Sopenharmony_ci} 33848c2ecf20Sopenharmony_ci 33858c2ecf20Sopenharmony_cimodule_init(raid_init); 33868c2ecf20Sopenharmony_cimodule_exit(raid_exit); 33878c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 33888c2ecf20Sopenharmony_ciMODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); 33898c2ecf20Sopenharmony_ciMODULE_ALIAS("md-personality-3"); /* RAID1 */ 33908c2ecf20Sopenharmony_ciMODULE_ALIAS("md-raid1"); 33918c2ecf20Sopenharmony_ciMODULE_ALIAS("md-level-1"); 33928c2ecf20Sopenharmony_ci 33938c2ecf20Sopenharmony_cimodule_param(max_queued_requests, int, S_IRUGO|S_IWUSR); 3394