/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _RAID1_H
#define _RAID1_H

/*
 * each barrier unit size is 64MB for now
 * note: it must be larger than RESYNC_DEPTH
 */
#define BARRIER_UNIT_SECTOR_BITS	17
#define BARRIER_UNIT_SECTOR_SIZE	(1<<17)
/*
 * In struct r1conf, the following members are related to I/O barrier
 * buckets,
 *	atomic_t	*nr_pending;
 *	atomic_t	*nr_waiting;
 *	atomic_t	*nr_queued;
 *	atomic_t	*barrier;
 * Each of them points to an array of atomic_t variables; each array is
 * designed to have BARRIER_BUCKETS_NR elements and to occupy a single
 * memory page.  The data width of atomic_t variables is 4 bytes, equal
 * to 1<<(ilog2(sizeof(atomic_t))), so BARRIER_BUCKETS_NR_BITS is defined
 * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of
 * BARRIER_BUCKETS_NR atomic_t variables occupies exactly one memory
 * page.  For example, with 4KB pages (PAGE_SHIFT == 12) this gives
 * 1024 buckets of 4 bytes each, i.e. exactly 4096 bytes.
 */
#define BARRIER_BUCKETS_NR_BITS	(PAGE_SHIFT - ilog2(sizeof(atomic_t)))
#define BARRIER_BUCKETS_NR	(1<<BARRIER_BUCKETS_NR_BITS)

/* Note: raid1_info.rdev can be set to NULL asynchronously by
 * raid1_remove_disk.
 * There are three safe ways to access raid1_info.rdev.
 * 1/ when holding mddev->reconfig_mutex
 * 2/ when resync/recovery is known to be happening - i.e. in code that
 *    is called as part of performing resync/recovery.
 * 3/ while holding rcu_read_lock(), use rcu_dereference to get the
 *    pointer, and if it is non-NULL, increment rdev->nr_pending before
 *    dropping the RCU lock (see the sketch after struct raid1_info
 *    below).
 * When .rdev is set to NULL, the nr_pending count is checked again and,
 * if it has been incremented, the pointer is put back in .rdev.
 */

struct raid1_info {
	struct md_rdev	*rdev;
	sector_t	head_position;

	/* When choosing the best device for a read (read_balance()),
	 * we try to keep sequential reads on the same device.
	 */
	sector_t	next_seq_sect;
	sector_t	seq_start;
};
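
/*
 * A minimal sketch (not part of this header) of access method 3
 * described above: pin an rdev found under RCU by bumping
 * rdev->nr_pending before dropping the read lock, and release it
 * with rdev_dec_pending() when done.  raid1.c follows this pattern:
 *
 *	struct md_rdev *rdev;
 *
 *	rcu_read_lock();
 *	rdev = rcu_dereference(conf->mirrors[disk].rdev);
 *	if (rdev)
 *		atomic_inc(&rdev->nr_pending);
 *	rcu_read_unlock();
 *	...
 *	if (rdev)
 *		rdev_dec_pending(rdev, conf->mddev);
 */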

/*
 * memory pools need a pointer to the mddev, so they can force an unplug
 * when memory is tight, and a count of the number of drives that the
 * pool was allocated for, so they know how much to allocate and free.
 * mddev->raid_disks cannot be used, as it can change while a pool is
 * active.
 * These two datums are stored in a kmalloced struct.
 * The 'raid_disks' here is twice the raid_disks in r1conf, so that
 * each 'real' device can have a replacement in the second half of
 * the array.
 */

struct pool_info {
	struct mddev	*mddev;
	int		raid_disks;
};
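
/*
 * A minimal sketch of how pool_info is consumed; this is essentially
 * what r1bio_pool_alloc() in raid1.c does: pi->raid_disks sizes the
 * flexible bios[] array at the end of struct r1bio (defined later in
 * this file):
 *
 *	static void *r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 *	{
 *		struct pool_info *pi = data;
 *		int size = offsetof(struct r1bio, bios[pi->raid_disks]);
 *
 *		return kzalloc(size, gfp_flags);
 *	}
 */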

struct r1conf {
	struct mddev		*mddev;
	struct raid1_info	*mirrors;	/* twice 'raid_disks' to
						 * allow for replacements.
						 */
	int			raid_disks;

	spinlock_t		device_lock;

	/* list of 'struct r1bio' that need to be processed by raid1d,
	 * whether to retry a read, write out a resync or recovery
	 * block, or anything else.
	 */
	struct list_head	retry_list;
	/* A separate list of r1bio which just need raid_end_bio_io called.
	 * This mustn't happen for writes which had any errors if the
	 * superblock needs to be written.
	 */
	struct list_head	bio_end_io_list;

	/* queue of pending writes to be submitted on unplug */
	struct bio_list		pending_bio_list;
	int			pending_count;

	/* for use when syncing mirrors:
	 * We don't allow both normal IO and resync/recovery IO at
	 * the same time - resync/recovery can only happen when there
	 * is no other IO.  So when either is active, the other has to wait.
	 * See the more detailed description in raid1.c near raise_barrier().
	 */
	wait_queue_head_t	wait_barrier;
	spinlock_t		resync_lock;
	atomic_t		nr_sync_pending;
	atomic_t		*nr_pending;
	atomic_t		*nr_waiting;
	atomic_t		*nr_queued;
	atomic_t		*barrier;
	int			array_frozen;

	/* Set to 1 if a full sync is needed (fresh device added).
	 * Cleared when a sync completes.
	 */
	int			fullsync;

	/* When the same as mddev->recovery_disabled, we don't allow
	 * recovery to be attempted, as we expect a read error.
	 */
	int			recovery_disabled;

	/* poolinfo contains information about the content of the
	 * mempools - it changes when the array grows or shrinks
	 */
	struct pool_info	*poolinfo;
	mempool_t		r1bio_pool;
	mempool_t		r1buf_pool;

	struct bio_set		bio_split;

	/* temporary buffer for synchronous IO when attempting to repair
	 * a read error.
	 */
	struct page		*tmppage;

	/* When taking over an array from a different personality, we store
	 * the new thread here until we fully activate the array.
	 */
	struct md_thread	*thread;

	/* Keep track of cluster resync window to send to other
	 * nodes.
	 */
	sector_t		cluster_sync_low;
	sector_t		cluster_sync_high;

};
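
/*
 * A simplified sketch of how the per-bucket counters above gate
 * regular I/O against resync/recovery; the real (more careful) logic
 * lives in raise_barrier() and _wait_barrier() in raid1.c, and
 * sector_to_idx() is defined at the end of this file:
 *
 *	int idx = sector_to_idx(bio->bi_iter.bi_sector);
 *
 *	atomic_inc(&conf->nr_pending[idx]);
 *	if (atomic_read(&conf->barrier[idx])) {
 *		atomic_dec(&conf->nr_pending[idx]);
 *		(block on conf->wait_barrier until the barrier on
 *		 this bucket is released, then retry)
 *	}
 */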

/*
 * this is our 'private' RAID1 bio.
 *
 * it contains information about what kind of IO operations were started
 * for this RAID1 operation, and about their status:
 */

struct r1bio {
	atomic_t		remaining;	/* 'have we finished' count,
						 * used from IRQ handlers
						 */
	atomic_t		behind_remaining; /* number of write-behind ios
						 * remaining in this BehindIO
						 * request
						 */
	sector_t		sector;
	int			sectors;
	unsigned long		state;
	struct mddev		*mddev;
	/*
	 * original bio going to /dev/mdx
	 */
	struct bio		*master_bio;
	/*
	 * if the IO is in READ direction, then this is where we read
	 */
	int			read_disk;

	struct list_head	retry_list;

	/*
	 * When R1BIO_BehindIO is set, we store pages for write behind
	 * in behind_master_bio.
	 */
	struct bio		*behind_master_bio;

	/*
	 * if the IO is in WRITE direction, then multiple bios are used.
	 * We choose the number when they are allocated.
	 */
	struct bio		*bios[];
	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously allocated */
};

/* bits for r1bio.state */
enum r1bio_state {
	R1BIO_Uptodate,
	R1BIO_IsSync,
	R1BIO_Degraded,
	R1BIO_BehindIO,
/* Set ReadError on bios that experience a read error so that
 * raid1d knows what to do with them.
 */
	R1BIO_ReadError,
/* For write-behind requests, we call bi_end_io when
 * the last non-write-behind device completes, provided
 * any write was successful.  Otherwise we call it when
 * any write-behind write succeeds, or with failure when
 * the last write completes (and all writes failed).
 * Record that bi_end_io was called with this flag...
 */
	R1BIO_Returned,
/* If a write for this request means we can clear some
 * known-bad-block records, we set this flag.
 */
	R1BIO_MadeGood,
	R1BIO_WriteError,
	R1BIO_FailFast,
};

static inline int sector_to_idx(sector_t sector)
{
	return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS,
			 BARRIER_BUCKETS_NR_BITS);
}
#endif