162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0 */ 262306a36Sopenharmony_ci#ifndef _RAID1_H 362306a36Sopenharmony_ci#define _RAID1_H 462306a36Sopenharmony_ci 562306a36Sopenharmony_ci/* 662306a36Sopenharmony_ci * each barrier unit size is 64MB fow now 762306a36Sopenharmony_ci * note: it must be larger than RESYNC_DEPTH 862306a36Sopenharmony_ci */ 962306a36Sopenharmony_ci#define BARRIER_UNIT_SECTOR_BITS 17 1062306a36Sopenharmony_ci#define BARRIER_UNIT_SECTOR_SIZE (1<<17) 1162306a36Sopenharmony_ci/* 1262306a36Sopenharmony_ci * In struct r1conf, the following members are related to I/O barrier 1362306a36Sopenharmony_ci * buckets, 1462306a36Sopenharmony_ci * atomic_t *nr_pending; 1562306a36Sopenharmony_ci * atomic_t *nr_waiting; 1662306a36Sopenharmony_ci * atomic_t *nr_queued; 1762306a36Sopenharmony_ci * atomic_t *barrier; 1862306a36Sopenharmony_ci * Each of them points to array of atomic_t variables, each array is 1962306a36Sopenharmony_ci * designed to have BARRIER_BUCKETS_NR elements and occupy a single 2062306a36Sopenharmony_ci * memory page. The data width of atomic_t variables is 4 bytes, equal 2162306a36Sopenharmony_ci * to 1<<(ilog2(sizeof(atomic_t))), BARRIER_BUCKETS_NR_BITS is defined 2262306a36Sopenharmony_ci * as (PAGE_SHIFT - ilog2(sizeof(int))) to make sure an array of 2362306a36Sopenharmony_ci * atomic_t variables with BARRIER_BUCKETS_NR elements just exactly 2462306a36Sopenharmony_ci * occupies a single memory page. 2562306a36Sopenharmony_ci */ 2662306a36Sopenharmony_ci#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) 2762306a36Sopenharmony_ci#define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS) 2862306a36Sopenharmony_ci 2962306a36Sopenharmony_ci/* Note: raid1_info.rdev can be set to NULL asynchronously by raid1_remove_disk. 3062306a36Sopenharmony_ci * There are three safe ways to access raid1_info.rdev. 3162306a36Sopenharmony_ci * 1/ when holding mddev->reconfig_mutex 3262306a36Sopenharmony_ci * 2/ when resync/recovery is known to be happening - i.e. in code that is 3362306a36Sopenharmony_ci * called as part of performing resync/recovery. 3462306a36Sopenharmony_ci * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer 3562306a36Sopenharmony_ci * and if it is non-NULL, increment rdev->nr_pending before dropping the 3662306a36Sopenharmony_ci * RCU lock. 3762306a36Sopenharmony_ci * When .rdev is set to NULL, the nr_pending count checked again and if it has 3862306a36Sopenharmony_ci * been incremented, the pointer is put back in .rdev. 3962306a36Sopenharmony_ci */ 4062306a36Sopenharmony_ci 4162306a36Sopenharmony_cistruct raid1_info { 4262306a36Sopenharmony_ci struct md_rdev *rdev; 4362306a36Sopenharmony_ci sector_t head_position; 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_ci /* When choose the best device for a read (read_balance()) 4662306a36Sopenharmony_ci * we try to keep sequential reads one the same device 4762306a36Sopenharmony_ci */ 4862306a36Sopenharmony_ci sector_t next_seq_sect; 4962306a36Sopenharmony_ci sector_t seq_start; 5062306a36Sopenharmony_ci}; 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_ci/* 5362306a36Sopenharmony_ci * memory pools need a pointer to the mddev, so they can force an unplug 5462306a36Sopenharmony_ci * when memory is tight, and a count of the number of drives that the 5562306a36Sopenharmony_ci * pool was allocated for, so they know how much to allocate and free. 5662306a36Sopenharmony_ci * mddev->raid_disks cannot be used, as it can change while a pool is active 5762306a36Sopenharmony_ci * These two datums are stored in a kmalloced struct. 5862306a36Sopenharmony_ci * The 'raid_disks' here is twice the raid_disks in r1conf. 5962306a36Sopenharmony_ci * This allows space for each 'real' device can have a replacement in the 6062306a36Sopenharmony_ci * second half of the array. 6162306a36Sopenharmony_ci */ 6262306a36Sopenharmony_ci 6362306a36Sopenharmony_cistruct pool_info { 6462306a36Sopenharmony_ci struct mddev *mddev; 6562306a36Sopenharmony_ci int raid_disks; 6662306a36Sopenharmony_ci}; 6762306a36Sopenharmony_ci 6862306a36Sopenharmony_cistruct r1conf { 6962306a36Sopenharmony_ci struct mddev *mddev; 7062306a36Sopenharmony_ci struct raid1_info *mirrors; /* twice 'raid_disks' to 7162306a36Sopenharmony_ci * allow for replacements. 7262306a36Sopenharmony_ci */ 7362306a36Sopenharmony_ci int raid_disks; 7462306a36Sopenharmony_ci 7562306a36Sopenharmony_ci spinlock_t device_lock; 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci /* list of 'struct r1bio' that need to be processed by raid1d, 7862306a36Sopenharmony_ci * whether to retry a read, writeout a resync or recovery 7962306a36Sopenharmony_ci * block, or anything else. 8062306a36Sopenharmony_ci */ 8162306a36Sopenharmony_ci struct list_head retry_list; 8262306a36Sopenharmony_ci /* A separate list of r1bio which just need raid_end_bio_io called. 8362306a36Sopenharmony_ci * This mustn't happen for writes which had any errors if the superblock 8462306a36Sopenharmony_ci * needs to be written. 8562306a36Sopenharmony_ci */ 8662306a36Sopenharmony_ci struct list_head bio_end_io_list; 8762306a36Sopenharmony_ci 8862306a36Sopenharmony_ci /* queue pending writes to be submitted on unplug */ 8962306a36Sopenharmony_ci struct bio_list pending_bio_list; 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci /* for use when syncing mirrors: 9262306a36Sopenharmony_ci * We don't allow both normal IO and resync/recovery IO at 9362306a36Sopenharmony_ci * the same time - resync/recovery can only happen when there 9462306a36Sopenharmony_ci * is no other IO. So when either is active, the other has to wait. 9562306a36Sopenharmony_ci * See more details description in raid1.c near raise_barrier(). 9662306a36Sopenharmony_ci */ 9762306a36Sopenharmony_ci wait_queue_head_t wait_barrier; 9862306a36Sopenharmony_ci spinlock_t resync_lock; 9962306a36Sopenharmony_ci atomic_t nr_sync_pending; 10062306a36Sopenharmony_ci atomic_t *nr_pending; 10162306a36Sopenharmony_ci atomic_t *nr_waiting; 10262306a36Sopenharmony_ci atomic_t *nr_queued; 10362306a36Sopenharmony_ci atomic_t *barrier; 10462306a36Sopenharmony_ci int array_frozen; 10562306a36Sopenharmony_ci 10662306a36Sopenharmony_ci /* Set to 1 if a full sync is needed, (fresh device added). 10762306a36Sopenharmony_ci * Cleared when a sync completes. 10862306a36Sopenharmony_ci */ 10962306a36Sopenharmony_ci int fullsync; 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_ci /* When the same as mddev->recovery_disabled we don't allow 11262306a36Sopenharmony_ci * recovery to be attempted as we expect a read error. 11362306a36Sopenharmony_ci */ 11462306a36Sopenharmony_ci int recovery_disabled; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci /* poolinfo contains information about the content of the 11762306a36Sopenharmony_ci * mempools - it changes when the array grows or shrinks 11862306a36Sopenharmony_ci */ 11962306a36Sopenharmony_ci struct pool_info *poolinfo; 12062306a36Sopenharmony_ci mempool_t r1bio_pool; 12162306a36Sopenharmony_ci mempool_t r1buf_pool; 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ci struct bio_set bio_split; 12462306a36Sopenharmony_ci 12562306a36Sopenharmony_ci /* temporary buffer to synchronous IO when attempting to repair 12662306a36Sopenharmony_ci * a read error. 12762306a36Sopenharmony_ci */ 12862306a36Sopenharmony_ci struct page *tmppage; 12962306a36Sopenharmony_ci 13062306a36Sopenharmony_ci /* When taking over an array from a different personality, we store 13162306a36Sopenharmony_ci * the new thread here until we fully activate the array. 13262306a36Sopenharmony_ci */ 13362306a36Sopenharmony_ci struct md_thread __rcu *thread; 13462306a36Sopenharmony_ci 13562306a36Sopenharmony_ci /* Keep track of cluster resync window to send to other 13662306a36Sopenharmony_ci * nodes. 13762306a36Sopenharmony_ci */ 13862306a36Sopenharmony_ci sector_t cluster_sync_low; 13962306a36Sopenharmony_ci sector_t cluster_sync_high; 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci}; 14262306a36Sopenharmony_ci 14362306a36Sopenharmony_ci/* 14462306a36Sopenharmony_ci * this is our 'private' RAID1 bio. 14562306a36Sopenharmony_ci * 14662306a36Sopenharmony_ci * it contains information about what kind of IO operations were started 14762306a36Sopenharmony_ci * for this RAID1 operation, and about their status: 14862306a36Sopenharmony_ci */ 14962306a36Sopenharmony_ci 15062306a36Sopenharmony_cistruct r1bio { 15162306a36Sopenharmony_ci atomic_t remaining; /* 'have we finished' count, 15262306a36Sopenharmony_ci * used from IRQ handlers 15362306a36Sopenharmony_ci */ 15462306a36Sopenharmony_ci atomic_t behind_remaining; /* number of write-behind ios remaining 15562306a36Sopenharmony_ci * in this BehindIO request 15662306a36Sopenharmony_ci */ 15762306a36Sopenharmony_ci sector_t sector; 15862306a36Sopenharmony_ci int sectors; 15962306a36Sopenharmony_ci unsigned long state; 16062306a36Sopenharmony_ci struct mddev *mddev; 16162306a36Sopenharmony_ci /* 16262306a36Sopenharmony_ci * original bio going to /dev/mdx 16362306a36Sopenharmony_ci */ 16462306a36Sopenharmony_ci struct bio *master_bio; 16562306a36Sopenharmony_ci /* 16662306a36Sopenharmony_ci * if the IO is in READ direction, then this is where we read 16762306a36Sopenharmony_ci */ 16862306a36Sopenharmony_ci int read_disk; 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_ci struct list_head retry_list; 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_ci /* 17362306a36Sopenharmony_ci * When R1BIO_BehindIO is set, we store pages for write behind 17462306a36Sopenharmony_ci * in behind_master_bio. 17562306a36Sopenharmony_ci */ 17662306a36Sopenharmony_ci struct bio *behind_master_bio; 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci /* 17962306a36Sopenharmony_ci * if the IO is in WRITE direction, then multiple bios are used. 18062306a36Sopenharmony_ci * We choose the number when they are allocated. 18162306a36Sopenharmony_ci */ 18262306a36Sopenharmony_ci struct bio *bios[]; 18362306a36Sopenharmony_ci /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 18462306a36Sopenharmony_ci}; 18562306a36Sopenharmony_ci 18662306a36Sopenharmony_ci/* bits for r1bio.state */ 18762306a36Sopenharmony_cienum r1bio_state { 18862306a36Sopenharmony_ci R1BIO_Uptodate, 18962306a36Sopenharmony_ci R1BIO_IsSync, 19062306a36Sopenharmony_ci R1BIO_Degraded, 19162306a36Sopenharmony_ci R1BIO_BehindIO, 19262306a36Sopenharmony_ci/* Set ReadError on bios that experience a readerror so that 19362306a36Sopenharmony_ci * raid1d knows what to do with them. 19462306a36Sopenharmony_ci */ 19562306a36Sopenharmony_ci R1BIO_ReadError, 19662306a36Sopenharmony_ci/* For write-behind requests, we call bi_end_io when 19762306a36Sopenharmony_ci * the last non-write-behind device completes, providing 19862306a36Sopenharmony_ci * any write was successful. Otherwise we call when 19962306a36Sopenharmony_ci * any write-behind write succeeds, otherwise we call 20062306a36Sopenharmony_ci * with failure when last write completes (and all failed). 20162306a36Sopenharmony_ci * Record that bi_end_io was called with this flag... 20262306a36Sopenharmony_ci */ 20362306a36Sopenharmony_ci R1BIO_Returned, 20462306a36Sopenharmony_ci/* If a write for this request means we can clear some 20562306a36Sopenharmony_ci * known-bad-block records, we set this flag 20662306a36Sopenharmony_ci */ 20762306a36Sopenharmony_ci R1BIO_MadeGood, 20862306a36Sopenharmony_ci R1BIO_WriteError, 20962306a36Sopenharmony_ci R1BIO_FailFast, 21062306a36Sopenharmony_ci}; 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_cistatic inline int sector_to_idx(sector_t sector) 21362306a36Sopenharmony_ci{ 21462306a36Sopenharmony_ci return hash_long(sector >> BARRIER_UNIT_SECTOR_BITS, 21562306a36Sopenharmony_ci BARRIER_BUCKETS_NR_BITS); 21662306a36Sopenharmony_ci} 21762306a36Sopenharmony_ci#endif 218