162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2015 Shaohua Li <shli@fb.com> 462306a36Sopenharmony_ci * Copyright (C) 2016 Song Liu <songliubraving@fb.com> 562306a36Sopenharmony_ci */ 662306a36Sopenharmony_ci#include <linux/kernel.h> 762306a36Sopenharmony_ci#include <linux/wait.h> 862306a36Sopenharmony_ci#include <linux/blkdev.h> 962306a36Sopenharmony_ci#include <linux/slab.h> 1062306a36Sopenharmony_ci#include <linux/raid/md_p.h> 1162306a36Sopenharmony_ci#include <linux/crc32c.h> 1262306a36Sopenharmony_ci#include <linux/random.h> 1362306a36Sopenharmony_ci#include <linux/kthread.h> 1462306a36Sopenharmony_ci#include <linux/types.h> 1562306a36Sopenharmony_ci#include "md.h" 1662306a36Sopenharmony_ci#include "raid5.h" 1762306a36Sopenharmony_ci#include "md-bitmap.h" 1862306a36Sopenharmony_ci#include "raid5-log.h" 1962306a36Sopenharmony_ci 2062306a36Sopenharmony_ci/* 2162306a36Sopenharmony_ci * metadata/data stored in disk with 4k size unit (a block) regardless 2262306a36Sopenharmony_ci * underneath hardware sector size. only works with PAGE_SIZE == 4096 2362306a36Sopenharmony_ci */ 2462306a36Sopenharmony_ci#define BLOCK_SECTORS (8) 2562306a36Sopenharmony_ci#define BLOCK_SECTOR_SHIFT (3) 2662306a36Sopenharmony_ci 2762306a36Sopenharmony_ci/* 2862306a36Sopenharmony_ci * log->max_free_space is min(1/4 disk size, 10G reclaimable space). 2962306a36Sopenharmony_ci * 3062306a36Sopenharmony_ci * In write through mode, the reclaim runs every log->max_free_space. 
3162306a36Sopenharmony_ci * This can prevent the recovery scans for too long 3262306a36Sopenharmony_ci */ 3362306a36Sopenharmony_ci#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ 3462306a36Sopenharmony_ci#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_ci/* wake up reclaim thread periodically */ 3762306a36Sopenharmony_ci#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ) 3862306a36Sopenharmony_ci/* start flush with these full stripes */ 3962306a36Sopenharmony_ci#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4) 4062306a36Sopenharmony_ci/* reclaim stripes in groups */ 4162306a36Sopenharmony_ci#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) 4262306a36Sopenharmony_ci 4362306a36Sopenharmony_ci/* 4462306a36Sopenharmony_ci * We only need 2 bios per I/O unit to make progress, but ensure we 4562306a36Sopenharmony_ci * have a few more available to not get too tight. 4662306a36Sopenharmony_ci */ 4762306a36Sopenharmony_ci#define R5L_POOL_SIZE 4 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistatic char *r5c_journal_mode_str[] = {"write-through", 5062306a36Sopenharmony_ci "write-back"}; 5162306a36Sopenharmony_ci/* 5262306a36Sopenharmony_ci * raid5 cache state machine 5362306a36Sopenharmony_ci * 5462306a36Sopenharmony_ci * With the RAID cache, each stripe works in two phases: 5562306a36Sopenharmony_ci * - caching phase 5662306a36Sopenharmony_ci * - writing-out phase 5762306a36Sopenharmony_ci * 5862306a36Sopenharmony_ci * These two phases are controlled by bit STRIPE_R5C_CACHING: 5962306a36Sopenharmony_ci * if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase 6062306a36Sopenharmony_ci * if STRIPE_R5C_CACHING == 1, the stripe is in caching phase 6162306a36Sopenharmony_ci * 6262306a36Sopenharmony_ci * When there is no journal, or the journal is in write-through mode, 6362306a36Sopenharmony_ci * the stripe is always in writing-out phase. 
6462306a36Sopenharmony_ci * 6562306a36Sopenharmony_ci * For write-back journal, the stripe is sent to caching phase on write 6662306a36Sopenharmony_ci * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off 6762306a36Sopenharmony_ci * the write-out phase by clearing STRIPE_R5C_CACHING. 6862306a36Sopenharmony_ci * 6962306a36Sopenharmony_ci * Stripes in caching phase do not write the raid disks. Instead, all 7062306a36Sopenharmony_ci * writes are committed from the log device. Therefore, a stripe in 7162306a36Sopenharmony_ci * caching phase handles writes as: 7262306a36Sopenharmony_ci * - write to log device 7362306a36Sopenharmony_ci * - return IO 7462306a36Sopenharmony_ci * 7562306a36Sopenharmony_ci * Stripes in writing-out phase handle writes as: 7662306a36Sopenharmony_ci * - calculate parity 7762306a36Sopenharmony_ci * - write pending data and parity to journal 7862306a36Sopenharmony_ci * - write data and parity to raid disks 7962306a36Sopenharmony_ci * - return IO for pending writes 8062306a36Sopenharmony_ci */ 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_cistruct r5l_log { 8362306a36Sopenharmony_ci struct md_rdev *rdev; 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci u32 uuid_checksum; 8662306a36Sopenharmony_ci 8762306a36Sopenharmony_ci sector_t device_size; /* log device size, round to 8862306a36Sopenharmony_ci * BLOCK_SECTORS */ 8962306a36Sopenharmony_ci sector_t max_free_space; /* reclaim run if free space is at 9062306a36Sopenharmony_ci * this size */ 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_ci sector_t last_checkpoint; /* log tail. where recovery scan 9362306a36Sopenharmony_ci * starts from */ 9462306a36Sopenharmony_ci u64 last_cp_seq; /* log tail sequence */ 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci sector_t log_start; /* log head. 
where new data appends */ 9762306a36Sopenharmony_ci u64 seq; /* log head sequence */ 9862306a36Sopenharmony_ci 9962306a36Sopenharmony_ci sector_t next_checkpoint; 10062306a36Sopenharmony_ci 10162306a36Sopenharmony_ci struct mutex io_mutex; 10262306a36Sopenharmony_ci struct r5l_io_unit *current_io; /* current io_unit accepting new data */ 10362306a36Sopenharmony_ci 10462306a36Sopenharmony_ci spinlock_t io_list_lock; 10562306a36Sopenharmony_ci struct list_head running_ios; /* io_units which are still running, 10662306a36Sopenharmony_ci * and have not yet been completely 10762306a36Sopenharmony_ci * written to the log */ 10862306a36Sopenharmony_ci struct list_head io_end_ios; /* io_units which have been completely 10962306a36Sopenharmony_ci * written to the log but not yet written 11062306a36Sopenharmony_ci * to the RAID */ 11162306a36Sopenharmony_ci struct list_head flushing_ios; /* io_units which are waiting for log 11262306a36Sopenharmony_ci * cache flush */ 11362306a36Sopenharmony_ci struct list_head finished_ios; /* io_units which settle down in log disk */ 11462306a36Sopenharmony_ci struct bio flush_bio; 11562306a36Sopenharmony_ci 11662306a36Sopenharmony_ci struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */ 11762306a36Sopenharmony_ci 11862306a36Sopenharmony_ci struct kmem_cache *io_kc; 11962306a36Sopenharmony_ci mempool_t io_pool; 12062306a36Sopenharmony_ci struct bio_set bs; 12162306a36Sopenharmony_ci mempool_t meta_pool; 12262306a36Sopenharmony_ci 12362306a36Sopenharmony_ci struct md_thread __rcu *reclaim_thread; 12462306a36Sopenharmony_ci unsigned long reclaim_target; /* number of space that need to be 12562306a36Sopenharmony_ci * reclaimed. 
if it's 0, reclaim spaces 12662306a36Sopenharmony_ci * used by io_units which are in 12762306a36Sopenharmony_ci * IO_UNIT_STRIPE_END state (eg, reclaim 12862306a36Sopenharmony_ci * doesn't wait for specific io_unit 12962306a36Sopenharmony_ci * switching to IO_UNIT_STRIPE_END 13062306a36Sopenharmony_ci * state) */ 13162306a36Sopenharmony_ci wait_queue_head_t iounit_wait; 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci struct list_head no_space_stripes; /* pending stripes, log has no space */ 13462306a36Sopenharmony_ci spinlock_t no_space_stripes_lock; 13562306a36Sopenharmony_ci 13662306a36Sopenharmony_ci bool need_cache_flush; 13762306a36Sopenharmony_ci 13862306a36Sopenharmony_ci /* for r5c_cache */ 13962306a36Sopenharmony_ci enum r5c_journal_mode r5c_journal_mode; 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci /* all stripes in r5cache, in the order of seq at sh->log_start */ 14262306a36Sopenharmony_ci struct list_head stripe_in_journal_list; 14362306a36Sopenharmony_ci 14462306a36Sopenharmony_ci spinlock_t stripe_in_journal_lock; 14562306a36Sopenharmony_ci atomic_t stripe_in_journal_count; 14662306a36Sopenharmony_ci 14762306a36Sopenharmony_ci /* to submit async io_units, to fulfill ordering of flush */ 14862306a36Sopenharmony_ci struct work_struct deferred_io_work; 14962306a36Sopenharmony_ci /* to disable write back during in degraded mode */ 15062306a36Sopenharmony_ci struct work_struct disable_writeback_work; 15162306a36Sopenharmony_ci 15262306a36Sopenharmony_ci /* to for chunk_aligned_read in writeback mode, details below */ 15362306a36Sopenharmony_ci spinlock_t tree_lock; 15462306a36Sopenharmony_ci struct radix_tree_root big_stripe_tree; 15562306a36Sopenharmony_ci}; 15662306a36Sopenharmony_ci 15762306a36Sopenharmony_ci/* 15862306a36Sopenharmony_ci * Enable chunk_aligned_read() with write back cache. 
15962306a36Sopenharmony_ci * 16062306a36Sopenharmony_ci * Each chunk may contain more than one stripe (for example, a 256kB 16162306a36Sopenharmony_ci * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For 16262306a36Sopenharmony_ci * chunk_aligned_read, these stripes are grouped into one "big_stripe". 16362306a36Sopenharmony_ci * For each big_stripe, we count how many stripes of this big_stripe 16462306a36Sopenharmony_ci * are in the write back cache. These data are tracked in a radix tree 16562306a36Sopenharmony_ci * (big_stripe_tree). We use radix_tree item pointer as the counter. 16662306a36Sopenharmony_ci * r5c_tree_index() is used to calculate keys for the radix tree. 16762306a36Sopenharmony_ci * 16862306a36Sopenharmony_ci * chunk_aligned_read() calls r5c_big_stripe_cached() to look up 16962306a36Sopenharmony_ci * big_stripe of each chunk in the tree. If this big_stripe is in the 17062306a36Sopenharmony_ci * tree, chunk_aligned_read() aborts. This look up is protected by 17162306a36Sopenharmony_ci * rcu_read_lock(). 17262306a36Sopenharmony_ci * 17362306a36Sopenharmony_ci * It is necessary to remember whether a stripe is counted in 17462306a36Sopenharmony_ci * big_stripe_tree. Instead of adding new flag, we reuses existing flags: 17562306a36Sopenharmony_ci * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these 17662306a36Sopenharmony_ci * two flags are set, the stripe is counted in big_stripe_tree. This 17762306a36Sopenharmony_ci * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to 17862306a36Sopenharmony_ci * r5c_try_caching_write(); and moving clear_bit of 17962306a36Sopenharmony_ci * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to 18062306a36Sopenharmony_ci * r5c_finish_stripe_write_out(). 18162306a36Sopenharmony_ci */ 18262306a36Sopenharmony_ci 18362306a36Sopenharmony_ci/* 18462306a36Sopenharmony_ci * radix tree requests lowest 2 bits of data pointer to be 2b'00. 
18562306a36Sopenharmony_ci * So it is necessary to left shift the counter by 2 bits before using it 18662306a36Sopenharmony_ci * as data pointer of the tree. 18762306a36Sopenharmony_ci */ 18862306a36Sopenharmony_ci#define R5C_RADIX_COUNT_SHIFT 2 18962306a36Sopenharmony_ci 19062306a36Sopenharmony_ci/* 19162306a36Sopenharmony_ci * calculate key for big_stripe_tree 19262306a36Sopenharmony_ci * 19362306a36Sopenharmony_ci * sect: align_bi->bi_iter.bi_sector or sh->sector 19462306a36Sopenharmony_ci */ 19562306a36Sopenharmony_cistatic inline sector_t r5c_tree_index(struct r5conf *conf, 19662306a36Sopenharmony_ci sector_t sect) 19762306a36Sopenharmony_ci{ 19862306a36Sopenharmony_ci sector_div(sect, conf->chunk_sectors); 19962306a36Sopenharmony_ci return sect; 20062306a36Sopenharmony_ci} 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_ci/* 20362306a36Sopenharmony_ci * an IO range starts from a meta data block and end at the next meta data 20462306a36Sopenharmony_ci * block. The io unit's the meta data block tracks data/parity followed it. 
io 20562306a36Sopenharmony_ci * unit is written to log disk with normal write, as we always flush log disk 20662306a36Sopenharmony_ci * first and then start move data to raid disks, there is no requirement to 20762306a36Sopenharmony_ci * write io unit with FLUSH/FUA 20862306a36Sopenharmony_ci */ 20962306a36Sopenharmony_cistruct r5l_io_unit { 21062306a36Sopenharmony_ci struct r5l_log *log; 21162306a36Sopenharmony_ci 21262306a36Sopenharmony_ci struct page *meta_page; /* store meta block */ 21362306a36Sopenharmony_ci int meta_offset; /* current offset in meta_page */ 21462306a36Sopenharmony_ci 21562306a36Sopenharmony_ci struct bio *current_bio;/* current_bio accepting new data */ 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci atomic_t pending_stripe;/* how many stripes not flushed to raid */ 21862306a36Sopenharmony_ci u64 seq; /* seq number of the metablock */ 21962306a36Sopenharmony_ci sector_t log_start; /* where the io_unit starts */ 22062306a36Sopenharmony_ci sector_t log_end; /* where the io_unit ends */ 22162306a36Sopenharmony_ci struct list_head log_sibling; /* log->running_ios */ 22262306a36Sopenharmony_ci struct list_head stripe_list; /* stripes added to the io_unit */ 22362306a36Sopenharmony_ci 22462306a36Sopenharmony_ci int state; 22562306a36Sopenharmony_ci bool need_split_bio; 22662306a36Sopenharmony_ci struct bio *split_bio; 22762306a36Sopenharmony_ci 22862306a36Sopenharmony_ci unsigned int has_flush:1; /* include flush request */ 22962306a36Sopenharmony_ci unsigned int has_fua:1; /* include fua request */ 23062306a36Sopenharmony_ci unsigned int has_null_flush:1; /* include null flush request */ 23162306a36Sopenharmony_ci unsigned int has_flush_payload:1; /* include flush payload */ 23262306a36Sopenharmony_ci /* 23362306a36Sopenharmony_ci * io isn't sent yet, flush/fua request can only be submitted till it's 23462306a36Sopenharmony_ci * the first IO in running_ios list 23562306a36Sopenharmony_ci */ 23662306a36Sopenharmony_ci unsigned int 
io_deferred:1; 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci struct bio_list flush_barriers; /* size == 0 flush bios */ 23962306a36Sopenharmony_ci}; 24062306a36Sopenharmony_ci 24162306a36Sopenharmony_ci/* r5l_io_unit state */ 24262306a36Sopenharmony_cienum r5l_io_unit_state { 24362306a36Sopenharmony_ci IO_UNIT_RUNNING = 0, /* accepting new IO */ 24462306a36Sopenharmony_ci IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, 24562306a36Sopenharmony_ci * don't accepting new bio */ 24662306a36Sopenharmony_ci IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ 24762306a36Sopenharmony_ci IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ 24862306a36Sopenharmony_ci}; 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_cibool r5c_is_writeback(struct r5l_log *log) 25162306a36Sopenharmony_ci{ 25262306a36Sopenharmony_ci return (log != NULL && 25362306a36Sopenharmony_ci log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK); 25462306a36Sopenharmony_ci} 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_cistatic sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) 25762306a36Sopenharmony_ci{ 25862306a36Sopenharmony_ci start += inc; 25962306a36Sopenharmony_ci if (start >= log->device_size) 26062306a36Sopenharmony_ci start = start - log->device_size; 26162306a36Sopenharmony_ci return start; 26262306a36Sopenharmony_ci} 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_cistatic sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, 26562306a36Sopenharmony_ci sector_t end) 26662306a36Sopenharmony_ci{ 26762306a36Sopenharmony_ci if (end >= start) 26862306a36Sopenharmony_ci return end - start; 26962306a36Sopenharmony_ci else 27062306a36Sopenharmony_ci return end + log->device_size - start; 27162306a36Sopenharmony_ci} 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_cistatic bool r5l_has_free_space(struct r5l_log *log, sector_t size) 27462306a36Sopenharmony_ci{ 27562306a36Sopenharmony_ci sector_t used_size; 
27662306a36Sopenharmony_ci 27762306a36Sopenharmony_ci used_size = r5l_ring_distance(log, log->last_checkpoint, 27862306a36Sopenharmony_ci log->log_start); 27962306a36Sopenharmony_ci 28062306a36Sopenharmony_ci return log->device_size > used_size + size; 28162306a36Sopenharmony_ci} 28262306a36Sopenharmony_ci 28362306a36Sopenharmony_cistatic void __r5l_set_io_unit_state(struct r5l_io_unit *io, 28462306a36Sopenharmony_ci enum r5l_io_unit_state state) 28562306a36Sopenharmony_ci{ 28662306a36Sopenharmony_ci if (WARN_ON(io->state >= state)) 28762306a36Sopenharmony_ci return; 28862306a36Sopenharmony_ci io->state = state; 28962306a36Sopenharmony_ci} 29062306a36Sopenharmony_ci 29162306a36Sopenharmony_cistatic void 29262306a36Sopenharmony_cir5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) 29362306a36Sopenharmony_ci{ 29462306a36Sopenharmony_ci struct bio *wbi, *wbi2; 29562306a36Sopenharmony_ci 29662306a36Sopenharmony_ci wbi = dev->written; 29762306a36Sopenharmony_ci dev->written = NULL; 29862306a36Sopenharmony_ci while (wbi && wbi->bi_iter.bi_sector < 29962306a36Sopenharmony_ci dev->sector + RAID5_STRIPE_SECTORS(conf)) { 30062306a36Sopenharmony_ci wbi2 = r5_next_bio(conf, wbi, dev->sector); 30162306a36Sopenharmony_ci md_write_end(conf->mddev); 30262306a36Sopenharmony_ci bio_endio(wbi); 30362306a36Sopenharmony_ci wbi = wbi2; 30462306a36Sopenharmony_ci } 30562306a36Sopenharmony_ci} 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_civoid r5c_handle_cached_data_endio(struct r5conf *conf, 30862306a36Sopenharmony_ci struct stripe_head *sh, int disks) 30962306a36Sopenharmony_ci{ 31062306a36Sopenharmony_ci int i; 31162306a36Sopenharmony_ci 31262306a36Sopenharmony_ci for (i = sh->disks; i--; ) { 31362306a36Sopenharmony_ci if (sh->dev[i].written) { 31462306a36Sopenharmony_ci set_bit(R5_UPTODATE, &sh->dev[i].flags); 31562306a36Sopenharmony_ci r5c_return_dev_pending_writes(conf, &sh->dev[i]); 31662306a36Sopenharmony_ci md_bitmap_endwrite(conf->mddev->bitmap, 
sh->sector, 31762306a36Sopenharmony_ci RAID5_STRIPE_SECTORS(conf), 31862306a36Sopenharmony_ci !test_bit(STRIPE_DEGRADED, &sh->state), 31962306a36Sopenharmony_ci 0); 32062306a36Sopenharmony_ci } 32162306a36Sopenharmony_ci } 32262306a36Sopenharmony_ci} 32362306a36Sopenharmony_ci 32462306a36Sopenharmony_civoid r5l_wake_reclaim(struct r5l_log *log, sector_t space); 32562306a36Sopenharmony_ci 32662306a36Sopenharmony_ci/* Check whether we should flush some stripes to free up stripe cache */ 32762306a36Sopenharmony_civoid r5c_check_stripe_cache_usage(struct r5conf *conf) 32862306a36Sopenharmony_ci{ 32962306a36Sopenharmony_ci int total_cached; 33062306a36Sopenharmony_ci 33162306a36Sopenharmony_ci if (!r5c_is_writeback(conf->log)) 33262306a36Sopenharmony_ci return; 33362306a36Sopenharmony_ci 33462306a36Sopenharmony_ci total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 33562306a36Sopenharmony_ci atomic_read(&conf->r5c_cached_full_stripes); 33662306a36Sopenharmony_ci 33762306a36Sopenharmony_ci /* 33862306a36Sopenharmony_ci * The following condition is true for either of the following: 33962306a36Sopenharmony_ci * - stripe cache pressure high: 34062306a36Sopenharmony_ci * total_cached > 3/4 min_nr_stripes || 34162306a36Sopenharmony_ci * empty_inactive_list_nr > 0 34262306a36Sopenharmony_ci * - stripe cache pressure moderate: 34362306a36Sopenharmony_ci * total_cached > 1/2 min_nr_stripes 34462306a36Sopenharmony_ci */ 34562306a36Sopenharmony_ci if (total_cached > conf->min_nr_stripes * 1 / 2 || 34662306a36Sopenharmony_ci atomic_read(&conf->empty_inactive_list_nr) > 0) 34762306a36Sopenharmony_ci r5l_wake_reclaim(conf->log, 0); 34862306a36Sopenharmony_ci} 34962306a36Sopenharmony_ci 35062306a36Sopenharmony_ci/* 35162306a36Sopenharmony_ci * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full 35262306a36Sopenharmony_ci * stripes in the cache 35362306a36Sopenharmony_ci */ 35462306a36Sopenharmony_civoid r5c_check_cached_full_stripe(struct r5conf *conf) 
35562306a36Sopenharmony_ci{ 35662306a36Sopenharmony_ci if (!r5c_is_writeback(conf->log)) 35762306a36Sopenharmony_ci return; 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci /* 36062306a36Sopenharmony_ci * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes 36162306a36Sopenharmony_ci * or a full stripe (chunk size / 4k stripes). 36262306a36Sopenharmony_ci */ 36362306a36Sopenharmony_ci if (atomic_read(&conf->r5c_cached_full_stripes) >= 36462306a36Sopenharmony_ci min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), 36562306a36Sopenharmony_ci conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) 36662306a36Sopenharmony_ci r5l_wake_reclaim(conf->log, 0); 36762306a36Sopenharmony_ci} 36862306a36Sopenharmony_ci 36962306a36Sopenharmony_ci/* 37062306a36Sopenharmony_ci * Total log space (in sectors) needed to flush all data in cache 37162306a36Sopenharmony_ci * 37262306a36Sopenharmony_ci * To avoid deadlock due to log space, it is necessary to reserve log 37362306a36Sopenharmony_ci * space to flush critical stripes (stripes that occupying log space near 37462306a36Sopenharmony_ci * last_checkpoint). This function helps check how much log space is 37562306a36Sopenharmony_ci * required to flush all cached stripes. 37662306a36Sopenharmony_ci * 37762306a36Sopenharmony_ci * To reduce log space requirements, two mechanisms are used to give cache 37862306a36Sopenharmony_ci * flush higher priorities: 37962306a36Sopenharmony_ci * 1. In handle_stripe_dirtying() and schedule_reconstruction(), 38062306a36Sopenharmony_ci * stripes ALREADY in journal can be flushed w/o pending writes; 38162306a36Sopenharmony_ci * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal 38262306a36Sopenharmony_ci * can be delayed (r5l_add_no_space_stripe). 38362306a36Sopenharmony_ci * 38462306a36Sopenharmony_ci * In cache flush, the stripe goes through 1 and then 2. 
For a stripe that 38562306a36Sopenharmony_ci * already passed 1, flushing it requires at most (conf->max_degraded + 1) 38662306a36Sopenharmony_ci * pages of journal space. For stripes that has not passed 1, flushing it 38762306a36Sopenharmony_ci * requires (conf->raid_disks + 1) pages of journal space. There are at 38862306a36Sopenharmony_ci * most (conf->group_cnt + 1) stripe that passed 1. So total journal space 38962306a36Sopenharmony_ci * required to flush all cached stripes (in pages) is: 39062306a36Sopenharmony_ci * 39162306a36Sopenharmony_ci * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + 39262306a36Sopenharmony_ci * (group_cnt + 1) * (raid_disks + 1) 39362306a36Sopenharmony_ci * or 39462306a36Sopenharmony_ci * (stripe_in_journal_count) * (max_degraded + 1) + 39562306a36Sopenharmony_ci * (group_cnt + 1) * (raid_disks - max_degraded) 39662306a36Sopenharmony_ci */ 39762306a36Sopenharmony_cistatic sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) 39862306a36Sopenharmony_ci{ 39962306a36Sopenharmony_ci struct r5l_log *log = conf->log; 40062306a36Sopenharmony_ci 40162306a36Sopenharmony_ci if (!r5c_is_writeback(log)) 40262306a36Sopenharmony_ci return 0; 40362306a36Sopenharmony_ci 40462306a36Sopenharmony_ci return BLOCK_SECTORS * 40562306a36Sopenharmony_ci ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + 40662306a36Sopenharmony_ci (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); 40762306a36Sopenharmony_ci} 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci/* 41062306a36Sopenharmony_ci * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL 41162306a36Sopenharmony_ci * 41262306a36Sopenharmony_ci * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of 41362306a36Sopenharmony_ci * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log 41462306a36Sopenharmony_ci * device is less than 2x of reclaim_required_space. 
41562306a36Sopenharmony_ci */ 41662306a36Sopenharmony_cistatic inline void r5c_update_log_state(struct r5l_log *log) 41762306a36Sopenharmony_ci{ 41862306a36Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 41962306a36Sopenharmony_ci sector_t free_space; 42062306a36Sopenharmony_ci sector_t reclaim_space; 42162306a36Sopenharmony_ci bool wake_reclaim = false; 42262306a36Sopenharmony_ci 42362306a36Sopenharmony_ci if (!r5c_is_writeback(log)) 42462306a36Sopenharmony_ci return; 42562306a36Sopenharmony_ci 42662306a36Sopenharmony_ci free_space = r5l_ring_distance(log, log->log_start, 42762306a36Sopenharmony_ci log->last_checkpoint); 42862306a36Sopenharmony_ci reclaim_space = r5c_log_required_to_flush_cache(conf); 42962306a36Sopenharmony_ci if (free_space < 2 * reclaim_space) 43062306a36Sopenharmony_ci set_bit(R5C_LOG_CRITICAL, &conf->cache_state); 43162306a36Sopenharmony_ci else { 43262306a36Sopenharmony_ci if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 43362306a36Sopenharmony_ci wake_reclaim = true; 43462306a36Sopenharmony_ci clear_bit(R5C_LOG_CRITICAL, &conf->cache_state); 43562306a36Sopenharmony_ci } 43662306a36Sopenharmony_ci if (free_space < 3 * reclaim_space) 43762306a36Sopenharmony_ci set_bit(R5C_LOG_TIGHT, &conf->cache_state); 43862306a36Sopenharmony_ci else 43962306a36Sopenharmony_ci clear_bit(R5C_LOG_TIGHT, &conf->cache_state); 44062306a36Sopenharmony_ci 44162306a36Sopenharmony_ci if (wake_reclaim) 44262306a36Sopenharmony_ci r5l_wake_reclaim(log, 0); 44362306a36Sopenharmony_ci} 44462306a36Sopenharmony_ci 44562306a36Sopenharmony_ci/* 44662306a36Sopenharmony_ci * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING. 44762306a36Sopenharmony_ci * This function should only be called in write-back mode. 
44862306a36Sopenharmony_ci */ 44962306a36Sopenharmony_civoid r5c_make_stripe_write_out(struct stripe_head *sh) 45062306a36Sopenharmony_ci{ 45162306a36Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 45262306a36Sopenharmony_ci struct r5l_log *log = conf->log; 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_ci BUG_ON(!r5c_is_writeback(log)); 45562306a36Sopenharmony_ci 45662306a36Sopenharmony_ci WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 45762306a36Sopenharmony_ci clear_bit(STRIPE_R5C_CACHING, &sh->state); 45862306a36Sopenharmony_ci 45962306a36Sopenharmony_ci if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 46062306a36Sopenharmony_ci atomic_inc(&conf->preread_active_stripes); 46162306a36Sopenharmony_ci} 46262306a36Sopenharmony_ci 46362306a36Sopenharmony_cistatic void r5c_handle_data_cached(struct stripe_head *sh) 46462306a36Sopenharmony_ci{ 46562306a36Sopenharmony_ci int i; 46662306a36Sopenharmony_ci 46762306a36Sopenharmony_ci for (i = sh->disks; i--; ) 46862306a36Sopenharmony_ci if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { 46962306a36Sopenharmony_ci set_bit(R5_InJournal, &sh->dev[i].flags); 47062306a36Sopenharmony_ci clear_bit(R5_LOCKED, &sh->dev[i].flags); 47162306a36Sopenharmony_ci } 47262306a36Sopenharmony_ci clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 47362306a36Sopenharmony_ci} 47462306a36Sopenharmony_ci 47562306a36Sopenharmony_ci/* 47662306a36Sopenharmony_ci * this journal write must contain full parity, 47762306a36Sopenharmony_ci * it may also contain some data pages 47862306a36Sopenharmony_ci */ 47962306a36Sopenharmony_cistatic void r5c_handle_parity_cached(struct stripe_head *sh) 48062306a36Sopenharmony_ci{ 48162306a36Sopenharmony_ci int i; 48262306a36Sopenharmony_ci 48362306a36Sopenharmony_ci for (i = sh->disks; i--; ) 48462306a36Sopenharmony_ci if (test_bit(R5_InJournal, &sh->dev[i].flags)) 48562306a36Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[i].flags); 48662306a36Sopenharmony_ci} 48762306a36Sopenharmony_ci 
48862306a36Sopenharmony_ci/* 48962306a36Sopenharmony_ci * Setting proper flags after writing (or flushing) data and/or parity to the 49062306a36Sopenharmony_ci * log device. This is called from r5l_log_endio() or r5l_log_flush_endio(). 49162306a36Sopenharmony_ci */ 49262306a36Sopenharmony_cistatic void r5c_finish_cache_stripe(struct stripe_head *sh) 49362306a36Sopenharmony_ci{ 49462306a36Sopenharmony_ci struct r5l_log *log = sh->raid_conf->log; 49562306a36Sopenharmony_ci 49662306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 49762306a36Sopenharmony_ci BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 49862306a36Sopenharmony_ci /* 49962306a36Sopenharmony_ci * Set R5_InJournal for parity dev[pd_idx]. This means 50062306a36Sopenharmony_ci * all data AND parity in the journal. For RAID 6, it is 50162306a36Sopenharmony_ci * NOT necessary to set the flag for dev[qd_idx], as the 50262306a36Sopenharmony_ci * two parities are written out together. 50362306a36Sopenharmony_ci */ 50462306a36Sopenharmony_ci set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 50562306a36Sopenharmony_ci } else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) { 50662306a36Sopenharmony_ci r5c_handle_data_cached(sh); 50762306a36Sopenharmony_ci } else { 50862306a36Sopenharmony_ci r5c_handle_parity_cached(sh); 50962306a36Sopenharmony_ci set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 51062306a36Sopenharmony_ci } 51162306a36Sopenharmony_ci} 51262306a36Sopenharmony_ci 51362306a36Sopenharmony_cistatic void r5l_io_run_stripes(struct r5l_io_unit *io) 51462306a36Sopenharmony_ci{ 51562306a36Sopenharmony_ci struct stripe_head *sh, *next; 51662306a36Sopenharmony_ci 51762306a36Sopenharmony_ci list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 51862306a36Sopenharmony_ci list_del_init(&sh->log_list); 51962306a36Sopenharmony_ci 52062306a36Sopenharmony_ci r5c_finish_cache_stripe(sh); 52162306a36Sopenharmony_ci 52262306a36Sopenharmony_ci set_bit(STRIPE_HANDLE, 
&sh->state); 52362306a36Sopenharmony_ci raid5_release_stripe(sh); 52462306a36Sopenharmony_ci } 52562306a36Sopenharmony_ci} 52662306a36Sopenharmony_ci 52762306a36Sopenharmony_cistatic void r5l_log_run_stripes(struct r5l_log *log) 52862306a36Sopenharmony_ci{ 52962306a36Sopenharmony_ci struct r5l_io_unit *io, *next; 53062306a36Sopenharmony_ci 53162306a36Sopenharmony_ci lockdep_assert_held(&log->io_list_lock); 53262306a36Sopenharmony_ci 53362306a36Sopenharmony_ci list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 53462306a36Sopenharmony_ci /* don't change list order */ 53562306a36Sopenharmony_ci if (io->state < IO_UNIT_IO_END) 53662306a36Sopenharmony_ci break; 53762306a36Sopenharmony_ci 53862306a36Sopenharmony_ci list_move_tail(&io->log_sibling, &log->finished_ios); 53962306a36Sopenharmony_ci r5l_io_run_stripes(io); 54062306a36Sopenharmony_ci } 54162306a36Sopenharmony_ci} 54262306a36Sopenharmony_ci 54362306a36Sopenharmony_cistatic void r5l_move_to_end_ios(struct r5l_log *log) 54462306a36Sopenharmony_ci{ 54562306a36Sopenharmony_ci struct r5l_io_unit *io, *next; 54662306a36Sopenharmony_ci 54762306a36Sopenharmony_ci lockdep_assert_held(&log->io_list_lock); 54862306a36Sopenharmony_ci 54962306a36Sopenharmony_ci list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { 55062306a36Sopenharmony_ci /* don't change list order */ 55162306a36Sopenharmony_ci if (io->state < IO_UNIT_IO_END) 55262306a36Sopenharmony_ci break; 55362306a36Sopenharmony_ci list_move_tail(&io->log_sibling, &log->io_end_ios); 55462306a36Sopenharmony_ci } 55562306a36Sopenharmony_ci} 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_cistatic void __r5l_stripe_write_finished(struct r5l_io_unit *io); 55862306a36Sopenharmony_cistatic void r5l_log_endio(struct bio *bio) 55962306a36Sopenharmony_ci{ 56062306a36Sopenharmony_ci struct r5l_io_unit *io = bio->bi_private; 56162306a36Sopenharmony_ci struct r5l_io_unit *io_deferred; 56262306a36Sopenharmony_ci struct r5l_log *log = io->log; 
56362306a36Sopenharmony_ci unsigned long flags; 56462306a36Sopenharmony_ci bool has_null_flush; 56562306a36Sopenharmony_ci bool has_flush_payload; 56662306a36Sopenharmony_ci 56762306a36Sopenharmony_ci if (bio->bi_status) 56862306a36Sopenharmony_ci md_error(log->rdev->mddev, log->rdev); 56962306a36Sopenharmony_ci 57062306a36Sopenharmony_ci bio_put(bio); 57162306a36Sopenharmony_ci mempool_free(io->meta_page, &log->meta_pool); 57262306a36Sopenharmony_ci 57362306a36Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 57462306a36Sopenharmony_ci __r5l_set_io_unit_state(io, IO_UNIT_IO_END); 57562306a36Sopenharmony_ci 57662306a36Sopenharmony_ci /* 57762306a36Sopenharmony_ci * if the io doesn't not have null_flush or flush payload, 57862306a36Sopenharmony_ci * it is not safe to access it after releasing io_list_lock. 57962306a36Sopenharmony_ci * Therefore, it is necessary to check the condition with 58062306a36Sopenharmony_ci * the lock held. 58162306a36Sopenharmony_ci */ 58262306a36Sopenharmony_ci has_null_flush = io->has_null_flush; 58362306a36Sopenharmony_ci has_flush_payload = io->has_flush_payload; 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci if (log->need_cache_flush && !list_empty(&io->stripe_list)) 58662306a36Sopenharmony_ci r5l_move_to_end_ios(log); 58762306a36Sopenharmony_ci else 58862306a36Sopenharmony_ci r5l_log_run_stripes(log); 58962306a36Sopenharmony_ci if (!list_empty(&log->running_ios)) { 59062306a36Sopenharmony_ci /* 59162306a36Sopenharmony_ci * FLUSH/FUA io_unit is deferred because of ordering, now we 59262306a36Sopenharmony_ci * can dispatch it 59362306a36Sopenharmony_ci */ 59462306a36Sopenharmony_ci io_deferred = list_first_entry(&log->running_ios, 59562306a36Sopenharmony_ci struct r5l_io_unit, log_sibling); 59662306a36Sopenharmony_ci if (io_deferred->io_deferred) 59762306a36Sopenharmony_ci schedule_work(&log->deferred_io_work); 59862306a36Sopenharmony_ci } 59962306a36Sopenharmony_ci 60062306a36Sopenharmony_ci 
spin_unlock_irqrestore(&log->io_list_lock, flags); 60162306a36Sopenharmony_ci 60262306a36Sopenharmony_ci if (log->need_cache_flush) 60362306a36Sopenharmony_ci md_wakeup_thread(log->rdev->mddev->thread); 60462306a36Sopenharmony_ci 60562306a36Sopenharmony_ci /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ 60662306a36Sopenharmony_ci if (has_null_flush) { 60762306a36Sopenharmony_ci struct bio *bi; 60862306a36Sopenharmony_ci 60962306a36Sopenharmony_ci WARN_ON(bio_list_empty(&io->flush_barriers)); 61062306a36Sopenharmony_ci while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { 61162306a36Sopenharmony_ci bio_endio(bi); 61262306a36Sopenharmony_ci if (atomic_dec_and_test(&io->pending_stripe)) { 61362306a36Sopenharmony_ci __r5l_stripe_write_finished(io); 61462306a36Sopenharmony_ci return; 61562306a36Sopenharmony_ci } 61662306a36Sopenharmony_ci } 61762306a36Sopenharmony_ci } 61862306a36Sopenharmony_ci /* decrease pending_stripe for flush payload */ 61962306a36Sopenharmony_ci if (has_flush_payload) 62062306a36Sopenharmony_ci if (atomic_dec_and_test(&io->pending_stripe)) 62162306a36Sopenharmony_ci __r5l_stripe_write_finished(io); 62262306a36Sopenharmony_ci} 62362306a36Sopenharmony_ci 62462306a36Sopenharmony_cistatic void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) 62562306a36Sopenharmony_ci{ 62662306a36Sopenharmony_ci unsigned long flags; 62762306a36Sopenharmony_ci 62862306a36Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 62962306a36Sopenharmony_ci __r5l_set_io_unit_state(io, IO_UNIT_IO_START); 63062306a36Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 63162306a36Sopenharmony_ci 63262306a36Sopenharmony_ci /* 63362306a36Sopenharmony_ci * In case of journal device failures, submit_bio will get error 63462306a36Sopenharmony_ci * and calls endio, then active stripes will continue write 63562306a36Sopenharmony_ci * process. 
Therefore, it is not necessary to check Faulty bit 63662306a36Sopenharmony_ci * of journal device here. 63762306a36Sopenharmony_ci * 63862306a36Sopenharmony_ci * We can't check split_bio after current_bio is submitted. If 63962306a36Sopenharmony_ci * io->split_bio is null, after current_bio is submitted, current_bio 64062306a36Sopenharmony_ci * might already be completed and the io_unit is freed. We submit 64162306a36Sopenharmony_ci * split_bio first to avoid the issue. 64262306a36Sopenharmony_ci */ 64362306a36Sopenharmony_ci if (io->split_bio) { 64462306a36Sopenharmony_ci if (io->has_flush) 64562306a36Sopenharmony_ci io->split_bio->bi_opf |= REQ_PREFLUSH; 64662306a36Sopenharmony_ci if (io->has_fua) 64762306a36Sopenharmony_ci io->split_bio->bi_opf |= REQ_FUA; 64862306a36Sopenharmony_ci submit_bio(io->split_bio); 64962306a36Sopenharmony_ci } 65062306a36Sopenharmony_ci 65162306a36Sopenharmony_ci if (io->has_flush) 65262306a36Sopenharmony_ci io->current_bio->bi_opf |= REQ_PREFLUSH; 65362306a36Sopenharmony_ci if (io->has_fua) 65462306a36Sopenharmony_ci io->current_bio->bi_opf |= REQ_FUA; 65562306a36Sopenharmony_ci submit_bio(io->current_bio); 65662306a36Sopenharmony_ci} 65762306a36Sopenharmony_ci 65862306a36Sopenharmony_ci/* deferred io_unit will be dispatched here */ 65962306a36Sopenharmony_cistatic void r5l_submit_io_async(struct work_struct *work) 66062306a36Sopenharmony_ci{ 66162306a36Sopenharmony_ci struct r5l_log *log = container_of(work, struct r5l_log, 66262306a36Sopenharmony_ci deferred_io_work); 66362306a36Sopenharmony_ci struct r5l_io_unit *io = NULL; 66462306a36Sopenharmony_ci unsigned long flags; 66562306a36Sopenharmony_ci 66662306a36Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 66762306a36Sopenharmony_ci if (!list_empty(&log->running_ios)) { 66862306a36Sopenharmony_ci io = list_first_entry(&log->running_ios, struct r5l_io_unit, 66962306a36Sopenharmony_ci log_sibling); 67062306a36Sopenharmony_ci if (!io->io_deferred) 
67162306a36Sopenharmony_ci io = NULL; 67262306a36Sopenharmony_ci else 67362306a36Sopenharmony_ci io->io_deferred = 0; 67462306a36Sopenharmony_ci } 67562306a36Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 67662306a36Sopenharmony_ci if (io) 67762306a36Sopenharmony_ci r5l_do_submit_io(log, io); 67862306a36Sopenharmony_ci} 67962306a36Sopenharmony_ci 68062306a36Sopenharmony_cistatic void r5c_disable_writeback_async(struct work_struct *work) 68162306a36Sopenharmony_ci{ 68262306a36Sopenharmony_ci struct r5l_log *log = container_of(work, struct r5l_log, 68362306a36Sopenharmony_ci disable_writeback_work); 68462306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 68562306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 68662306a36Sopenharmony_ci int locked = 0; 68762306a36Sopenharmony_ci 68862306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 68962306a36Sopenharmony_ci return; 69062306a36Sopenharmony_ci pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", 69162306a36Sopenharmony_ci mdname(mddev)); 69262306a36Sopenharmony_ci 69362306a36Sopenharmony_ci /* wait superblock change before suspend */ 69462306a36Sopenharmony_ci wait_event(mddev->sb_wait, 69562306a36Sopenharmony_ci conf->log == NULL || 69662306a36Sopenharmony_ci (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && 69762306a36Sopenharmony_ci (locked = mddev_trylock(mddev)))); 69862306a36Sopenharmony_ci if (locked) { 69962306a36Sopenharmony_ci mddev_suspend(mddev); 70062306a36Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 70162306a36Sopenharmony_ci mddev_resume(mddev); 70262306a36Sopenharmony_ci mddev_unlock(mddev); 70362306a36Sopenharmony_ci } 70462306a36Sopenharmony_ci} 70562306a36Sopenharmony_ci 70662306a36Sopenharmony_cistatic void r5l_submit_current_io(struct r5l_log *log) 70762306a36Sopenharmony_ci{ 70862306a36Sopenharmony_ci struct r5l_io_unit *io = log->current_io; 70962306a36Sopenharmony_ci struct 
r5l_meta_block *block; 71062306a36Sopenharmony_ci unsigned long flags; 71162306a36Sopenharmony_ci u32 crc; 71262306a36Sopenharmony_ci bool do_submit = true; 71362306a36Sopenharmony_ci 71462306a36Sopenharmony_ci if (!io) 71562306a36Sopenharmony_ci return; 71662306a36Sopenharmony_ci 71762306a36Sopenharmony_ci block = page_address(io->meta_page); 71862306a36Sopenharmony_ci block->meta_size = cpu_to_le32(io->meta_offset); 71962306a36Sopenharmony_ci crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); 72062306a36Sopenharmony_ci block->checksum = cpu_to_le32(crc); 72162306a36Sopenharmony_ci 72262306a36Sopenharmony_ci log->current_io = NULL; 72362306a36Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 72462306a36Sopenharmony_ci if (io->has_flush || io->has_fua) { 72562306a36Sopenharmony_ci if (io != list_first_entry(&log->running_ios, 72662306a36Sopenharmony_ci struct r5l_io_unit, log_sibling)) { 72762306a36Sopenharmony_ci io->io_deferred = 1; 72862306a36Sopenharmony_ci do_submit = false; 72962306a36Sopenharmony_ci } 73062306a36Sopenharmony_ci } 73162306a36Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 73262306a36Sopenharmony_ci if (do_submit) 73362306a36Sopenharmony_ci r5l_do_submit_io(log, io); 73462306a36Sopenharmony_ci} 73562306a36Sopenharmony_ci 73662306a36Sopenharmony_cistatic struct bio *r5l_bio_alloc(struct r5l_log *log) 73762306a36Sopenharmony_ci{ 73862306a36Sopenharmony_ci struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS, 73962306a36Sopenharmony_ci REQ_OP_WRITE, GFP_NOIO, &log->bs); 74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ci bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci return bio; 74462306a36Sopenharmony_ci} 74562306a36Sopenharmony_ci 74662306a36Sopenharmony_cistatic void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) 74762306a36Sopenharmony_ci{ 74862306a36Sopenharmony_ci log->log_start = r5l_ring_add(log, 
log->log_start, BLOCK_SECTORS); 74962306a36Sopenharmony_ci 75062306a36Sopenharmony_ci r5c_update_log_state(log); 75162306a36Sopenharmony_ci /* 75262306a36Sopenharmony_ci * If we filled up the log device start from the beginning again, 75362306a36Sopenharmony_ci * which will require a new bio. 75462306a36Sopenharmony_ci * 75562306a36Sopenharmony_ci * Note: for this to work properly the log size needs to me a multiple 75662306a36Sopenharmony_ci * of BLOCK_SECTORS. 75762306a36Sopenharmony_ci */ 75862306a36Sopenharmony_ci if (log->log_start == 0) 75962306a36Sopenharmony_ci io->need_split_bio = true; 76062306a36Sopenharmony_ci 76162306a36Sopenharmony_ci io->log_end = log->log_start; 76262306a36Sopenharmony_ci} 76362306a36Sopenharmony_ci 76462306a36Sopenharmony_cistatic struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) 76562306a36Sopenharmony_ci{ 76662306a36Sopenharmony_ci struct r5l_io_unit *io; 76762306a36Sopenharmony_ci struct r5l_meta_block *block; 76862306a36Sopenharmony_ci 76962306a36Sopenharmony_ci io = mempool_alloc(&log->io_pool, GFP_ATOMIC); 77062306a36Sopenharmony_ci if (!io) 77162306a36Sopenharmony_ci return NULL; 77262306a36Sopenharmony_ci memset(io, 0, sizeof(*io)); 77362306a36Sopenharmony_ci 77462306a36Sopenharmony_ci io->log = log; 77562306a36Sopenharmony_ci INIT_LIST_HEAD(&io->log_sibling); 77662306a36Sopenharmony_ci INIT_LIST_HEAD(&io->stripe_list); 77762306a36Sopenharmony_ci bio_list_init(&io->flush_barriers); 77862306a36Sopenharmony_ci io->state = IO_UNIT_RUNNING; 77962306a36Sopenharmony_ci 78062306a36Sopenharmony_ci io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO); 78162306a36Sopenharmony_ci block = page_address(io->meta_page); 78262306a36Sopenharmony_ci clear_page(block); 78362306a36Sopenharmony_ci block->magic = cpu_to_le32(R5LOG_MAGIC); 78462306a36Sopenharmony_ci block->version = R5LOG_VERSION; 78562306a36Sopenharmony_ci block->seq = cpu_to_le64(log->seq); 78662306a36Sopenharmony_ci block->position = cpu_to_le64(log->log_start); 
78762306a36Sopenharmony_ci 78862306a36Sopenharmony_ci io->log_start = log->log_start; 78962306a36Sopenharmony_ci io->meta_offset = sizeof(struct r5l_meta_block); 79062306a36Sopenharmony_ci io->seq = log->seq++; 79162306a36Sopenharmony_ci 79262306a36Sopenharmony_ci io->current_bio = r5l_bio_alloc(log); 79362306a36Sopenharmony_ci io->current_bio->bi_end_io = r5l_log_endio; 79462306a36Sopenharmony_ci io->current_bio->bi_private = io; 79562306a36Sopenharmony_ci __bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); 79662306a36Sopenharmony_ci 79762306a36Sopenharmony_ci r5_reserve_log_entry(log, io); 79862306a36Sopenharmony_ci 79962306a36Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 80062306a36Sopenharmony_ci list_add_tail(&io->log_sibling, &log->running_ios); 80162306a36Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 80262306a36Sopenharmony_ci 80362306a36Sopenharmony_ci return io; 80462306a36Sopenharmony_ci} 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_cistatic int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) 80762306a36Sopenharmony_ci{ 80862306a36Sopenharmony_ci if (log->current_io && 80962306a36Sopenharmony_ci log->current_io->meta_offset + payload_size > PAGE_SIZE) 81062306a36Sopenharmony_ci r5l_submit_current_io(log); 81162306a36Sopenharmony_ci 81262306a36Sopenharmony_ci if (!log->current_io) { 81362306a36Sopenharmony_ci log->current_io = r5l_new_meta(log); 81462306a36Sopenharmony_ci if (!log->current_io) 81562306a36Sopenharmony_ci return -ENOMEM; 81662306a36Sopenharmony_ci } 81762306a36Sopenharmony_ci 81862306a36Sopenharmony_ci return 0; 81962306a36Sopenharmony_ci} 82062306a36Sopenharmony_ci 82162306a36Sopenharmony_cistatic void r5l_append_payload_meta(struct r5l_log *log, u16 type, 82262306a36Sopenharmony_ci sector_t location, 82362306a36Sopenharmony_ci u32 checksum1, u32 checksum2, 82462306a36Sopenharmony_ci bool checksum2_valid) 82562306a36Sopenharmony_ci{ 82662306a36Sopenharmony_ci struct r5l_io_unit *io = log->current_io; 
82762306a36Sopenharmony_ci struct r5l_payload_data_parity *payload; 82862306a36Sopenharmony_ci 82962306a36Sopenharmony_ci payload = page_address(io->meta_page) + io->meta_offset; 83062306a36Sopenharmony_ci payload->header.type = cpu_to_le16(type); 83162306a36Sopenharmony_ci payload->header.flags = cpu_to_le16(0); 83262306a36Sopenharmony_ci payload->size = cpu_to_le32((1 + !!checksum2_valid) << 83362306a36Sopenharmony_ci (PAGE_SHIFT - 9)); 83462306a36Sopenharmony_ci payload->location = cpu_to_le64(location); 83562306a36Sopenharmony_ci payload->checksum[0] = cpu_to_le32(checksum1); 83662306a36Sopenharmony_ci if (checksum2_valid) 83762306a36Sopenharmony_ci payload->checksum[1] = cpu_to_le32(checksum2); 83862306a36Sopenharmony_ci 83962306a36Sopenharmony_ci io->meta_offset += sizeof(struct r5l_payload_data_parity) + 84062306a36Sopenharmony_ci sizeof(__le32) * (1 + !!checksum2_valid); 84162306a36Sopenharmony_ci} 84262306a36Sopenharmony_ci 84362306a36Sopenharmony_cistatic void r5l_append_payload_page(struct r5l_log *log, struct page *page) 84462306a36Sopenharmony_ci{ 84562306a36Sopenharmony_ci struct r5l_io_unit *io = log->current_io; 84662306a36Sopenharmony_ci 84762306a36Sopenharmony_ci if (io->need_split_bio) { 84862306a36Sopenharmony_ci BUG_ON(io->split_bio); 84962306a36Sopenharmony_ci io->split_bio = io->current_bio; 85062306a36Sopenharmony_ci io->current_bio = r5l_bio_alloc(log); 85162306a36Sopenharmony_ci bio_chain(io->current_bio, io->split_bio); 85262306a36Sopenharmony_ci io->need_split_bio = false; 85362306a36Sopenharmony_ci } 85462306a36Sopenharmony_ci 85562306a36Sopenharmony_ci if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) 85662306a36Sopenharmony_ci BUG(); 85762306a36Sopenharmony_ci 85862306a36Sopenharmony_ci r5_reserve_log_entry(log, io); 85962306a36Sopenharmony_ci} 86062306a36Sopenharmony_ci 86162306a36Sopenharmony_cistatic void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) 86262306a36Sopenharmony_ci{ 86362306a36Sopenharmony_ci struct 
mddev *mddev = log->rdev->mddev; 86462306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 86562306a36Sopenharmony_ci struct r5l_io_unit *io; 86662306a36Sopenharmony_ci struct r5l_payload_flush *payload; 86762306a36Sopenharmony_ci int meta_size; 86862306a36Sopenharmony_ci 86962306a36Sopenharmony_ci /* 87062306a36Sopenharmony_ci * payload_flush requires extra writes to the journal. 87162306a36Sopenharmony_ci * To avoid handling the extra IO in quiesce, just skip 87262306a36Sopenharmony_ci * flush_payload 87362306a36Sopenharmony_ci */ 87462306a36Sopenharmony_ci if (conf->quiesce) 87562306a36Sopenharmony_ci return; 87662306a36Sopenharmony_ci 87762306a36Sopenharmony_ci mutex_lock(&log->io_mutex); 87862306a36Sopenharmony_ci meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64); 87962306a36Sopenharmony_ci 88062306a36Sopenharmony_ci if (r5l_get_meta(log, meta_size)) { 88162306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 88262306a36Sopenharmony_ci return; 88362306a36Sopenharmony_ci } 88462306a36Sopenharmony_ci 88562306a36Sopenharmony_ci /* current implementation is one stripe per flush payload */ 88662306a36Sopenharmony_ci io = log->current_io; 88762306a36Sopenharmony_ci payload = page_address(io->meta_page) + io->meta_offset; 88862306a36Sopenharmony_ci payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH); 88962306a36Sopenharmony_ci payload->header.flags = cpu_to_le16(0); 89062306a36Sopenharmony_ci payload->size = cpu_to_le32(sizeof(__le64)); 89162306a36Sopenharmony_ci payload->flush_stripes[0] = cpu_to_le64(sect); 89262306a36Sopenharmony_ci io->meta_offset += meta_size; 89362306a36Sopenharmony_ci /* multiple flush payloads count as one pending_stripe */ 89462306a36Sopenharmony_ci if (!io->has_flush_payload) { 89562306a36Sopenharmony_ci io->has_flush_payload = 1; 89662306a36Sopenharmony_ci atomic_inc(&io->pending_stripe); 89762306a36Sopenharmony_ci } 89862306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 89962306a36Sopenharmony_ci} 
90062306a36Sopenharmony_ci 90162306a36Sopenharmony_cistatic int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, 90262306a36Sopenharmony_ci int data_pages, int parity_pages) 90362306a36Sopenharmony_ci{ 90462306a36Sopenharmony_ci int i; 90562306a36Sopenharmony_ci int meta_size; 90662306a36Sopenharmony_ci int ret; 90762306a36Sopenharmony_ci struct r5l_io_unit *io; 90862306a36Sopenharmony_ci 90962306a36Sopenharmony_ci meta_size = 91062306a36Sopenharmony_ci ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) 91162306a36Sopenharmony_ci * data_pages) + 91262306a36Sopenharmony_ci sizeof(struct r5l_payload_data_parity) + 91362306a36Sopenharmony_ci sizeof(__le32) * parity_pages; 91462306a36Sopenharmony_ci 91562306a36Sopenharmony_ci ret = r5l_get_meta(log, meta_size); 91662306a36Sopenharmony_ci if (ret) 91762306a36Sopenharmony_ci return ret; 91862306a36Sopenharmony_ci 91962306a36Sopenharmony_ci io = log->current_io; 92062306a36Sopenharmony_ci 92162306a36Sopenharmony_ci if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state)) 92262306a36Sopenharmony_ci io->has_flush = 1; 92362306a36Sopenharmony_ci 92462306a36Sopenharmony_ci for (i = 0; i < sh->disks; i++) { 92562306a36Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 92662306a36Sopenharmony_ci test_bit(R5_InJournal, &sh->dev[i].flags)) 92762306a36Sopenharmony_ci continue; 92862306a36Sopenharmony_ci if (i == sh->pd_idx || i == sh->qd_idx) 92962306a36Sopenharmony_ci continue; 93062306a36Sopenharmony_ci if (test_bit(R5_WantFUA, &sh->dev[i].flags) && 93162306a36Sopenharmony_ci log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) { 93262306a36Sopenharmony_ci io->has_fua = 1; 93362306a36Sopenharmony_ci /* 93462306a36Sopenharmony_ci * we need to flush journal to make sure recovery can 93562306a36Sopenharmony_ci * reach the data with fua flag 93662306a36Sopenharmony_ci */ 93762306a36Sopenharmony_ci io->has_flush = 1; 93862306a36Sopenharmony_ci } 93962306a36Sopenharmony_ci 
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, 94062306a36Sopenharmony_ci raid5_compute_blocknr(sh, i, 0), 94162306a36Sopenharmony_ci sh->dev[i].log_checksum, 0, false); 94262306a36Sopenharmony_ci r5l_append_payload_page(log, sh->dev[i].page); 94362306a36Sopenharmony_ci } 94462306a36Sopenharmony_ci 94562306a36Sopenharmony_ci if (parity_pages == 2) { 94662306a36Sopenharmony_ci r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 94762306a36Sopenharmony_ci sh->sector, sh->dev[sh->pd_idx].log_checksum, 94862306a36Sopenharmony_ci sh->dev[sh->qd_idx].log_checksum, true); 94962306a36Sopenharmony_ci r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 95062306a36Sopenharmony_ci r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); 95162306a36Sopenharmony_ci } else if (parity_pages == 1) { 95262306a36Sopenharmony_ci r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, 95362306a36Sopenharmony_ci sh->sector, sh->dev[sh->pd_idx].log_checksum, 95462306a36Sopenharmony_ci 0, false); 95562306a36Sopenharmony_ci r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); 95662306a36Sopenharmony_ci } else /* Just writing data, not parity, in caching phase */ 95762306a36Sopenharmony_ci BUG_ON(parity_pages != 0); 95862306a36Sopenharmony_ci 95962306a36Sopenharmony_ci list_add_tail(&sh->log_list, &io->stripe_list); 96062306a36Sopenharmony_ci atomic_inc(&io->pending_stripe); 96162306a36Sopenharmony_ci sh->log_io = io; 96262306a36Sopenharmony_ci 96362306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 96462306a36Sopenharmony_ci return 0; 96562306a36Sopenharmony_ci 96662306a36Sopenharmony_ci if (sh->log_start == MaxSector) { 96762306a36Sopenharmony_ci BUG_ON(!list_empty(&sh->r5c)); 96862306a36Sopenharmony_ci sh->log_start = io->log_start; 96962306a36Sopenharmony_ci spin_lock_irq(&log->stripe_in_journal_lock); 97062306a36Sopenharmony_ci list_add_tail(&sh->r5c, 97162306a36Sopenharmony_ci &log->stripe_in_journal_list); 97262306a36Sopenharmony_ci 
spin_unlock_irq(&log->stripe_in_journal_lock); 97362306a36Sopenharmony_ci atomic_inc(&log->stripe_in_journal_count); 97462306a36Sopenharmony_ci } 97562306a36Sopenharmony_ci return 0; 97662306a36Sopenharmony_ci} 97762306a36Sopenharmony_ci 97862306a36Sopenharmony_ci/* add stripe to no_space_stripes, and then wake up reclaim */ 97962306a36Sopenharmony_cistatic inline void r5l_add_no_space_stripe(struct r5l_log *log, 98062306a36Sopenharmony_ci struct stripe_head *sh) 98162306a36Sopenharmony_ci{ 98262306a36Sopenharmony_ci spin_lock(&log->no_space_stripes_lock); 98362306a36Sopenharmony_ci list_add_tail(&sh->log_list, &log->no_space_stripes); 98462306a36Sopenharmony_ci spin_unlock(&log->no_space_stripes_lock); 98562306a36Sopenharmony_ci} 98662306a36Sopenharmony_ci 98762306a36Sopenharmony_ci/* 98862306a36Sopenharmony_ci * running in raid5d, where reclaim could wait for raid5d too (when it flushes 98962306a36Sopenharmony_ci * data from log to raid disks), so we shouldn't wait for reclaim here 99062306a36Sopenharmony_ci */ 99162306a36Sopenharmony_ciint r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) 99262306a36Sopenharmony_ci{ 99362306a36Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 99462306a36Sopenharmony_ci int write_disks = 0; 99562306a36Sopenharmony_ci int data_pages, parity_pages; 99662306a36Sopenharmony_ci int reserve; 99762306a36Sopenharmony_ci int i; 99862306a36Sopenharmony_ci int ret = 0; 99962306a36Sopenharmony_ci bool wake_reclaim = false; 100062306a36Sopenharmony_ci 100162306a36Sopenharmony_ci if (!log) 100262306a36Sopenharmony_ci return -EAGAIN; 100362306a36Sopenharmony_ci /* Don't support stripe batch */ 100462306a36Sopenharmony_ci if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 100562306a36Sopenharmony_ci test_bit(STRIPE_SYNCING, &sh->state)) { 100662306a36Sopenharmony_ci /* the stripe is written to log, we start writing it to raid */ 100762306a36Sopenharmony_ci clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 
100862306a36Sopenharmony_ci return -EAGAIN; 100962306a36Sopenharmony_ci } 101062306a36Sopenharmony_ci 101162306a36Sopenharmony_ci WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 101262306a36Sopenharmony_ci 101362306a36Sopenharmony_ci for (i = 0; i < sh->disks; i++) { 101462306a36Sopenharmony_ci void *addr; 101562306a36Sopenharmony_ci 101662306a36Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) || 101762306a36Sopenharmony_ci test_bit(R5_InJournal, &sh->dev[i].flags)) 101862306a36Sopenharmony_ci continue; 101962306a36Sopenharmony_ci 102062306a36Sopenharmony_ci write_disks++; 102162306a36Sopenharmony_ci /* checksum is already calculated in last run */ 102262306a36Sopenharmony_ci if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) 102362306a36Sopenharmony_ci continue; 102462306a36Sopenharmony_ci addr = kmap_atomic(sh->dev[i].page); 102562306a36Sopenharmony_ci sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 102662306a36Sopenharmony_ci addr, PAGE_SIZE); 102762306a36Sopenharmony_ci kunmap_atomic(addr); 102862306a36Sopenharmony_ci } 102962306a36Sopenharmony_ci parity_pages = 1 + !!(sh->qd_idx >= 0); 103062306a36Sopenharmony_ci data_pages = write_disks - parity_pages; 103162306a36Sopenharmony_ci 103262306a36Sopenharmony_ci set_bit(STRIPE_LOG_TRAPPED, &sh->state); 103362306a36Sopenharmony_ci /* 103462306a36Sopenharmony_ci * The stripe must enter state machine again to finish the write, so 103562306a36Sopenharmony_ci * don't delay. 
103662306a36Sopenharmony_ci */ 103762306a36Sopenharmony_ci clear_bit(STRIPE_DELAYED, &sh->state); 103862306a36Sopenharmony_ci atomic_inc(&sh->count); 103962306a36Sopenharmony_ci 104062306a36Sopenharmony_ci mutex_lock(&log->io_mutex); 104162306a36Sopenharmony_ci /* meta + data */ 104262306a36Sopenharmony_ci reserve = (1 + write_disks) << (PAGE_SHIFT - 9); 104362306a36Sopenharmony_ci 104462306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 104562306a36Sopenharmony_ci if (!r5l_has_free_space(log, reserve)) { 104662306a36Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 104762306a36Sopenharmony_ci wake_reclaim = true; 104862306a36Sopenharmony_ci } else { 104962306a36Sopenharmony_ci ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 105062306a36Sopenharmony_ci if (ret) { 105162306a36Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 105262306a36Sopenharmony_ci list_add_tail(&sh->log_list, 105362306a36Sopenharmony_ci &log->no_mem_stripes); 105462306a36Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 105562306a36Sopenharmony_ci } 105662306a36Sopenharmony_ci } 105762306a36Sopenharmony_ci } else { /* R5C_JOURNAL_MODE_WRITE_BACK */ 105862306a36Sopenharmony_ci /* 105962306a36Sopenharmony_ci * log space critical, do not process stripes that are 106062306a36Sopenharmony_ci * not in cache yet (sh->log_start == MaxSector). 
106162306a36Sopenharmony_ci */ 106262306a36Sopenharmony_ci if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 106362306a36Sopenharmony_ci sh->log_start == MaxSector) { 106462306a36Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 106562306a36Sopenharmony_ci wake_reclaim = true; 106662306a36Sopenharmony_ci reserve = 0; 106762306a36Sopenharmony_ci } else if (!r5l_has_free_space(log, reserve)) { 106862306a36Sopenharmony_ci if (sh->log_start == log->last_checkpoint) 106962306a36Sopenharmony_ci BUG(); 107062306a36Sopenharmony_ci else 107162306a36Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 107262306a36Sopenharmony_ci } else { 107362306a36Sopenharmony_ci ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 107462306a36Sopenharmony_ci if (ret) { 107562306a36Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 107662306a36Sopenharmony_ci list_add_tail(&sh->log_list, 107762306a36Sopenharmony_ci &log->no_mem_stripes); 107862306a36Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 107962306a36Sopenharmony_ci } 108062306a36Sopenharmony_ci } 108162306a36Sopenharmony_ci } 108262306a36Sopenharmony_ci 108362306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 108462306a36Sopenharmony_ci if (wake_reclaim) 108562306a36Sopenharmony_ci r5l_wake_reclaim(log, reserve); 108662306a36Sopenharmony_ci return 0; 108762306a36Sopenharmony_ci} 108862306a36Sopenharmony_ci 108962306a36Sopenharmony_civoid r5l_write_stripe_run(struct r5l_log *log) 109062306a36Sopenharmony_ci{ 109162306a36Sopenharmony_ci if (!log) 109262306a36Sopenharmony_ci return; 109362306a36Sopenharmony_ci mutex_lock(&log->io_mutex); 109462306a36Sopenharmony_ci r5l_submit_current_io(log); 109562306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 109662306a36Sopenharmony_ci} 109762306a36Sopenharmony_ci 109862306a36Sopenharmony_ciint r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 109962306a36Sopenharmony_ci{ 110062306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 
110162306a36Sopenharmony_ci /* 110262306a36Sopenharmony_ci * in write through (journal only) 110362306a36Sopenharmony_ci * we flush log disk cache first, then write stripe data to 110462306a36Sopenharmony_ci * raid disks. So if bio is finished, the log disk cache is 110562306a36Sopenharmony_ci * flushed already. The recovery guarantees we can recovery 110662306a36Sopenharmony_ci * the bio from log disk, so we don't need to flush again 110762306a36Sopenharmony_ci */ 110862306a36Sopenharmony_ci if (bio->bi_iter.bi_size == 0) { 110962306a36Sopenharmony_ci bio_endio(bio); 111062306a36Sopenharmony_ci return 0; 111162306a36Sopenharmony_ci } 111262306a36Sopenharmony_ci bio->bi_opf &= ~REQ_PREFLUSH; 111362306a36Sopenharmony_ci } else { 111462306a36Sopenharmony_ci /* write back (with cache) */ 111562306a36Sopenharmony_ci if (bio->bi_iter.bi_size == 0) { 111662306a36Sopenharmony_ci mutex_lock(&log->io_mutex); 111762306a36Sopenharmony_ci r5l_get_meta(log, 0); 111862306a36Sopenharmony_ci bio_list_add(&log->current_io->flush_barriers, bio); 111962306a36Sopenharmony_ci log->current_io->has_flush = 1; 112062306a36Sopenharmony_ci log->current_io->has_null_flush = 1; 112162306a36Sopenharmony_ci atomic_inc(&log->current_io->pending_stripe); 112262306a36Sopenharmony_ci r5l_submit_current_io(log); 112362306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 112462306a36Sopenharmony_ci return 0; 112562306a36Sopenharmony_ci } 112662306a36Sopenharmony_ci } 112762306a36Sopenharmony_ci return -EAGAIN; 112862306a36Sopenharmony_ci} 112962306a36Sopenharmony_ci 113062306a36Sopenharmony_ci/* This will run after log space is reclaimed */ 113162306a36Sopenharmony_cistatic void r5l_run_no_space_stripes(struct r5l_log *log) 113262306a36Sopenharmony_ci{ 113362306a36Sopenharmony_ci struct stripe_head *sh; 113462306a36Sopenharmony_ci 113562306a36Sopenharmony_ci spin_lock(&log->no_space_stripes_lock); 113662306a36Sopenharmony_ci while (!list_empty(&log->no_space_stripes)) { 113762306a36Sopenharmony_ci sh 
= list_first_entry(&log->no_space_stripes, 113862306a36Sopenharmony_ci struct stripe_head, log_list); 113962306a36Sopenharmony_ci list_del_init(&sh->log_list); 114062306a36Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 114162306a36Sopenharmony_ci raid5_release_stripe(sh); 114262306a36Sopenharmony_ci } 114362306a36Sopenharmony_ci spin_unlock(&log->no_space_stripes_lock); 114462306a36Sopenharmony_ci} 114562306a36Sopenharmony_ci 114662306a36Sopenharmony_ci/* 114762306a36Sopenharmony_ci * calculate new last_checkpoint 114862306a36Sopenharmony_ci * for write through mode, returns log->next_checkpoint 114962306a36Sopenharmony_ci * for write back, returns log_start of first sh in stripe_in_journal_list 115062306a36Sopenharmony_ci */ 115162306a36Sopenharmony_cistatic sector_t r5c_calculate_new_cp(struct r5conf *conf) 115262306a36Sopenharmony_ci{ 115362306a36Sopenharmony_ci struct stripe_head *sh; 115462306a36Sopenharmony_ci struct r5l_log *log = conf->log; 115562306a36Sopenharmony_ci sector_t new_cp; 115662306a36Sopenharmony_ci unsigned long flags; 115762306a36Sopenharmony_ci 115862306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 115962306a36Sopenharmony_ci return log->next_checkpoint; 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_ci spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 116262306a36Sopenharmony_ci if (list_empty(&conf->log->stripe_in_journal_list)) { 116362306a36Sopenharmony_ci /* all stripes flushed */ 116462306a36Sopenharmony_ci spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 116562306a36Sopenharmony_ci return log->next_checkpoint; 116662306a36Sopenharmony_ci } 116762306a36Sopenharmony_ci sh = list_first_entry(&conf->log->stripe_in_journal_list, 116862306a36Sopenharmony_ci struct stripe_head, r5c); 116962306a36Sopenharmony_ci new_cp = sh->log_start; 117062306a36Sopenharmony_ci spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 117162306a36Sopenharmony_ci return new_cp; 
117262306a36Sopenharmony_ci} 117362306a36Sopenharmony_ci 117462306a36Sopenharmony_cistatic sector_t r5l_reclaimable_space(struct r5l_log *log) 117562306a36Sopenharmony_ci{ 117662306a36Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 117762306a36Sopenharmony_ci 117862306a36Sopenharmony_ci return r5l_ring_distance(log, log->last_checkpoint, 117962306a36Sopenharmony_ci r5c_calculate_new_cp(conf)); 118062306a36Sopenharmony_ci} 118162306a36Sopenharmony_ci 118262306a36Sopenharmony_cistatic void r5l_run_no_mem_stripe(struct r5l_log *log) 118362306a36Sopenharmony_ci{ 118462306a36Sopenharmony_ci struct stripe_head *sh; 118562306a36Sopenharmony_ci 118662306a36Sopenharmony_ci lockdep_assert_held(&log->io_list_lock); 118762306a36Sopenharmony_ci 118862306a36Sopenharmony_ci if (!list_empty(&log->no_mem_stripes)) { 118962306a36Sopenharmony_ci sh = list_first_entry(&log->no_mem_stripes, 119062306a36Sopenharmony_ci struct stripe_head, log_list); 119162306a36Sopenharmony_ci list_del_init(&sh->log_list); 119262306a36Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 119362306a36Sopenharmony_ci raid5_release_stripe(sh); 119462306a36Sopenharmony_ci } 119562306a36Sopenharmony_ci} 119662306a36Sopenharmony_ci 119762306a36Sopenharmony_cistatic bool r5l_complete_finished_ios(struct r5l_log *log) 119862306a36Sopenharmony_ci{ 119962306a36Sopenharmony_ci struct r5l_io_unit *io, *next; 120062306a36Sopenharmony_ci bool found = false; 120162306a36Sopenharmony_ci 120262306a36Sopenharmony_ci lockdep_assert_held(&log->io_list_lock); 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 120562306a36Sopenharmony_ci /* don't change list order */ 120662306a36Sopenharmony_ci if (io->state < IO_UNIT_STRIPE_END) 120762306a36Sopenharmony_ci break; 120862306a36Sopenharmony_ci 120962306a36Sopenharmony_ci log->next_checkpoint = io->log_start; 121062306a36Sopenharmony_ci 121162306a36Sopenharmony_ci 
list_del(&io->log_sibling); 121262306a36Sopenharmony_ci mempool_free(io, &log->io_pool); 121362306a36Sopenharmony_ci r5l_run_no_mem_stripe(log); 121462306a36Sopenharmony_ci 121562306a36Sopenharmony_ci found = true; 121662306a36Sopenharmony_ci } 121762306a36Sopenharmony_ci 121862306a36Sopenharmony_ci return found; 121962306a36Sopenharmony_ci} 122062306a36Sopenharmony_ci 122162306a36Sopenharmony_cistatic void __r5l_stripe_write_finished(struct r5l_io_unit *io) 122262306a36Sopenharmony_ci{ 122362306a36Sopenharmony_ci struct r5l_log *log = io->log; 122462306a36Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 122562306a36Sopenharmony_ci unsigned long flags; 122662306a36Sopenharmony_ci 122762306a36Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 122862306a36Sopenharmony_ci __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 122962306a36Sopenharmony_ci 123062306a36Sopenharmony_ci if (!r5l_complete_finished_ios(log)) { 123162306a36Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 123262306a36Sopenharmony_ci return; 123362306a36Sopenharmony_ci } 123462306a36Sopenharmony_ci 123562306a36Sopenharmony_ci if (r5l_reclaimable_space(log) > log->max_free_space || 123662306a36Sopenharmony_ci test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 123762306a36Sopenharmony_ci r5l_wake_reclaim(log, 0); 123862306a36Sopenharmony_ci 123962306a36Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 124062306a36Sopenharmony_ci wake_up(&log->iounit_wait); 124162306a36Sopenharmony_ci} 124262306a36Sopenharmony_ci 124362306a36Sopenharmony_civoid r5l_stripe_write_finished(struct stripe_head *sh) 124462306a36Sopenharmony_ci{ 124562306a36Sopenharmony_ci struct r5l_io_unit *io; 124662306a36Sopenharmony_ci 124762306a36Sopenharmony_ci io = sh->log_io; 124862306a36Sopenharmony_ci sh->log_io = NULL; 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_ci if (io && atomic_dec_and_test(&io->pending_stripe)) 125162306a36Sopenharmony_ci 
__r5l_stripe_write_finished(io); 125262306a36Sopenharmony_ci} 125362306a36Sopenharmony_ci 125462306a36Sopenharmony_cistatic void r5l_log_flush_endio(struct bio *bio) 125562306a36Sopenharmony_ci{ 125662306a36Sopenharmony_ci struct r5l_log *log = container_of(bio, struct r5l_log, 125762306a36Sopenharmony_ci flush_bio); 125862306a36Sopenharmony_ci unsigned long flags; 125962306a36Sopenharmony_ci struct r5l_io_unit *io; 126062306a36Sopenharmony_ci 126162306a36Sopenharmony_ci if (bio->bi_status) 126262306a36Sopenharmony_ci md_error(log->rdev->mddev, log->rdev); 126362306a36Sopenharmony_ci bio_uninit(bio); 126462306a36Sopenharmony_ci 126562306a36Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 126662306a36Sopenharmony_ci list_for_each_entry(io, &log->flushing_ios, log_sibling) 126762306a36Sopenharmony_ci r5l_io_run_stripes(io); 126862306a36Sopenharmony_ci list_splice_tail_init(&log->flushing_ios, &log->finished_ios); 126962306a36Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 127062306a36Sopenharmony_ci} 127162306a36Sopenharmony_ci 127262306a36Sopenharmony_ci/* 127362306a36Sopenharmony_ci * Starting dispatch IO to raid. 127462306a36Sopenharmony_ci * io_unit(meta) consists of a log. There is one situation we want to avoid. A 127562306a36Sopenharmony_ci * broken meta in the middle of a log causes recovery can't find meta at the 127662306a36Sopenharmony_ci * head of log. If operations require meta at the head persistent in log, we 127762306a36Sopenharmony_ci * must make sure meta before it persistent in log too. A case is: 127862306a36Sopenharmony_ci * 127962306a36Sopenharmony_ci * stripe data/parity is in log, we start write stripe to raid disks. stripe 128062306a36Sopenharmony_ci * data/parity must be persistent in log before we do the write to raid disks. 128162306a36Sopenharmony_ci * 128262306a36Sopenharmony_ci * The solution is we restrictly maintain io_unit list order. 
In this case, we 128362306a36Sopenharmony_ci * only write stripes of an io_unit to raid disks till the io_unit is the first 128462306a36Sopenharmony_ci * one whose data/parity is in log. 128562306a36Sopenharmony_ci */ 128662306a36Sopenharmony_civoid r5l_flush_stripe_to_raid(struct r5l_log *log) 128762306a36Sopenharmony_ci{ 128862306a36Sopenharmony_ci bool do_flush; 128962306a36Sopenharmony_ci 129062306a36Sopenharmony_ci if (!log || !log->need_cache_flush) 129162306a36Sopenharmony_ci return; 129262306a36Sopenharmony_ci 129362306a36Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 129462306a36Sopenharmony_ci /* flush bio is running */ 129562306a36Sopenharmony_ci if (!list_empty(&log->flushing_ios)) { 129662306a36Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 129762306a36Sopenharmony_ci return; 129862306a36Sopenharmony_ci } 129962306a36Sopenharmony_ci list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); 130062306a36Sopenharmony_ci do_flush = !list_empty(&log->flushing_ios); 130162306a36Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 130262306a36Sopenharmony_ci 130362306a36Sopenharmony_ci if (!do_flush) 130462306a36Sopenharmony_ci return; 130562306a36Sopenharmony_ci bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0, 130662306a36Sopenharmony_ci REQ_OP_WRITE | REQ_PREFLUSH); 130762306a36Sopenharmony_ci log->flush_bio.bi_end_io = r5l_log_flush_endio; 130862306a36Sopenharmony_ci submit_bio(&log->flush_bio); 130962306a36Sopenharmony_ci} 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp); 131262306a36Sopenharmony_cistatic void r5l_write_super_and_discard_space(struct r5l_log *log, 131362306a36Sopenharmony_ci sector_t end) 131462306a36Sopenharmony_ci{ 131562306a36Sopenharmony_ci struct block_device *bdev = log->rdev->bdev; 131662306a36Sopenharmony_ci struct mddev *mddev; 131762306a36Sopenharmony_ci 131862306a36Sopenharmony_ci r5l_write_super(log, end); 131962306a36Sopenharmony_ci 
132062306a36Sopenharmony_ci if (!bdev_max_discard_sectors(bdev)) 132162306a36Sopenharmony_ci return; 132262306a36Sopenharmony_ci 132362306a36Sopenharmony_ci mddev = log->rdev->mddev; 132462306a36Sopenharmony_ci /* 132562306a36Sopenharmony_ci * Discard could zero data, so before discard we must make sure 132662306a36Sopenharmony_ci * superblock is updated to new log tail. Updating superblock (either 132762306a36Sopenharmony_ci * directly call md_update_sb() or depend on md thread) must hold 132862306a36Sopenharmony_ci * reconfig mutex. On the other hand, raid5_quiesce is called with 132962306a36Sopenharmony_ci * reconfig_mutex hold. The first step of raid5_quiesce() is waiting 133062306a36Sopenharmony_ci * for all IO finish, hence waiting for reclaim thread, while reclaim 133162306a36Sopenharmony_ci * thread is calling this function and waiting for reconfig mutex. So 133262306a36Sopenharmony_ci * there is a deadlock. We workaround this issue with a trylock. 133362306a36Sopenharmony_ci * FIXME: we could miss discard if we can't take reconfig mutex 133462306a36Sopenharmony_ci */ 133562306a36Sopenharmony_ci set_mask_bits(&mddev->sb_flags, 0, 133662306a36Sopenharmony_ci BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 133762306a36Sopenharmony_ci if (!mddev_trylock(mddev)) 133862306a36Sopenharmony_ci return; 133962306a36Sopenharmony_ci md_update_sb(mddev, 1); 134062306a36Sopenharmony_ci mddev_unlock(mddev); 134162306a36Sopenharmony_ci 134262306a36Sopenharmony_ci /* discard IO error really doesn't matter, ignore it */ 134362306a36Sopenharmony_ci if (log->last_checkpoint < end) { 134462306a36Sopenharmony_ci blkdev_issue_discard(bdev, 134562306a36Sopenharmony_ci log->last_checkpoint + log->rdev->data_offset, 134662306a36Sopenharmony_ci end - log->last_checkpoint, GFP_NOIO); 134762306a36Sopenharmony_ci } else { 134862306a36Sopenharmony_ci blkdev_issue_discard(bdev, 134962306a36Sopenharmony_ci log->last_checkpoint + log->rdev->data_offset, 135062306a36Sopenharmony_ci 
log->device_size - log->last_checkpoint, 135162306a36Sopenharmony_ci GFP_NOIO); 135262306a36Sopenharmony_ci blkdev_issue_discard(bdev, log->rdev->data_offset, end, 135362306a36Sopenharmony_ci GFP_NOIO); 135462306a36Sopenharmony_ci } 135562306a36Sopenharmony_ci} 135662306a36Sopenharmony_ci 135762306a36Sopenharmony_ci/* 135862306a36Sopenharmony_ci * r5c_flush_stripe moves stripe from cached list to handle_list. When called, 135962306a36Sopenharmony_ci * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. 136062306a36Sopenharmony_ci * 136162306a36Sopenharmony_ci * must hold conf->device_lock 136262306a36Sopenharmony_ci */ 136362306a36Sopenharmony_cistatic void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) 136462306a36Sopenharmony_ci{ 136562306a36Sopenharmony_ci BUG_ON(list_empty(&sh->lru)); 136662306a36Sopenharmony_ci BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 136762306a36Sopenharmony_ci BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 136862306a36Sopenharmony_ci 136962306a36Sopenharmony_ci /* 137062306a36Sopenharmony_ci * The stripe is not ON_RELEASE_LIST, so it is safe to call 137162306a36Sopenharmony_ci * raid5_release_stripe() while holding conf->device_lock 137262306a36Sopenharmony_ci */ 137362306a36Sopenharmony_ci BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 137462306a36Sopenharmony_ci lockdep_assert_held(&conf->device_lock); 137562306a36Sopenharmony_ci 137662306a36Sopenharmony_ci list_del_init(&sh->lru); 137762306a36Sopenharmony_ci atomic_inc(&sh->count); 137862306a36Sopenharmony_ci 137962306a36Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 138062306a36Sopenharmony_ci atomic_inc(&conf->active_stripes); 138162306a36Sopenharmony_ci r5c_make_stripe_write_out(sh); 138262306a36Sopenharmony_ci 138362306a36Sopenharmony_ci if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 138462306a36Sopenharmony_ci atomic_inc(&conf->r5c_flushing_partial_stripes); 138562306a36Sopenharmony_ci else 138662306a36Sopenharmony_ci 
atomic_inc(&conf->r5c_flushing_full_stripes); 138762306a36Sopenharmony_ci raid5_release_stripe(sh); 138862306a36Sopenharmony_ci} 138962306a36Sopenharmony_ci 139062306a36Sopenharmony_ci/* 139162306a36Sopenharmony_ci * if num == 0, flush all full stripes 139262306a36Sopenharmony_ci * if num > 0, flush all full stripes. If less than num full stripes are 139362306a36Sopenharmony_ci * flushed, flush some partial stripes until totally num stripes are 139462306a36Sopenharmony_ci * flushed or there is no more cached stripes. 139562306a36Sopenharmony_ci */ 139662306a36Sopenharmony_civoid r5c_flush_cache(struct r5conf *conf, int num) 139762306a36Sopenharmony_ci{ 139862306a36Sopenharmony_ci int count; 139962306a36Sopenharmony_ci struct stripe_head *sh, *next; 140062306a36Sopenharmony_ci 140162306a36Sopenharmony_ci lockdep_assert_held(&conf->device_lock); 140262306a36Sopenharmony_ci if (!conf->log) 140362306a36Sopenharmony_ci return; 140462306a36Sopenharmony_ci 140562306a36Sopenharmony_ci count = 0; 140662306a36Sopenharmony_ci list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 140762306a36Sopenharmony_ci r5c_flush_stripe(conf, sh); 140862306a36Sopenharmony_ci count++; 140962306a36Sopenharmony_ci } 141062306a36Sopenharmony_ci 141162306a36Sopenharmony_ci if (count >= num) 141262306a36Sopenharmony_ci return; 141362306a36Sopenharmony_ci list_for_each_entry_safe(sh, next, 141462306a36Sopenharmony_ci &conf->r5c_partial_stripe_list, lru) { 141562306a36Sopenharmony_ci r5c_flush_stripe(conf, sh); 141662306a36Sopenharmony_ci if (++count >= num) 141762306a36Sopenharmony_ci break; 141862306a36Sopenharmony_ci } 141962306a36Sopenharmony_ci} 142062306a36Sopenharmony_ci 142162306a36Sopenharmony_cistatic void r5c_do_reclaim(struct r5conf *conf) 142262306a36Sopenharmony_ci{ 142362306a36Sopenharmony_ci struct r5l_log *log = conf->log; 142462306a36Sopenharmony_ci struct stripe_head *sh; 142562306a36Sopenharmony_ci int count = 0; 142662306a36Sopenharmony_ci unsigned long flags; 
142762306a36Sopenharmony_ci int total_cached; 142862306a36Sopenharmony_ci int stripes_to_flush; 142962306a36Sopenharmony_ci int flushing_partial, flushing_full; 143062306a36Sopenharmony_ci 143162306a36Sopenharmony_ci if (!r5c_is_writeback(log)) 143262306a36Sopenharmony_ci return; 143362306a36Sopenharmony_ci 143462306a36Sopenharmony_ci flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); 143562306a36Sopenharmony_ci flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); 143662306a36Sopenharmony_ci total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 143762306a36Sopenharmony_ci atomic_read(&conf->r5c_cached_full_stripes) - 143862306a36Sopenharmony_ci flushing_full - flushing_partial; 143962306a36Sopenharmony_ci 144062306a36Sopenharmony_ci if (total_cached > conf->min_nr_stripes * 3 / 4 || 144162306a36Sopenharmony_ci atomic_read(&conf->empty_inactive_list_nr) > 0) 144262306a36Sopenharmony_ci /* 144362306a36Sopenharmony_ci * if stripe cache pressure high, flush all full stripes and 144462306a36Sopenharmony_ci * some partial stripes 144562306a36Sopenharmony_ci */ 144662306a36Sopenharmony_ci stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 144762306a36Sopenharmony_ci else if (total_cached > conf->min_nr_stripes * 1 / 2 || 144862306a36Sopenharmony_ci atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > 144962306a36Sopenharmony_ci R5C_FULL_STRIPE_FLUSH_BATCH(conf)) 145062306a36Sopenharmony_ci /* 145162306a36Sopenharmony_ci * if stripe cache pressure moderate, or if there is many full 145262306a36Sopenharmony_ci * stripes,flush all full stripes 145362306a36Sopenharmony_ci */ 145462306a36Sopenharmony_ci stripes_to_flush = 0; 145562306a36Sopenharmony_ci else 145662306a36Sopenharmony_ci /* no need to flush */ 145762306a36Sopenharmony_ci stripes_to_flush = -1; 145862306a36Sopenharmony_ci 145962306a36Sopenharmony_ci if (stripes_to_flush >= 0) { 146062306a36Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 
146162306a36Sopenharmony_ci r5c_flush_cache(conf, stripes_to_flush); 146262306a36Sopenharmony_ci spin_unlock_irqrestore(&conf->device_lock, flags); 146362306a36Sopenharmony_ci } 146462306a36Sopenharmony_ci 146562306a36Sopenharmony_ci /* if log space is tight, flush stripes on stripe_in_journal_list */ 146662306a36Sopenharmony_ci if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 146762306a36Sopenharmony_ci spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 146862306a36Sopenharmony_ci spin_lock(&conf->device_lock); 146962306a36Sopenharmony_ci list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 147062306a36Sopenharmony_ci /* 147162306a36Sopenharmony_ci * stripes on stripe_in_journal_list could be in any 147262306a36Sopenharmony_ci * state of the stripe_cache state machine. In this 147362306a36Sopenharmony_ci * case, we only want to flush stripe on 147462306a36Sopenharmony_ci * r5c_cached_full/partial_stripes. The following 147562306a36Sopenharmony_ci * condition makes sure the stripe is on one of the 147662306a36Sopenharmony_ci * two lists. 
147762306a36Sopenharmony_ci */ 147862306a36Sopenharmony_ci if (!list_empty(&sh->lru) && 147962306a36Sopenharmony_ci !test_bit(STRIPE_HANDLE, &sh->state) && 148062306a36Sopenharmony_ci atomic_read(&sh->count) == 0) { 148162306a36Sopenharmony_ci r5c_flush_stripe(conf, sh); 148262306a36Sopenharmony_ci if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 148362306a36Sopenharmony_ci break; 148462306a36Sopenharmony_ci } 148562306a36Sopenharmony_ci } 148662306a36Sopenharmony_ci spin_unlock(&conf->device_lock); 148762306a36Sopenharmony_ci spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 148862306a36Sopenharmony_ci } 148962306a36Sopenharmony_ci 149062306a36Sopenharmony_ci if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 149162306a36Sopenharmony_ci r5l_run_no_space_stripes(log); 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 149462306a36Sopenharmony_ci} 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_cistatic void r5l_do_reclaim(struct r5l_log *log) 149762306a36Sopenharmony_ci{ 149862306a36Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 149962306a36Sopenharmony_ci sector_t reclaim_target = xchg(&log->reclaim_target, 0); 150062306a36Sopenharmony_ci sector_t reclaimable; 150162306a36Sopenharmony_ci sector_t next_checkpoint; 150262306a36Sopenharmony_ci bool write_super; 150362306a36Sopenharmony_ci 150462306a36Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 150562306a36Sopenharmony_ci write_super = r5l_reclaimable_space(log) > log->max_free_space || 150662306a36Sopenharmony_ci reclaim_target != 0 || !list_empty(&log->no_space_stripes); 150762306a36Sopenharmony_ci /* 150862306a36Sopenharmony_ci * move proper io_unit to reclaim list. We should not change the order. 
150962306a36Sopenharmony_ci * reclaimable/unreclaimable io_unit can be mixed in the list, we 151062306a36Sopenharmony_ci * shouldn't reuse space of an unreclaimable io_unit 151162306a36Sopenharmony_ci */ 151262306a36Sopenharmony_ci while (1) { 151362306a36Sopenharmony_ci reclaimable = r5l_reclaimable_space(log); 151462306a36Sopenharmony_ci if (reclaimable >= reclaim_target || 151562306a36Sopenharmony_ci (list_empty(&log->running_ios) && 151662306a36Sopenharmony_ci list_empty(&log->io_end_ios) && 151762306a36Sopenharmony_ci list_empty(&log->flushing_ios) && 151862306a36Sopenharmony_ci list_empty(&log->finished_ios))) 151962306a36Sopenharmony_ci break; 152062306a36Sopenharmony_ci 152162306a36Sopenharmony_ci md_wakeup_thread(log->rdev->mddev->thread); 152262306a36Sopenharmony_ci wait_event_lock_irq(log->iounit_wait, 152362306a36Sopenharmony_ci r5l_reclaimable_space(log) > reclaimable, 152462306a36Sopenharmony_ci log->io_list_lock); 152562306a36Sopenharmony_ci } 152662306a36Sopenharmony_ci 152762306a36Sopenharmony_ci next_checkpoint = r5c_calculate_new_cp(conf); 152862306a36Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 152962306a36Sopenharmony_ci 153062306a36Sopenharmony_ci if (reclaimable == 0 || !write_super) 153162306a36Sopenharmony_ci return; 153262306a36Sopenharmony_ci 153362306a36Sopenharmony_ci /* 153462306a36Sopenharmony_ci * write_super will flush cache of each raid disk. 
We must write super 153562306a36Sopenharmony_ci * here, because the log area might be reused soon and we don't want to 153662306a36Sopenharmony_ci * confuse recovery 153762306a36Sopenharmony_ci */ 153862306a36Sopenharmony_ci r5l_write_super_and_discard_space(log, next_checkpoint); 153962306a36Sopenharmony_ci 154062306a36Sopenharmony_ci mutex_lock(&log->io_mutex); 154162306a36Sopenharmony_ci log->last_checkpoint = next_checkpoint; 154262306a36Sopenharmony_ci r5c_update_log_state(log); 154362306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 154462306a36Sopenharmony_ci 154562306a36Sopenharmony_ci r5l_run_no_space_stripes(log); 154662306a36Sopenharmony_ci} 154762306a36Sopenharmony_ci 154862306a36Sopenharmony_cistatic void r5l_reclaim_thread(struct md_thread *thread) 154962306a36Sopenharmony_ci{ 155062306a36Sopenharmony_ci struct mddev *mddev = thread->mddev; 155162306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 155262306a36Sopenharmony_ci struct r5l_log *log = conf->log; 155362306a36Sopenharmony_ci 155462306a36Sopenharmony_ci if (!log) 155562306a36Sopenharmony_ci return; 155662306a36Sopenharmony_ci r5c_do_reclaim(conf); 155762306a36Sopenharmony_ci r5l_do_reclaim(log); 155862306a36Sopenharmony_ci} 155962306a36Sopenharmony_ci 156062306a36Sopenharmony_civoid r5l_wake_reclaim(struct r5l_log *log, sector_t space) 156162306a36Sopenharmony_ci{ 156262306a36Sopenharmony_ci unsigned long target; 156362306a36Sopenharmony_ci unsigned long new = (unsigned long)space; /* overflow in theory */ 156462306a36Sopenharmony_ci 156562306a36Sopenharmony_ci if (!log) 156662306a36Sopenharmony_ci return; 156762306a36Sopenharmony_ci 156862306a36Sopenharmony_ci target = READ_ONCE(log->reclaim_target); 156962306a36Sopenharmony_ci do { 157062306a36Sopenharmony_ci if (new < target) 157162306a36Sopenharmony_ci return; 157262306a36Sopenharmony_ci } while (!try_cmpxchg(&log->reclaim_target, &target, new)); 157362306a36Sopenharmony_ci md_wakeup_thread(log->reclaim_thread); 
157462306a36Sopenharmony_ci} 157562306a36Sopenharmony_ci 157662306a36Sopenharmony_civoid r5l_quiesce(struct r5l_log *log, int quiesce) 157762306a36Sopenharmony_ci{ 157862306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 157962306a36Sopenharmony_ci struct md_thread *thread = rcu_dereference_protected( 158062306a36Sopenharmony_ci log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex)); 158162306a36Sopenharmony_ci 158262306a36Sopenharmony_ci if (quiesce) { 158362306a36Sopenharmony_ci /* make sure r5l_write_super_and_discard_space exits */ 158462306a36Sopenharmony_ci wake_up(&mddev->sb_wait); 158562306a36Sopenharmony_ci kthread_park(thread->tsk); 158662306a36Sopenharmony_ci r5l_wake_reclaim(log, MaxSector); 158762306a36Sopenharmony_ci r5l_do_reclaim(log); 158862306a36Sopenharmony_ci } else 158962306a36Sopenharmony_ci kthread_unpark(thread->tsk); 159062306a36Sopenharmony_ci} 159162306a36Sopenharmony_ci 159262306a36Sopenharmony_cibool r5l_log_disk_error(struct r5conf *conf) 159362306a36Sopenharmony_ci{ 159462306a36Sopenharmony_ci struct r5l_log *log = conf->log; 159562306a36Sopenharmony_ci 159662306a36Sopenharmony_ci /* don't allow write if journal disk is missing */ 159762306a36Sopenharmony_ci if (!log) 159862306a36Sopenharmony_ci return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 159962306a36Sopenharmony_ci else 160062306a36Sopenharmony_ci return test_bit(Faulty, &log->rdev->flags); 160162306a36Sopenharmony_ci} 160262306a36Sopenharmony_ci 160362306a36Sopenharmony_ci#define R5L_RECOVERY_PAGE_POOL_SIZE 256 160462306a36Sopenharmony_ci 160562306a36Sopenharmony_cistruct r5l_recovery_ctx { 160662306a36Sopenharmony_ci struct page *meta_page; /* current meta */ 160762306a36Sopenharmony_ci sector_t meta_total_blocks; /* total size of current meta and data */ 160862306a36Sopenharmony_ci sector_t pos; /* recovery position */ 160962306a36Sopenharmony_ci u64 seq; /* recovery position seq */ 161062306a36Sopenharmony_ci int data_parity_stripes; /* number of 
data_parity stripes */ 161162306a36Sopenharmony_ci int data_only_stripes; /* number of data_only stripes */ 161262306a36Sopenharmony_ci struct list_head cached_list; 161362306a36Sopenharmony_ci 161462306a36Sopenharmony_ci /* 161562306a36Sopenharmony_ci * read ahead page pool (ra_pool) 161662306a36Sopenharmony_ci * in recovery, log is read sequentially. It is not efficient to 161762306a36Sopenharmony_ci * read every page with sync_page_io(). The read ahead page pool 161862306a36Sopenharmony_ci * reads multiple pages with one IO, so further log read can 161962306a36Sopenharmony_ci * just copy data from the pool. 162062306a36Sopenharmony_ci */ 162162306a36Sopenharmony_ci struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; 162262306a36Sopenharmony_ci struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE]; 162362306a36Sopenharmony_ci sector_t pool_offset; /* offset of first page in the pool */ 162462306a36Sopenharmony_ci int total_pages; /* total allocated pages */ 162562306a36Sopenharmony_ci int valid_pages; /* pages with valid data */ 162662306a36Sopenharmony_ci}; 162762306a36Sopenharmony_ci 162862306a36Sopenharmony_cistatic int r5l_recovery_allocate_ra_pool(struct r5l_log *log, 162962306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 163062306a36Sopenharmony_ci{ 163162306a36Sopenharmony_ci struct page *page; 163262306a36Sopenharmony_ci 163362306a36Sopenharmony_ci ctx->valid_pages = 0; 163462306a36Sopenharmony_ci ctx->total_pages = 0; 163562306a36Sopenharmony_ci while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { 163662306a36Sopenharmony_ci page = alloc_page(GFP_KERNEL); 163762306a36Sopenharmony_ci 163862306a36Sopenharmony_ci if (!page) 163962306a36Sopenharmony_ci break; 164062306a36Sopenharmony_ci ctx->ra_pool[ctx->total_pages] = page; 164162306a36Sopenharmony_ci ctx->total_pages += 1; 164262306a36Sopenharmony_ci } 164362306a36Sopenharmony_ci 164462306a36Sopenharmony_ci if (ctx->total_pages == 0) 164562306a36Sopenharmony_ci return -ENOMEM; 
164662306a36Sopenharmony_ci 164762306a36Sopenharmony_ci ctx->pool_offset = 0; 164862306a36Sopenharmony_ci return 0; 164962306a36Sopenharmony_ci} 165062306a36Sopenharmony_ci 165162306a36Sopenharmony_cistatic void r5l_recovery_free_ra_pool(struct r5l_log *log, 165262306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 165362306a36Sopenharmony_ci{ 165462306a36Sopenharmony_ci int i; 165562306a36Sopenharmony_ci 165662306a36Sopenharmony_ci for (i = 0; i < ctx->total_pages; ++i) 165762306a36Sopenharmony_ci put_page(ctx->ra_pool[i]); 165862306a36Sopenharmony_ci} 165962306a36Sopenharmony_ci 166062306a36Sopenharmony_ci/* 166162306a36Sopenharmony_ci * fetch ctx->valid_pages pages from offset 166262306a36Sopenharmony_ci * In normal cases, ctx->valid_pages == ctx->total_pages after the call. 166362306a36Sopenharmony_ci * However, if the offset is close to the end of the journal device, 166462306a36Sopenharmony_ci * ctx->valid_pages could be smaller than ctx->total_pages 166562306a36Sopenharmony_ci */ 166662306a36Sopenharmony_cistatic int r5l_recovery_fetch_ra_pool(struct r5l_log *log, 166762306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx, 166862306a36Sopenharmony_ci sector_t offset) 166962306a36Sopenharmony_ci{ 167062306a36Sopenharmony_ci struct bio bio; 167162306a36Sopenharmony_ci int ret; 167262306a36Sopenharmony_ci 167362306a36Sopenharmony_ci bio_init(&bio, log->rdev->bdev, ctx->ra_bvec, 167462306a36Sopenharmony_ci R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ); 167562306a36Sopenharmony_ci bio.bi_iter.bi_sector = log->rdev->data_offset + offset; 167662306a36Sopenharmony_ci 167762306a36Sopenharmony_ci ctx->valid_pages = 0; 167862306a36Sopenharmony_ci ctx->pool_offset = offset; 167962306a36Sopenharmony_ci 168062306a36Sopenharmony_ci while (ctx->valid_pages < ctx->total_pages) { 168162306a36Sopenharmony_ci __bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 168262306a36Sopenharmony_ci 0); 168362306a36Sopenharmony_ci ctx->valid_pages += 1; 168462306a36Sopenharmony_ci 
168562306a36Sopenharmony_ci offset = r5l_ring_add(log, offset, BLOCK_SECTORS); 168662306a36Sopenharmony_ci 168762306a36Sopenharmony_ci if (offset == 0) /* reached end of the device */ 168862306a36Sopenharmony_ci break; 168962306a36Sopenharmony_ci } 169062306a36Sopenharmony_ci 169162306a36Sopenharmony_ci ret = submit_bio_wait(&bio); 169262306a36Sopenharmony_ci bio_uninit(&bio); 169362306a36Sopenharmony_ci return ret; 169462306a36Sopenharmony_ci} 169562306a36Sopenharmony_ci 169662306a36Sopenharmony_ci/* 169762306a36Sopenharmony_ci * try read a page from the read ahead page pool, if the page is not in the 169862306a36Sopenharmony_ci * pool, call r5l_recovery_fetch_ra_pool 169962306a36Sopenharmony_ci */ 170062306a36Sopenharmony_cistatic int r5l_recovery_read_page(struct r5l_log *log, 170162306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx, 170262306a36Sopenharmony_ci struct page *page, 170362306a36Sopenharmony_ci sector_t offset) 170462306a36Sopenharmony_ci{ 170562306a36Sopenharmony_ci int ret; 170662306a36Sopenharmony_ci 170762306a36Sopenharmony_ci if (offset < ctx->pool_offset || 170862306a36Sopenharmony_ci offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) { 170962306a36Sopenharmony_ci ret = r5l_recovery_fetch_ra_pool(log, ctx, offset); 171062306a36Sopenharmony_ci if (ret) 171162306a36Sopenharmony_ci return ret; 171262306a36Sopenharmony_ci } 171362306a36Sopenharmony_ci 171462306a36Sopenharmony_ci BUG_ON(offset < ctx->pool_offset || 171562306a36Sopenharmony_ci offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS); 171662306a36Sopenharmony_ci 171762306a36Sopenharmony_ci memcpy(page_address(page), 171862306a36Sopenharmony_ci page_address(ctx->ra_pool[(offset - ctx->pool_offset) >> 171962306a36Sopenharmony_ci BLOCK_SECTOR_SHIFT]), 172062306a36Sopenharmony_ci PAGE_SIZE); 172162306a36Sopenharmony_ci return 0; 172262306a36Sopenharmony_ci} 172362306a36Sopenharmony_ci 172462306a36Sopenharmony_cistatic int r5l_recovery_read_meta_block(struct r5l_log 
*log, 172562306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 172662306a36Sopenharmony_ci{ 172762306a36Sopenharmony_ci struct page *page = ctx->meta_page; 172862306a36Sopenharmony_ci struct r5l_meta_block *mb; 172962306a36Sopenharmony_ci u32 crc, stored_crc; 173062306a36Sopenharmony_ci int ret; 173162306a36Sopenharmony_ci 173262306a36Sopenharmony_ci ret = r5l_recovery_read_page(log, ctx, page, ctx->pos); 173362306a36Sopenharmony_ci if (ret != 0) 173462306a36Sopenharmony_ci return ret; 173562306a36Sopenharmony_ci 173662306a36Sopenharmony_ci mb = page_address(page); 173762306a36Sopenharmony_ci stored_crc = le32_to_cpu(mb->checksum); 173862306a36Sopenharmony_ci mb->checksum = 0; 173962306a36Sopenharmony_ci 174062306a36Sopenharmony_ci if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 174162306a36Sopenharmony_ci le64_to_cpu(mb->seq) != ctx->seq || 174262306a36Sopenharmony_ci mb->version != R5LOG_VERSION || 174362306a36Sopenharmony_ci le64_to_cpu(mb->position) != ctx->pos) 174462306a36Sopenharmony_ci return -EINVAL; 174562306a36Sopenharmony_ci 174662306a36Sopenharmony_ci crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 174762306a36Sopenharmony_ci if (stored_crc != crc) 174862306a36Sopenharmony_ci return -EINVAL; 174962306a36Sopenharmony_ci 175062306a36Sopenharmony_ci if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 175162306a36Sopenharmony_ci return -EINVAL; 175262306a36Sopenharmony_ci 175362306a36Sopenharmony_ci ctx->meta_total_blocks = BLOCK_SECTORS; 175462306a36Sopenharmony_ci 175562306a36Sopenharmony_ci return 0; 175662306a36Sopenharmony_ci} 175762306a36Sopenharmony_ci 175862306a36Sopenharmony_cistatic void 175962306a36Sopenharmony_cir5l_recovery_create_empty_meta_block(struct r5l_log *log, 176062306a36Sopenharmony_ci struct page *page, 176162306a36Sopenharmony_ci sector_t pos, u64 seq) 176262306a36Sopenharmony_ci{ 176362306a36Sopenharmony_ci struct r5l_meta_block *mb; 176462306a36Sopenharmony_ci 176562306a36Sopenharmony_ci mb = page_address(page); 
176662306a36Sopenharmony_ci clear_page(mb); 176762306a36Sopenharmony_ci mb->magic = cpu_to_le32(R5LOG_MAGIC); 176862306a36Sopenharmony_ci mb->version = R5LOG_VERSION; 176962306a36Sopenharmony_ci mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 177062306a36Sopenharmony_ci mb->seq = cpu_to_le64(seq); 177162306a36Sopenharmony_ci mb->position = cpu_to_le64(pos); 177262306a36Sopenharmony_ci} 177362306a36Sopenharmony_ci 177462306a36Sopenharmony_cistatic int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 177562306a36Sopenharmony_ci u64 seq) 177662306a36Sopenharmony_ci{ 177762306a36Sopenharmony_ci struct page *page; 177862306a36Sopenharmony_ci struct r5l_meta_block *mb; 177962306a36Sopenharmony_ci 178062306a36Sopenharmony_ci page = alloc_page(GFP_KERNEL); 178162306a36Sopenharmony_ci if (!page) 178262306a36Sopenharmony_ci return -ENOMEM; 178362306a36Sopenharmony_ci r5l_recovery_create_empty_meta_block(log, page, pos, seq); 178462306a36Sopenharmony_ci mb = page_address(page); 178562306a36Sopenharmony_ci mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 178662306a36Sopenharmony_ci mb, PAGE_SIZE)); 178762306a36Sopenharmony_ci if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE | 178862306a36Sopenharmony_ci REQ_SYNC | REQ_FUA, false)) { 178962306a36Sopenharmony_ci __free_page(page); 179062306a36Sopenharmony_ci return -EIO; 179162306a36Sopenharmony_ci } 179262306a36Sopenharmony_ci __free_page(page); 179362306a36Sopenharmony_ci return 0; 179462306a36Sopenharmony_ci} 179562306a36Sopenharmony_ci 179662306a36Sopenharmony_ci/* 179762306a36Sopenharmony_ci * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 179862306a36Sopenharmony_ci * to mark valid (potentially not flushed) data in the journal. 179962306a36Sopenharmony_ci * 180062306a36Sopenharmony_ci * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 180162306a36Sopenharmony_ci * so there should not be any mismatch here. 
180262306a36Sopenharmony_ci */ 180362306a36Sopenharmony_cistatic void r5l_recovery_load_data(struct r5l_log *log, 180462306a36Sopenharmony_ci struct stripe_head *sh, 180562306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx, 180662306a36Sopenharmony_ci struct r5l_payload_data_parity *payload, 180762306a36Sopenharmony_ci sector_t log_offset) 180862306a36Sopenharmony_ci{ 180962306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 181062306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 181162306a36Sopenharmony_ci int dd_idx; 181262306a36Sopenharmony_ci 181362306a36Sopenharmony_ci raid5_compute_sector(conf, 181462306a36Sopenharmony_ci le64_to_cpu(payload->location), 0, 181562306a36Sopenharmony_ci &dd_idx, sh); 181662306a36Sopenharmony_ci r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset); 181762306a36Sopenharmony_ci sh->dev[dd_idx].log_checksum = 181862306a36Sopenharmony_ci le32_to_cpu(payload->checksum[0]); 181962306a36Sopenharmony_ci ctx->meta_total_blocks += BLOCK_SECTORS; 182062306a36Sopenharmony_ci 182162306a36Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 182262306a36Sopenharmony_ci set_bit(STRIPE_R5C_CACHING, &sh->state); 182362306a36Sopenharmony_ci} 182462306a36Sopenharmony_ci 182562306a36Sopenharmony_cistatic void r5l_recovery_load_parity(struct r5l_log *log, 182662306a36Sopenharmony_ci struct stripe_head *sh, 182762306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx, 182862306a36Sopenharmony_ci struct r5l_payload_data_parity *payload, 182962306a36Sopenharmony_ci sector_t log_offset) 183062306a36Sopenharmony_ci{ 183162306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 183262306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 183362306a36Sopenharmony_ci 183462306a36Sopenharmony_ci ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 183562306a36Sopenharmony_ci r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset); 183662306a36Sopenharmony_ci 
sh->dev[sh->pd_idx].log_checksum = 183762306a36Sopenharmony_ci le32_to_cpu(payload->checksum[0]); 183862306a36Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 183962306a36Sopenharmony_ci 184062306a36Sopenharmony_ci if (sh->qd_idx >= 0) { 184162306a36Sopenharmony_ci r5l_recovery_read_page( 184262306a36Sopenharmony_ci log, ctx, sh->dev[sh->qd_idx].page, 184362306a36Sopenharmony_ci r5l_ring_add(log, log_offset, BLOCK_SECTORS)); 184462306a36Sopenharmony_ci sh->dev[sh->qd_idx].log_checksum = 184562306a36Sopenharmony_ci le32_to_cpu(payload->checksum[1]); 184662306a36Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 184762306a36Sopenharmony_ci } 184862306a36Sopenharmony_ci clear_bit(STRIPE_R5C_CACHING, &sh->state); 184962306a36Sopenharmony_ci} 185062306a36Sopenharmony_ci 185162306a36Sopenharmony_cistatic void r5l_recovery_reset_stripe(struct stripe_head *sh) 185262306a36Sopenharmony_ci{ 185362306a36Sopenharmony_ci int i; 185462306a36Sopenharmony_ci 185562306a36Sopenharmony_ci sh->state = 0; 185662306a36Sopenharmony_ci sh->log_start = MaxSector; 185762306a36Sopenharmony_ci for (i = sh->disks; i--; ) 185862306a36Sopenharmony_ci sh->dev[i].flags = 0; 185962306a36Sopenharmony_ci} 186062306a36Sopenharmony_ci 186162306a36Sopenharmony_cistatic void 186262306a36Sopenharmony_cir5l_recovery_replay_one_stripe(struct r5conf *conf, 186362306a36Sopenharmony_ci struct stripe_head *sh, 186462306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 186562306a36Sopenharmony_ci{ 186662306a36Sopenharmony_ci struct md_rdev *rdev, *rrdev; 186762306a36Sopenharmony_ci int disk_index; 186862306a36Sopenharmony_ci int data_count = 0; 186962306a36Sopenharmony_ci 187062306a36Sopenharmony_ci for (disk_index = 0; disk_index < sh->disks; disk_index++) { 187162306a36Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 187262306a36Sopenharmony_ci continue; 187362306a36Sopenharmony_ci if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 
187462306a36Sopenharmony_ci continue; 187562306a36Sopenharmony_ci data_count++; 187662306a36Sopenharmony_ci } 187762306a36Sopenharmony_ci 187862306a36Sopenharmony_ci /* 187962306a36Sopenharmony_ci * stripes that only have parity must have been flushed 188062306a36Sopenharmony_ci * before the crash that we are now recovering from, so 188162306a36Sopenharmony_ci * there is nothing more to recovery. 188262306a36Sopenharmony_ci */ 188362306a36Sopenharmony_ci if (data_count == 0) 188462306a36Sopenharmony_ci goto out; 188562306a36Sopenharmony_ci 188662306a36Sopenharmony_ci for (disk_index = 0; disk_index < sh->disks; disk_index++) { 188762306a36Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 188862306a36Sopenharmony_ci continue; 188962306a36Sopenharmony_ci 189062306a36Sopenharmony_ci /* in case device is broken */ 189162306a36Sopenharmony_ci rcu_read_lock(); 189262306a36Sopenharmony_ci rdev = rcu_dereference(conf->disks[disk_index].rdev); 189362306a36Sopenharmony_ci if (rdev) { 189462306a36Sopenharmony_ci atomic_inc(&rdev->nr_pending); 189562306a36Sopenharmony_ci rcu_read_unlock(); 189662306a36Sopenharmony_ci sync_page_io(rdev, sh->sector, PAGE_SIZE, 189762306a36Sopenharmony_ci sh->dev[disk_index].page, REQ_OP_WRITE, 189862306a36Sopenharmony_ci false); 189962306a36Sopenharmony_ci rdev_dec_pending(rdev, rdev->mddev); 190062306a36Sopenharmony_ci rcu_read_lock(); 190162306a36Sopenharmony_ci } 190262306a36Sopenharmony_ci rrdev = rcu_dereference(conf->disks[disk_index].replacement); 190362306a36Sopenharmony_ci if (rrdev) { 190462306a36Sopenharmony_ci atomic_inc(&rrdev->nr_pending); 190562306a36Sopenharmony_ci rcu_read_unlock(); 190662306a36Sopenharmony_ci sync_page_io(rrdev, sh->sector, PAGE_SIZE, 190762306a36Sopenharmony_ci sh->dev[disk_index].page, REQ_OP_WRITE, 190862306a36Sopenharmony_ci false); 190962306a36Sopenharmony_ci rdev_dec_pending(rrdev, rrdev->mddev); 191062306a36Sopenharmony_ci rcu_read_lock(); 191162306a36Sopenharmony_ci } 
191262306a36Sopenharmony_ci rcu_read_unlock(); 191362306a36Sopenharmony_ci } 191462306a36Sopenharmony_ci ctx->data_parity_stripes++; 191562306a36Sopenharmony_ciout: 191662306a36Sopenharmony_ci r5l_recovery_reset_stripe(sh); 191762306a36Sopenharmony_ci} 191862306a36Sopenharmony_ci 191962306a36Sopenharmony_cistatic struct stripe_head * 192062306a36Sopenharmony_cir5c_recovery_alloc_stripe( 192162306a36Sopenharmony_ci struct r5conf *conf, 192262306a36Sopenharmony_ci sector_t stripe_sect, 192362306a36Sopenharmony_ci int noblock) 192462306a36Sopenharmony_ci{ 192562306a36Sopenharmony_ci struct stripe_head *sh; 192662306a36Sopenharmony_ci 192762306a36Sopenharmony_ci sh = raid5_get_active_stripe(conf, NULL, stripe_sect, 192862306a36Sopenharmony_ci noblock ? R5_GAS_NOBLOCK : 0); 192962306a36Sopenharmony_ci if (!sh) 193062306a36Sopenharmony_ci return NULL; /* no more stripe available */ 193162306a36Sopenharmony_ci 193262306a36Sopenharmony_ci r5l_recovery_reset_stripe(sh); 193362306a36Sopenharmony_ci 193462306a36Sopenharmony_ci return sh; 193562306a36Sopenharmony_ci} 193662306a36Sopenharmony_ci 193762306a36Sopenharmony_cistatic struct stripe_head * 193862306a36Sopenharmony_cir5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 193962306a36Sopenharmony_ci{ 194062306a36Sopenharmony_ci struct stripe_head *sh; 194162306a36Sopenharmony_ci 194262306a36Sopenharmony_ci list_for_each_entry(sh, list, lru) 194362306a36Sopenharmony_ci if (sh->sector == sect) 194462306a36Sopenharmony_ci return sh; 194562306a36Sopenharmony_ci return NULL; 194662306a36Sopenharmony_ci} 194762306a36Sopenharmony_ci 194862306a36Sopenharmony_cistatic void 194962306a36Sopenharmony_cir5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 195062306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 195162306a36Sopenharmony_ci{ 195262306a36Sopenharmony_ci struct stripe_head *sh, *next; 195362306a36Sopenharmony_ci 195462306a36Sopenharmony_ci list_for_each_entry_safe(sh, next, cached_stripe_list, 
lru) { 195562306a36Sopenharmony_ci r5l_recovery_reset_stripe(sh); 195662306a36Sopenharmony_ci list_del_init(&sh->lru); 195762306a36Sopenharmony_ci raid5_release_stripe(sh); 195862306a36Sopenharmony_ci } 195962306a36Sopenharmony_ci} 196062306a36Sopenharmony_ci 196162306a36Sopenharmony_cistatic void 196262306a36Sopenharmony_cir5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 196362306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 196462306a36Sopenharmony_ci{ 196562306a36Sopenharmony_ci struct stripe_head *sh, *next; 196662306a36Sopenharmony_ci 196762306a36Sopenharmony_ci list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 196862306a36Sopenharmony_ci if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 196962306a36Sopenharmony_ci r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 197062306a36Sopenharmony_ci list_del_init(&sh->lru); 197162306a36Sopenharmony_ci raid5_release_stripe(sh); 197262306a36Sopenharmony_ci } 197362306a36Sopenharmony_ci} 197462306a36Sopenharmony_ci 197562306a36Sopenharmony_ci/* if matches return 0; otherwise return -EINVAL */ 197662306a36Sopenharmony_cistatic int 197762306a36Sopenharmony_cir5l_recovery_verify_data_checksum(struct r5l_log *log, 197862306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx, 197962306a36Sopenharmony_ci struct page *page, 198062306a36Sopenharmony_ci sector_t log_offset, __le32 log_checksum) 198162306a36Sopenharmony_ci{ 198262306a36Sopenharmony_ci void *addr; 198362306a36Sopenharmony_ci u32 checksum; 198462306a36Sopenharmony_ci 198562306a36Sopenharmony_ci r5l_recovery_read_page(log, ctx, page, log_offset); 198662306a36Sopenharmony_ci addr = kmap_atomic(page); 198762306a36Sopenharmony_ci checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 198862306a36Sopenharmony_ci kunmap_atomic(addr); 198962306a36Sopenharmony_ci return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; 199062306a36Sopenharmony_ci} 199162306a36Sopenharmony_ci 199262306a36Sopenharmony_ci/* 199362306a36Sopenharmony_ci * before loading data to stripe cache, we need verify checksum for all data, 199462306a36Sopenharmony_ci * if there is mismatch for any data page, we drop all data in the mata block 199562306a36Sopenharmony_ci */ 199662306a36Sopenharmony_cistatic int 199762306a36Sopenharmony_cir5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 199862306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 199962306a36Sopenharmony_ci{ 200062306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 200162306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 200262306a36Sopenharmony_ci struct r5l_meta_block *mb = page_address(ctx->meta_page); 200362306a36Sopenharmony_ci sector_t mb_offset = sizeof(struct r5l_meta_block); 200462306a36Sopenharmony_ci sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 200562306a36Sopenharmony_ci struct page *page; 200662306a36Sopenharmony_ci struct r5l_payload_data_parity *payload; 200762306a36Sopenharmony_ci struct r5l_payload_flush *payload_flush; 200862306a36Sopenharmony_ci 200962306a36Sopenharmony_ci page = alloc_page(GFP_KERNEL); 201062306a36Sopenharmony_ci if (!page) 201162306a36Sopenharmony_ci return -ENOMEM; 201262306a36Sopenharmony_ci 201362306a36Sopenharmony_ci while (mb_offset < le32_to_cpu(mb->meta_size)) { 201462306a36Sopenharmony_ci payload = (void *)mb + mb_offset; 201562306a36Sopenharmony_ci payload_flush = (void *)mb + mb_offset; 201662306a36Sopenharmony_ci 201762306a36Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 201862306a36Sopenharmony_ci if (r5l_recovery_verify_data_checksum( 201962306a36Sopenharmony_ci log, ctx, page, log_offset, 202062306a36Sopenharmony_ci payload->checksum[0]) < 0) 202162306a36Sopenharmony_ci goto mismatch; 202262306a36Sopenharmony_ci } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { 
202362306a36Sopenharmony_ci if (r5l_recovery_verify_data_checksum( 202462306a36Sopenharmony_ci log, ctx, page, log_offset, 202562306a36Sopenharmony_ci payload->checksum[0]) < 0) 202662306a36Sopenharmony_ci goto mismatch; 202762306a36Sopenharmony_ci if (conf->max_degraded == 2 && /* q for RAID 6 */ 202862306a36Sopenharmony_ci r5l_recovery_verify_data_checksum( 202962306a36Sopenharmony_ci log, ctx, page, 203062306a36Sopenharmony_ci r5l_ring_add(log, log_offset, 203162306a36Sopenharmony_ci BLOCK_SECTORS), 203262306a36Sopenharmony_ci payload->checksum[1]) < 0) 203362306a36Sopenharmony_ci goto mismatch; 203462306a36Sopenharmony_ci } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { 203562306a36Sopenharmony_ci /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ 203662306a36Sopenharmony_ci } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ 203762306a36Sopenharmony_ci goto mismatch; 203862306a36Sopenharmony_ci 203962306a36Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { 204062306a36Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_flush) + 204162306a36Sopenharmony_ci le32_to_cpu(payload_flush->size); 204262306a36Sopenharmony_ci } else { 204362306a36Sopenharmony_ci /* DATA or PARITY payload */ 204462306a36Sopenharmony_ci log_offset = r5l_ring_add(log, log_offset, 204562306a36Sopenharmony_ci le32_to_cpu(payload->size)); 204662306a36Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_data_parity) + 204762306a36Sopenharmony_ci sizeof(__le32) * 204862306a36Sopenharmony_ci (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 204962306a36Sopenharmony_ci } 205062306a36Sopenharmony_ci 205162306a36Sopenharmony_ci } 205262306a36Sopenharmony_ci 205362306a36Sopenharmony_ci put_page(page); 205462306a36Sopenharmony_ci return 0; 205562306a36Sopenharmony_ci 205662306a36Sopenharmony_cimismatch: 205762306a36Sopenharmony_ci put_page(page); 205862306a36Sopenharmony_ci return -EINVAL; 205962306a36Sopenharmony_ci} 206062306a36Sopenharmony_ci 
206162306a36Sopenharmony_ci/* 206262306a36Sopenharmony_ci * Analyze all data/parity pages in one meta block 206362306a36Sopenharmony_ci * Returns: 206462306a36Sopenharmony_ci * 0 for success 206562306a36Sopenharmony_ci * -EINVAL for unknown playload type 206662306a36Sopenharmony_ci * -EAGAIN for checksum mismatch of data page 206762306a36Sopenharmony_ci * -ENOMEM for run out of memory (alloc_page failed or run out of stripes) 206862306a36Sopenharmony_ci */ 206962306a36Sopenharmony_cistatic int 207062306a36Sopenharmony_cir5c_recovery_analyze_meta_block(struct r5l_log *log, 207162306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx, 207262306a36Sopenharmony_ci struct list_head *cached_stripe_list) 207362306a36Sopenharmony_ci{ 207462306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 207562306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 207662306a36Sopenharmony_ci struct r5l_meta_block *mb; 207762306a36Sopenharmony_ci struct r5l_payload_data_parity *payload; 207862306a36Sopenharmony_ci struct r5l_payload_flush *payload_flush; 207962306a36Sopenharmony_ci int mb_offset; 208062306a36Sopenharmony_ci sector_t log_offset; 208162306a36Sopenharmony_ci sector_t stripe_sect; 208262306a36Sopenharmony_ci struct stripe_head *sh; 208362306a36Sopenharmony_ci int ret; 208462306a36Sopenharmony_ci 208562306a36Sopenharmony_ci /* 208662306a36Sopenharmony_ci * for mismatch in data blocks, we will drop all data in this mb, but 208762306a36Sopenharmony_ci * we will still read next mb for other data with FLUSH flag, as 208862306a36Sopenharmony_ci * io_unit could finish out of order. 
208962306a36Sopenharmony_ci */ 209062306a36Sopenharmony_ci ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx); 209162306a36Sopenharmony_ci if (ret == -EINVAL) 209262306a36Sopenharmony_ci return -EAGAIN; 209362306a36Sopenharmony_ci else if (ret) 209462306a36Sopenharmony_ci return ret; /* -ENOMEM duo to alloc_page() failed */ 209562306a36Sopenharmony_ci 209662306a36Sopenharmony_ci mb = page_address(ctx->meta_page); 209762306a36Sopenharmony_ci mb_offset = sizeof(struct r5l_meta_block); 209862306a36Sopenharmony_ci log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 209962306a36Sopenharmony_ci 210062306a36Sopenharmony_ci while (mb_offset < le32_to_cpu(mb->meta_size)) { 210162306a36Sopenharmony_ci int dd; 210262306a36Sopenharmony_ci 210362306a36Sopenharmony_ci payload = (void *)mb + mb_offset; 210462306a36Sopenharmony_ci payload_flush = (void *)mb + mb_offset; 210562306a36Sopenharmony_ci 210662306a36Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { 210762306a36Sopenharmony_ci int i, count; 210862306a36Sopenharmony_ci 210962306a36Sopenharmony_ci count = le32_to_cpu(payload_flush->size) / sizeof(__le64); 211062306a36Sopenharmony_ci for (i = 0; i < count; ++i) { 211162306a36Sopenharmony_ci stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); 211262306a36Sopenharmony_ci sh = r5c_recovery_lookup_stripe(cached_stripe_list, 211362306a36Sopenharmony_ci stripe_sect); 211462306a36Sopenharmony_ci if (sh) { 211562306a36Sopenharmony_ci WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 211662306a36Sopenharmony_ci r5l_recovery_reset_stripe(sh); 211762306a36Sopenharmony_ci list_del_init(&sh->lru); 211862306a36Sopenharmony_ci raid5_release_stripe(sh); 211962306a36Sopenharmony_ci } 212062306a36Sopenharmony_ci } 212162306a36Sopenharmony_ci 212262306a36Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_flush) + 212362306a36Sopenharmony_ci le32_to_cpu(payload_flush->size); 212462306a36Sopenharmony_ci continue; 212562306a36Sopenharmony_ci } 
212662306a36Sopenharmony_ci 212762306a36Sopenharmony_ci /* DATA or PARITY payload */ 212862306a36Sopenharmony_ci stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? 212962306a36Sopenharmony_ci raid5_compute_sector( 213062306a36Sopenharmony_ci conf, le64_to_cpu(payload->location), 0, &dd, 213162306a36Sopenharmony_ci NULL) 213262306a36Sopenharmony_ci : le64_to_cpu(payload->location); 213362306a36Sopenharmony_ci 213462306a36Sopenharmony_ci sh = r5c_recovery_lookup_stripe(cached_stripe_list, 213562306a36Sopenharmony_ci stripe_sect); 213662306a36Sopenharmony_ci 213762306a36Sopenharmony_ci if (!sh) { 213862306a36Sopenharmony_ci sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1); 213962306a36Sopenharmony_ci /* 214062306a36Sopenharmony_ci * cannot get stripe from raid5_get_active_stripe 214162306a36Sopenharmony_ci * try replay some stripes 214262306a36Sopenharmony_ci */ 214362306a36Sopenharmony_ci if (!sh) { 214462306a36Sopenharmony_ci r5c_recovery_replay_stripes( 214562306a36Sopenharmony_ci cached_stripe_list, ctx); 214662306a36Sopenharmony_ci sh = r5c_recovery_alloc_stripe( 214762306a36Sopenharmony_ci conf, stripe_sect, 1); 214862306a36Sopenharmony_ci } 214962306a36Sopenharmony_ci if (!sh) { 215062306a36Sopenharmony_ci int new_size = conf->min_nr_stripes * 2; 215162306a36Sopenharmony_ci pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", 215262306a36Sopenharmony_ci mdname(mddev), 215362306a36Sopenharmony_ci new_size); 215462306a36Sopenharmony_ci ret = raid5_set_cache_size(mddev, new_size); 215562306a36Sopenharmony_ci if (conf->min_nr_stripes <= new_size / 2) { 215662306a36Sopenharmony_ci pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n", 215762306a36Sopenharmony_ci mdname(mddev), 215862306a36Sopenharmony_ci ret, 215962306a36Sopenharmony_ci new_size, 216062306a36Sopenharmony_ci conf->min_nr_stripes, 216162306a36Sopenharmony_ci conf->max_nr_stripes); 
216262306a36Sopenharmony_ci return -ENOMEM; 216362306a36Sopenharmony_ci } 216462306a36Sopenharmony_ci sh = r5c_recovery_alloc_stripe( 216562306a36Sopenharmony_ci conf, stripe_sect, 0); 216662306a36Sopenharmony_ci } 216762306a36Sopenharmony_ci if (!sh) { 216862306a36Sopenharmony_ci pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", 216962306a36Sopenharmony_ci mdname(mddev)); 217062306a36Sopenharmony_ci return -ENOMEM; 217162306a36Sopenharmony_ci } 217262306a36Sopenharmony_ci list_add_tail(&sh->lru, cached_stripe_list); 217362306a36Sopenharmony_ci } 217462306a36Sopenharmony_ci 217562306a36Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 217662306a36Sopenharmony_ci if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 217762306a36Sopenharmony_ci test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { 217862306a36Sopenharmony_ci r5l_recovery_replay_one_stripe(conf, sh, ctx); 217962306a36Sopenharmony_ci list_move_tail(&sh->lru, cached_stripe_list); 218062306a36Sopenharmony_ci } 218162306a36Sopenharmony_ci r5l_recovery_load_data(log, sh, ctx, payload, 218262306a36Sopenharmony_ci log_offset); 218362306a36Sopenharmony_ci } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) 218462306a36Sopenharmony_ci r5l_recovery_load_parity(log, sh, ctx, payload, 218562306a36Sopenharmony_ci log_offset); 218662306a36Sopenharmony_ci else 218762306a36Sopenharmony_ci return -EINVAL; 218862306a36Sopenharmony_ci 218962306a36Sopenharmony_ci log_offset = r5l_ring_add(log, log_offset, 219062306a36Sopenharmony_ci le32_to_cpu(payload->size)); 219162306a36Sopenharmony_ci 219262306a36Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_data_parity) + 219362306a36Sopenharmony_ci sizeof(__le32) * 219462306a36Sopenharmony_ci (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 219562306a36Sopenharmony_ci } 219662306a36Sopenharmony_ci 219762306a36Sopenharmony_ci return 0; 219862306a36Sopenharmony_ci} 219962306a36Sopenharmony_ci 
220062306a36Sopenharmony_ci/* 220162306a36Sopenharmony_ci * Load the stripe into cache. The stripe will be written out later by 220262306a36Sopenharmony_ci * the stripe cache state machine. 220362306a36Sopenharmony_ci */ 220462306a36Sopenharmony_cistatic void r5c_recovery_load_one_stripe(struct r5l_log *log, 220562306a36Sopenharmony_ci struct stripe_head *sh) 220662306a36Sopenharmony_ci{ 220762306a36Sopenharmony_ci struct r5dev *dev; 220862306a36Sopenharmony_ci int i; 220962306a36Sopenharmony_ci 221062306a36Sopenharmony_ci for (i = sh->disks; i--; ) { 221162306a36Sopenharmony_ci dev = sh->dev + i; 221262306a36Sopenharmony_ci if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { 221362306a36Sopenharmony_ci set_bit(R5_InJournal, &dev->flags); 221462306a36Sopenharmony_ci set_bit(R5_UPTODATE, &dev->flags); 221562306a36Sopenharmony_ci } 221662306a36Sopenharmony_ci } 221762306a36Sopenharmony_ci} 221862306a36Sopenharmony_ci 221962306a36Sopenharmony_ci/* 222062306a36Sopenharmony_ci * Scan through the log for all to-be-flushed data 222162306a36Sopenharmony_ci * 222262306a36Sopenharmony_ci * For stripes with data and parity, namely Data-Parity stripe 222362306a36Sopenharmony_ci * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. 222462306a36Sopenharmony_ci * 222562306a36Sopenharmony_ci * For stripes with only data, namely Data-Only stripe 222662306a36Sopenharmony_ci * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. 222762306a36Sopenharmony_ci * 222862306a36Sopenharmony_ci * For a stripe, if we see data after parity, we should discard all previous 222962306a36Sopenharmony_ci * data and parity for this stripe, as these data are already flushed to 223062306a36Sopenharmony_ci * the array. 223162306a36Sopenharmony_ci * 223262306a36Sopenharmony_ci * At the end of the scan, we return the new journal_tail, which points to 223362306a36Sopenharmony_ci * first data-only stripe on the journal device, or next invalid meta block. 
223462306a36Sopenharmony_ci */ 223562306a36Sopenharmony_cistatic int r5c_recovery_flush_log(struct r5l_log *log, 223662306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 223762306a36Sopenharmony_ci{ 223862306a36Sopenharmony_ci struct stripe_head *sh; 223962306a36Sopenharmony_ci int ret = 0; 224062306a36Sopenharmony_ci 224162306a36Sopenharmony_ci /* scan through the log */ 224262306a36Sopenharmony_ci while (1) { 224362306a36Sopenharmony_ci if (r5l_recovery_read_meta_block(log, ctx)) 224462306a36Sopenharmony_ci break; 224562306a36Sopenharmony_ci 224662306a36Sopenharmony_ci ret = r5c_recovery_analyze_meta_block(log, ctx, 224762306a36Sopenharmony_ci &ctx->cached_list); 224862306a36Sopenharmony_ci /* 224962306a36Sopenharmony_ci * -EAGAIN means mismatch in data block, in this case, we still 225062306a36Sopenharmony_ci * try scan the next metablock 225162306a36Sopenharmony_ci */ 225262306a36Sopenharmony_ci if (ret && ret != -EAGAIN) 225362306a36Sopenharmony_ci break; /* ret == -EINVAL or -ENOMEM */ 225462306a36Sopenharmony_ci ctx->seq++; 225562306a36Sopenharmony_ci ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); 225662306a36Sopenharmony_ci } 225762306a36Sopenharmony_ci 225862306a36Sopenharmony_ci if (ret == -ENOMEM) { 225962306a36Sopenharmony_ci r5c_recovery_drop_stripes(&ctx->cached_list, ctx); 226062306a36Sopenharmony_ci return ret; 226162306a36Sopenharmony_ci } 226262306a36Sopenharmony_ci 226362306a36Sopenharmony_ci /* replay data-parity stripes */ 226462306a36Sopenharmony_ci r5c_recovery_replay_stripes(&ctx->cached_list, ctx); 226562306a36Sopenharmony_ci 226662306a36Sopenharmony_ci /* load data-only stripes to stripe cache */ 226762306a36Sopenharmony_ci list_for_each_entry(sh, &ctx->cached_list, lru) { 226862306a36Sopenharmony_ci WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 226962306a36Sopenharmony_ci r5c_recovery_load_one_stripe(log, sh); 227062306a36Sopenharmony_ci ctx->data_only_stripes++; 227162306a36Sopenharmony_ci } 
227262306a36Sopenharmony_ci 227362306a36Sopenharmony_ci return 0; 227462306a36Sopenharmony_ci} 227562306a36Sopenharmony_ci 227662306a36Sopenharmony_ci/* 227762306a36Sopenharmony_ci * we did a recovery. Now ctx.pos points to an invalid meta block. New 227862306a36Sopenharmony_ci * log will start here. but we can't let superblock point to last valid 227962306a36Sopenharmony_ci * meta block. The log might looks like: 228062306a36Sopenharmony_ci * | meta 1| meta 2| meta 3| 228162306a36Sopenharmony_ci * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If 228262306a36Sopenharmony_ci * superblock points to meta 1, we write a new valid meta 2n. if crash 228362306a36Sopenharmony_ci * happens again, new recovery will start from meta 1. Since meta 2n is 228462306a36Sopenharmony_ci * valid now, recovery will think meta 3 is valid, which is wrong. 228562306a36Sopenharmony_ci * The solution is we create a new meta in meta2 with its seq == meta 228662306a36Sopenharmony_ci * 1's seq + 10000 and let superblock points to meta2. 
 * The same recovery
 * will not think meta 3 is a valid meta, because its seq doesn't match
 */

/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase seq number by 10000 to avoid
 * confusing next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                              ^
 *   |- log->last_checkpoint        |- ctx->pos+1
 *   |- log->last_cp_seq            |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data only
 * parities are not yet secured in RAID. To save these data only parities, we
 * rewrite them from seq+10001.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                ^
 *   |- log->last_checkpoint                          |- ctx->pos+n
 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
 *
 * If failure happens again during this process, the recovery can safely start
 * again from log->last_checkpoint.
 *
 * Once data only stripes are rewritten to journal, we move log_tail
 *
 *   -----------------------------------------------------------------
 *   |     old log        |    data only stripes    | invalid log  |
 *   -----------------------------------------------------------------
 *                        ^                         ^
 *                        |- log->last_checkpoint   |- ctx->pos+n
 *                        |- log->last_cp_seq       |- ctx->seq+10000+n
 *
 * Then we can safely start the state machine. If failure happens from this
 * point on, the recovery will start from new log->last_checkpoint.
234462306a36Sopenharmony_ci */ 234562306a36Sopenharmony_cistatic int 234662306a36Sopenharmony_cir5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 234762306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 234862306a36Sopenharmony_ci{ 234962306a36Sopenharmony_ci struct stripe_head *sh; 235062306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 235162306a36Sopenharmony_ci struct page *page; 235262306a36Sopenharmony_ci sector_t next_checkpoint = MaxSector; 235362306a36Sopenharmony_ci 235462306a36Sopenharmony_ci page = alloc_page(GFP_KERNEL); 235562306a36Sopenharmony_ci if (!page) { 235662306a36Sopenharmony_ci pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 235762306a36Sopenharmony_ci mdname(mddev)); 235862306a36Sopenharmony_ci return -ENOMEM; 235962306a36Sopenharmony_ci } 236062306a36Sopenharmony_ci 236162306a36Sopenharmony_ci WARN_ON(list_empty(&ctx->cached_list)); 236262306a36Sopenharmony_ci 236362306a36Sopenharmony_ci list_for_each_entry(sh, &ctx->cached_list, lru) { 236462306a36Sopenharmony_ci struct r5l_meta_block *mb; 236562306a36Sopenharmony_ci int i; 236662306a36Sopenharmony_ci int offset; 236762306a36Sopenharmony_ci sector_t write_pos; 236862306a36Sopenharmony_ci 236962306a36Sopenharmony_ci WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 237062306a36Sopenharmony_ci r5l_recovery_create_empty_meta_block(log, page, 237162306a36Sopenharmony_ci ctx->pos, ctx->seq); 237262306a36Sopenharmony_ci mb = page_address(page); 237362306a36Sopenharmony_ci offset = le32_to_cpu(mb->meta_size); 237462306a36Sopenharmony_ci write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 237562306a36Sopenharmony_ci 237662306a36Sopenharmony_ci for (i = sh->disks; i--; ) { 237762306a36Sopenharmony_ci struct r5dev *dev = &sh->dev[i]; 237862306a36Sopenharmony_ci struct r5l_payload_data_parity *payload; 237962306a36Sopenharmony_ci void *addr; 238062306a36Sopenharmony_ci 238162306a36Sopenharmony_ci if (test_bit(R5_InJournal, &dev->flags)) { 
238262306a36Sopenharmony_ci payload = (void *)mb + offset; 238362306a36Sopenharmony_ci payload->header.type = cpu_to_le16( 238462306a36Sopenharmony_ci R5LOG_PAYLOAD_DATA); 238562306a36Sopenharmony_ci payload->size = cpu_to_le32(BLOCK_SECTORS); 238662306a36Sopenharmony_ci payload->location = cpu_to_le64( 238762306a36Sopenharmony_ci raid5_compute_blocknr(sh, i, 0)); 238862306a36Sopenharmony_ci addr = kmap_atomic(dev->page); 238962306a36Sopenharmony_ci payload->checksum[0] = cpu_to_le32( 239062306a36Sopenharmony_ci crc32c_le(log->uuid_checksum, addr, 239162306a36Sopenharmony_ci PAGE_SIZE)); 239262306a36Sopenharmony_ci kunmap_atomic(addr); 239362306a36Sopenharmony_ci sync_page_io(log->rdev, write_pos, PAGE_SIZE, 239462306a36Sopenharmony_ci dev->page, REQ_OP_WRITE, false); 239562306a36Sopenharmony_ci write_pos = r5l_ring_add(log, write_pos, 239662306a36Sopenharmony_ci BLOCK_SECTORS); 239762306a36Sopenharmony_ci offset += sizeof(__le32) + 239862306a36Sopenharmony_ci sizeof(struct r5l_payload_data_parity); 239962306a36Sopenharmony_ci 240062306a36Sopenharmony_ci } 240162306a36Sopenharmony_ci } 240262306a36Sopenharmony_ci mb->meta_size = cpu_to_le32(offset); 240362306a36Sopenharmony_ci mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 240462306a36Sopenharmony_ci mb, PAGE_SIZE)); 240562306a36Sopenharmony_ci sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 240662306a36Sopenharmony_ci REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false); 240762306a36Sopenharmony_ci sh->log_start = ctx->pos; 240862306a36Sopenharmony_ci list_add_tail(&sh->r5c, &log->stripe_in_journal_list); 240962306a36Sopenharmony_ci atomic_inc(&log->stripe_in_journal_count); 241062306a36Sopenharmony_ci ctx->pos = write_pos; 241162306a36Sopenharmony_ci ctx->seq += 1; 241262306a36Sopenharmony_ci next_checkpoint = sh->log_start; 241362306a36Sopenharmony_ci } 241462306a36Sopenharmony_ci log->next_checkpoint = next_checkpoint; 241562306a36Sopenharmony_ci __free_page(page); 241662306a36Sopenharmony_ci return 0; 
241762306a36Sopenharmony_ci} 241862306a36Sopenharmony_ci 241962306a36Sopenharmony_cistatic void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, 242062306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx) 242162306a36Sopenharmony_ci{ 242262306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 242362306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 242462306a36Sopenharmony_ci struct stripe_head *sh, *next; 242562306a36Sopenharmony_ci bool cleared_pending = false; 242662306a36Sopenharmony_ci 242762306a36Sopenharmony_ci if (ctx->data_only_stripes == 0) 242862306a36Sopenharmony_ci return; 242962306a36Sopenharmony_ci 243062306a36Sopenharmony_ci if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 243162306a36Sopenharmony_ci cleared_pending = true; 243262306a36Sopenharmony_ci clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 243362306a36Sopenharmony_ci } 243462306a36Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; 243562306a36Sopenharmony_ci 243662306a36Sopenharmony_ci list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 243762306a36Sopenharmony_ci r5c_make_stripe_write_out(sh); 243862306a36Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 243962306a36Sopenharmony_ci list_del_init(&sh->lru); 244062306a36Sopenharmony_ci raid5_release_stripe(sh); 244162306a36Sopenharmony_ci } 244262306a36Sopenharmony_ci 244362306a36Sopenharmony_ci /* reuse conf->wait_for_quiescent in recovery */ 244462306a36Sopenharmony_ci wait_event(conf->wait_for_quiescent, 244562306a36Sopenharmony_ci atomic_read(&conf->active_stripes) == 0); 244662306a36Sopenharmony_ci 244762306a36Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 244862306a36Sopenharmony_ci if (cleared_pending) 244962306a36Sopenharmony_ci set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 245062306a36Sopenharmony_ci} 245162306a36Sopenharmony_ci 245262306a36Sopenharmony_cistatic int r5l_recovery_log(struct r5l_log *log) 245362306a36Sopenharmony_ci{ 
245462306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 245562306a36Sopenharmony_ci struct r5l_recovery_ctx *ctx; 245662306a36Sopenharmony_ci int ret; 245762306a36Sopenharmony_ci sector_t pos; 245862306a36Sopenharmony_ci 245962306a36Sopenharmony_ci ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 246062306a36Sopenharmony_ci if (!ctx) 246162306a36Sopenharmony_ci return -ENOMEM; 246262306a36Sopenharmony_ci 246362306a36Sopenharmony_ci ctx->pos = log->last_checkpoint; 246462306a36Sopenharmony_ci ctx->seq = log->last_cp_seq; 246562306a36Sopenharmony_ci INIT_LIST_HEAD(&ctx->cached_list); 246662306a36Sopenharmony_ci ctx->meta_page = alloc_page(GFP_KERNEL); 246762306a36Sopenharmony_ci 246862306a36Sopenharmony_ci if (!ctx->meta_page) { 246962306a36Sopenharmony_ci ret = -ENOMEM; 247062306a36Sopenharmony_ci goto meta_page; 247162306a36Sopenharmony_ci } 247262306a36Sopenharmony_ci 247362306a36Sopenharmony_ci if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) { 247462306a36Sopenharmony_ci ret = -ENOMEM; 247562306a36Sopenharmony_ci goto ra_pool; 247662306a36Sopenharmony_ci } 247762306a36Sopenharmony_ci 247862306a36Sopenharmony_ci ret = r5c_recovery_flush_log(log, ctx); 247962306a36Sopenharmony_ci 248062306a36Sopenharmony_ci if (ret) 248162306a36Sopenharmony_ci goto error; 248262306a36Sopenharmony_ci 248362306a36Sopenharmony_ci pos = ctx->pos; 248462306a36Sopenharmony_ci ctx->seq += 10000; 248562306a36Sopenharmony_ci 248662306a36Sopenharmony_ci if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) 248762306a36Sopenharmony_ci pr_info("md/raid:%s: starting from clean shutdown\n", 248862306a36Sopenharmony_ci mdname(mddev)); 248962306a36Sopenharmony_ci else 249062306a36Sopenharmony_ci pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 249162306a36Sopenharmony_ci mdname(mddev), ctx->data_only_stripes, 249262306a36Sopenharmony_ci ctx->data_parity_stripes); 249362306a36Sopenharmony_ci 249462306a36Sopenharmony_ci if 
(ctx->data_only_stripes == 0) { 249562306a36Sopenharmony_ci log->next_checkpoint = ctx->pos; 249662306a36Sopenharmony_ci r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++); 249762306a36Sopenharmony_ci ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 249862306a36Sopenharmony_ci } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) { 249962306a36Sopenharmony_ci pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 250062306a36Sopenharmony_ci mdname(mddev)); 250162306a36Sopenharmony_ci ret = -EIO; 250262306a36Sopenharmony_ci goto error; 250362306a36Sopenharmony_ci } 250462306a36Sopenharmony_ci 250562306a36Sopenharmony_ci log->log_start = ctx->pos; 250662306a36Sopenharmony_ci log->seq = ctx->seq; 250762306a36Sopenharmony_ci log->last_checkpoint = pos; 250862306a36Sopenharmony_ci r5l_write_super(log, pos); 250962306a36Sopenharmony_ci 251062306a36Sopenharmony_ci r5c_recovery_flush_data_only_stripes(log, ctx); 251162306a36Sopenharmony_ci ret = 0; 251262306a36Sopenharmony_cierror: 251362306a36Sopenharmony_ci r5l_recovery_free_ra_pool(log, ctx); 251462306a36Sopenharmony_cira_pool: 251562306a36Sopenharmony_ci __free_page(ctx->meta_page); 251662306a36Sopenharmony_cimeta_page: 251762306a36Sopenharmony_ci kfree(ctx); 251862306a36Sopenharmony_ci return ret; 251962306a36Sopenharmony_ci} 252062306a36Sopenharmony_ci 252162306a36Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp) 252262306a36Sopenharmony_ci{ 252362306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 252462306a36Sopenharmony_ci 252562306a36Sopenharmony_ci log->rdev->journal_tail = cp; 252662306a36Sopenharmony_ci set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 252762306a36Sopenharmony_ci} 252862306a36Sopenharmony_ci 252962306a36Sopenharmony_cistatic ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 253062306a36Sopenharmony_ci{ 253162306a36Sopenharmony_ci struct r5conf *conf; 253262306a36Sopenharmony_ci int ret; 253362306a36Sopenharmony_ci 
253462306a36Sopenharmony_ci ret = mddev_lock(mddev); 253562306a36Sopenharmony_ci if (ret) 253662306a36Sopenharmony_ci return ret; 253762306a36Sopenharmony_ci 253862306a36Sopenharmony_ci conf = mddev->private; 253962306a36Sopenharmony_ci if (!conf || !conf->log) 254062306a36Sopenharmony_ci goto out_unlock; 254162306a36Sopenharmony_ci 254262306a36Sopenharmony_ci switch (conf->log->r5c_journal_mode) { 254362306a36Sopenharmony_ci case R5C_JOURNAL_MODE_WRITE_THROUGH: 254462306a36Sopenharmony_ci ret = snprintf( 254562306a36Sopenharmony_ci page, PAGE_SIZE, "[%s] %s\n", 254662306a36Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 254762306a36Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 254862306a36Sopenharmony_ci break; 254962306a36Sopenharmony_ci case R5C_JOURNAL_MODE_WRITE_BACK: 255062306a36Sopenharmony_ci ret = snprintf( 255162306a36Sopenharmony_ci page, PAGE_SIZE, "%s [%s]\n", 255262306a36Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 255362306a36Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 255462306a36Sopenharmony_ci break; 255562306a36Sopenharmony_ci default: 255662306a36Sopenharmony_ci ret = 0; 255762306a36Sopenharmony_ci } 255862306a36Sopenharmony_ci 255962306a36Sopenharmony_ciout_unlock: 256062306a36Sopenharmony_ci mddev_unlock(mddev); 256162306a36Sopenharmony_ci return ret; 256262306a36Sopenharmony_ci} 256362306a36Sopenharmony_ci 256462306a36Sopenharmony_ci/* 256562306a36Sopenharmony_ci * Set journal cache mode on @mddev (external API initially needed by dm-raid). 256662306a36Sopenharmony_ci * 256762306a36Sopenharmony_ci * @mode as defined in 'enum r5c_journal_mode'. 
256862306a36Sopenharmony_ci * 256962306a36Sopenharmony_ci */ 257062306a36Sopenharmony_ciint r5c_journal_mode_set(struct mddev *mddev, int mode) 257162306a36Sopenharmony_ci{ 257262306a36Sopenharmony_ci struct r5conf *conf; 257362306a36Sopenharmony_ci 257462306a36Sopenharmony_ci if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || 257562306a36Sopenharmony_ci mode > R5C_JOURNAL_MODE_WRITE_BACK) 257662306a36Sopenharmony_ci return -EINVAL; 257762306a36Sopenharmony_ci 257862306a36Sopenharmony_ci conf = mddev->private; 257962306a36Sopenharmony_ci if (!conf || !conf->log) 258062306a36Sopenharmony_ci return -ENODEV; 258162306a36Sopenharmony_ci 258262306a36Sopenharmony_ci if (raid5_calc_degraded(conf) > 0 && 258362306a36Sopenharmony_ci mode == R5C_JOURNAL_MODE_WRITE_BACK) 258462306a36Sopenharmony_ci return -EINVAL; 258562306a36Sopenharmony_ci 258662306a36Sopenharmony_ci mddev_suspend(mddev); 258762306a36Sopenharmony_ci conf->log->r5c_journal_mode = mode; 258862306a36Sopenharmony_ci mddev_resume(mddev); 258962306a36Sopenharmony_ci 259062306a36Sopenharmony_ci pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 259162306a36Sopenharmony_ci mdname(mddev), mode, r5c_journal_mode_str[mode]); 259262306a36Sopenharmony_ci return 0; 259362306a36Sopenharmony_ci} 259462306a36Sopenharmony_ciEXPORT_SYMBOL(r5c_journal_mode_set); 259562306a36Sopenharmony_ci 259662306a36Sopenharmony_cistatic ssize_t r5c_journal_mode_store(struct mddev *mddev, 259762306a36Sopenharmony_ci const char *page, size_t length) 259862306a36Sopenharmony_ci{ 259962306a36Sopenharmony_ci int mode = ARRAY_SIZE(r5c_journal_mode_str); 260062306a36Sopenharmony_ci size_t len = length; 260162306a36Sopenharmony_ci int ret; 260262306a36Sopenharmony_ci 260362306a36Sopenharmony_ci if (len < 2) 260462306a36Sopenharmony_ci return -EINVAL; 260562306a36Sopenharmony_ci 260662306a36Sopenharmony_ci if (page[len - 1] == '\n') 260762306a36Sopenharmony_ci len--; 260862306a36Sopenharmony_ci 260962306a36Sopenharmony_ci while (mode--) 
261062306a36Sopenharmony_ci if (strlen(r5c_journal_mode_str[mode]) == len && 261162306a36Sopenharmony_ci !strncmp(page, r5c_journal_mode_str[mode], len)) 261262306a36Sopenharmony_ci break; 261362306a36Sopenharmony_ci ret = mddev_lock(mddev); 261462306a36Sopenharmony_ci if (ret) 261562306a36Sopenharmony_ci return ret; 261662306a36Sopenharmony_ci ret = r5c_journal_mode_set(mddev, mode); 261762306a36Sopenharmony_ci mddev_unlock(mddev); 261862306a36Sopenharmony_ci return ret ?: length; 261962306a36Sopenharmony_ci} 262062306a36Sopenharmony_ci 262162306a36Sopenharmony_cistruct md_sysfs_entry 262262306a36Sopenharmony_cir5c_journal_mode = __ATTR(journal_mode, 0644, 262362306a36Sopenharmony_ci r5c_journal_mode_show, r5c_journal_mode_store); 262462306a36Sopenharmony_ci 262562306a36Sopenharmony_ci/* 262662306a36Sopenharmony_ci * Try handle write operation in caching phase. This function should only 262762306a36Sopenharmony_ci * be called in write-back mode. 262862306a36Sopenharmony_ci * 262962306a36Sopenharmony_ci * If all outstanding writes can be handled in caching phase, returns 0 263062306a36Sopenharmony_ci * If writes requires write-out phase, call r5c_make_stripe_write_out() 263162306a36Sopenharmony_ci * and returns -EAGAIN 263262306a36Sopenharmony_ci */ 263362306a36Sopenharmony_ciint r5c_try_caching_write(struct r5conf *conf, 263462306a36Sopenharmony_ci struct stripe_head *sh, 263562306a36Sopenharmony_ci struct stripe_head_state *s, 263662306a36Sopenharmony_ci int disks) 263762306a36Sopenharmony_ci{ 263862306a36Sopenharmony_ci struct r5l_log *log = conf->log; 263962306a36Sopenharmony_ci int i; 264062306a36Sopenharmony_ci struct r5dev *dev; 264162306a36Sopenharmony_ci int to_cache = 0; 264262306a36Sopenharmony_ci void __rcu **pslot; 264362306a36Sopenharmony_ci sector_t tree_index; 264462306a36Sopenharmony_ci int ret; 264562306a36Sopenharmony_ci uintptr_t refcount; 264662306a36Sopenharmony_ci 264762306a36Sopenharmony_ci BUG_ON(!r5c_is_writeback(log)); 
264862306a36Sopenharmony_ci 264962306a36Sopenharmony_ci if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 265062306a36Sopenharmony_ci /* 265162306a36Sopenharmony_ci * There are two different scenarios here: 265262306a36Sopenharmony_ci * 1. The stripe has some data cached, and it is sent to 265362306a36Sopenharmony_ci * write-out phase for reclaim 265462306a36Sopenharmony_ci * 2. The stripe is clean, and this is the first write 265562306a36Sopenharmony_ci * 265662306a36Sopenharmony_ci * For 1, return -EAGAIN, so we continue with 265762306a36Sopenharmony_ci * handle_stripe_dirtying(). 265862306a36Sopenharmony_ci * 265962306a36Sopenharmony_ci * For 2, set STRIPE_R5C_CACHING and continue with caching 266062306a36Sopenharmony_ci * write. 266162306a36Sopenharmony_ci */ 266262306a36Sopenharmony_ci 266362306a36Sopenharmony_ci /* case 1: anything injournal or anything in written */ 266462306a36Sopenharmony_ci if (s->injournal > 0 || s->written > 0) 266562306a36Sopenharmony_ci return -EAGAIN; 266662306a36Sopenharmony_ci /* case 2 */ 266762306a36Sopenharmony_ci set_bit(STRIPE_R5C_CACHING, &sh->state); 266862306a36Sopenharmony_ci } 266962306a36Sopenharmony_ci 267062306a36Sopenharmony_ci /* 267162306a36Sopenharmony_ci * When run in degraded mode, array is set to write-through mode. 267262306a36Sopenharmony_ci * This check helps drain pending write safely in the transition to 267362306a36Sopenharmony_ci * write-through mode. 267462306a36Sopenharmony_ci * 267562306a36Sopenharmony_ci * When a stripe is syncing, the write is also handled in write 267662306a36Sopenharmony_ci * through mode. 
267762306a36Sopenharmony_ci */ 267862306a36Sopenharmony_ci if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) { 267962306a36Sopenharmony_ci r5c_make_stripe_write_out(sh); 268062306a36Sopenharmony_ci return -EAGAIN; 268162306a36Sopenharmony_ci } 268262306a36Sopenharmony_ci 268362306a36Sopenharmony_ci for (i = disks; i--; ) { 268462306a36Sopenharmony_ci dev = &sh->dev[i]; 268562306a36Sopenharmony_ci /* if non-overwrite, use writing-out phase */ 268662306a36Sopenharmony_ci if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 268762306a36Sopenharmony_ci !test_bit(R5_InJournal, &dev->flags)) { 268862306a36Sopenharmony_ci r5c_make_stripe_write_out(sh); 268962306a36Sopenharmony_ci return -EAGAIN; 269062306a36Sopenharmony_ci } 269162306a36Sopenharmony_ci } 269262306a36Sopenharmony_ci 269362306a36Sopenharmony_ci /* if the stripe is not counted in big_stripe_tree, add it now */ 269462306a36Sopenharmony_ci if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 269562306a36Sopenharmony_ci !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 269662306a36Sopenharmony_ci tree_index = r5c_tree_index(conf, sh->sector); 269762306a36Sopenharmony_ci spin_lock(&log->tree_lock); 269862306a36Sopenharmony_ci pslot = radix_tree_lookup_slot(&log->big_stripe_tree, 269962306a36Sopenharmony_ci tree_index); 270062306a36Sopenharmony_ci if (pslot) { 270162306a36Sopenharmony_ci refcount = (uintptr_t)radix_tree_deref_slot_protected( 270262306a36Sopenharmony_ci pslot, &log->tree_lock) >> 270362306a36Sopenharmony_ci R5C_RADIX_COUNT_SHIFT; 270462306a36Sopenharmony_ci radix_tree_replace_slot( 270562306a36Sopenharmony_ci &log->big_stripe_tree, pslot, 270662306a36Sopenharmony_ci (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); 270762306a36Sopenharmony_ci } else { 270862306a36Sopenharmony_ci /* 270962306a36Sopenharmony_ci * this radix_tree_insert can fail safely, so no 271062306a36Sopenharmony_ci * need to call radix_tree_preload() 271162306a36Sopenharmony_ci */ 271262306a36Sopenharmony_ci ret 
= radix_tree_insert( 271362306a36Sopenharmony_ci &log->big_stripe_tree, tree_index, 271462306a36Sopenharmony_ci (void *)(1 << R5C_RADIX_COUNT_SHIFT)); 271562306a36Sopenharmony_ci if (ret) { 271662306a36Sopenharmony_ci spin_unlock(&log->tree_lock); 271762306a36Sopenharmony_ci r5c_make_stripe_write_out(sh); 271862306a36Sopenharmony_ci return -EAGAIN; 271962306a36Sopenharmony_ci } 272062306a36Sopenharmony_ci } 272162306a36Sopenharmony_ci spin_unlock(&log->tree_lock); 272262306a36Sopenharmony_ci 272362306a36Sopenharmony_ci /* 272462306a36Sopenharmony_ci * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is 272562306a36Sopenharmony_ci * counted in the radix tree 272662306a36Sopenharmony_ci */ 272762306a36Sopenharmony_ci set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); 272862306a36Sopenharmony_ci atomic_inc(&conf->r5c_cached_partial_stripes); 272962306a36Sopenharmony_ci } 273062306a36Sopenharmony_ci 273162306a36Sopenharmony_ci for (i = disks; i--; ) { 273262306a36Sopenharmony_ci dev = &sh->dev[i]; 273362306a36Sopenharmony_ci if (dev->towrite) { 273462306a36Sopenharmony_ci set_bit(R5_Wantwrite, &dev->flags); 273562306a36Sopenharmony_ci set_bit(R5_Wantdrain, &dev->flags); 273662306a36Sopenharmony_ci set_bit(R5_LOCKED, &dev->flags); 273762306a36Sopenharmony_ci to_cache++; 273862306a36Sopenharmony_ci } 273962306a36Sopenharmony_ci } 274062306a36Sopenharmony_ci 274162306a36Sopenharmony_ci if (to_cache) { 274262306a36Sopenharmony_ci set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 274362306a36Sopenharmony_ci /* 274462306a36Sopenharmony_ci * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 274562306a36Sopenharmony_ci * in ops_run_io(). 
STRIPE_LOG_TRAPPED will be cleared in 274662306a36Sopenharmony_ci * r5c_handle_data_cached() 274762306a36Sopenharmony_ci */ 274862306a36Sopenharmony_ci set_bit(STRIPE_LOG_TRAPPED, &sh->state); 274962306a36Sopenharmony_ci } 275062306a36Sopenharmony_ci 275162306a36Sopenharmony_ci return 0; 275262306a36Sopenharmony_ci} 275362306a36Sopenharmony_ci 275462306a36Sopenharmony_ci/* 275562306a36Sopenharmony_ci * free extra pages (orig_page) we allocated for prexor 275662306a36Sopenharmony_ci */ 275762306a36Sopenharmony_civoid r5c_release_extra_page(struct stripe_head *sh) 275862306a36Sopenharmony_ci{ 275962306a36Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 276062306a36Sopenharmony_ci int i; 276162306a36Sopenharmony_ci bool using_disk_info_extra_page; 276262306a36Sopenharmony_ci 276362306a36Sopenharmony_ci using_disk_info_extra_page = 276462306a36Sopenharmony_ci sh->dev[0].orig_page == conf->disks[0].extra_page; 276562306a36Sopenharmony_ci 276662306a36Sopenharmony_ci for (i = sh->disks; i--; ) 276762306a36Sopenharmony_ci if (sh->dev[i].page != sh->dev[i].orig_page) { 276862306a36Sopenharmony_ci struct page *p = sh->dev[i].orig_page; 276962306a36Sopenharmony_ci 277062306a36Sopenharmony_ci sh->dev[i].orig_page = sh->dev[i].page; 277162306a36Sopenharmony_ci clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 277262306a36Sopenharmony_ci 277362306a36Sopenharmony_ci if (!using_disk_info_extra_page) 277462306a36Sopenharmony_ci put_page(p); 277562306a36Sopenharmony_ci } 277662306a36Sopenharmony_ci 277762306a36Sopenharmony_ci if (using_disk_info_extra_page) { 277862306a36Sopenharmony_ci clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 277962306a36Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 278062306a36Sopenharmony_ci } 278162306a36Sopenharmony_ci} 278262306a36Sopenharmony_ci 278362306a36Sopenharmony_civoid r5c_use_extra_page(struct stripe_head *sh) 278462306a36Sopenharmony_ci{ 278562306a36Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 
278662306a36Sopenharmony_ci int i; 278762306a36Sopenharmony_ci struct r5dev *dev; 278862306a36Sopenharmony_ci 278962306a36Sopenharmony_ci for (i = sh->disks; i--; ) { 279062306a36Sopenharmony_ci dev = &sh->dev[i]; 279162306a36Sopenharmony_ci if (dev->orig_page != dev->page) 279262306a36Sopenharmony_ci put_page(dev->orig_page); 279362306a36Sopenharmony_ci dev->orig_page = conf->disks[i].extra_page; 279462306a36Sopenharmony_ci } 279562306a36Sopenharmony_ci} 279662306a36Sopenharmony_ci 279762306a36Sopenharmony_ci/* 279862306a36Sopenharmony_ci * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 279962306a36Sopenharmony_ci * stripe is committed to RAID disks. 280062306a36Sopenharmony_ci */ 280162306a36Sopenharmony_civoid r5c_finish_stripe_write_out(struct r5conf *conf, 280262306a36Sopenharmony_ci struct stripe_head *sh, 280362306a36Sopenharmony_ci struct stripe_head_state *s) 280462306a36Sopenharmony_ci{ 280562306a36Sopenharmony_ci struct r5l_log *log = conf->log; 280662306a36Sopenharmony_ci int i; 280762306a36Sopenharmony_ci int do_wakeup = 0; 280862306a36Sopenharmony_ci sector_t tree_index; 280962306a36Sopenharmony_ci void __rcu **pslot; 281062306a36Sopenharmony_ci uintptr_t refcount; 281162306a36Sopenharmony_ci 281262306a36Sopenharmony_ci if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 281362306a36Sopenharmony_ci return; 281462306a36Sopenharmony_ci 281562306a36Sopenharmony_ci WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 281662306a36Sopenharmony_ci clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 281762306a36Sopenharmony_ci 281862306a36Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 281962306a36Sopenharmony_ci return; 282062306a36Sopenharmony_ci 282162306a36Sopenharmony_ci for (i = sh->disks; i--; ) { 282262306a36Sopenharmony_ci clear_bit(R5_InJournal, &sh->dev[i].flags); 282362306a36Sopenharmony_ci if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 282462306a36Sopenharmony_ci do_wakeup 
= 1; 282562306a36Sopenharmony_ci } 282662306a36Sopenharmony_ci 282762306a36Sopenharmony_ci /* 282862306a36Sopenharmony_ci * analyse_stripe() runs before r5c_finish_stripe_write_out(), 282962306a36Sopenharmony_ci * We updated R5_InJournal, so we also update s->injournal. 283062306a36Sopenharmony_ci */ 283162306a36Sopenharmony_ci s->injournal = 0; 283262306a36Sopenharmony_ci 283362306a36Sopenharmony_ci if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 283462306a36Sopenharmony_ci if (atomic_dec_and_test(&conf->pending_full_writes)) 283562306a36Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 283662306a36Sopenharmony_ci 283762306a36Sopenharmony_ci if (do_wakeup) 283862306a36Sopenharmony_ci wake_up(&conf->wait_for_overlap); 283962306a36Sopenharmony_ci 284062306a36Sopenharmony_ci spin_lock_irq(&log->stripe_in_journal_lock); 284162306a36Sopenharmony_ci list_del_init(&sh->r5c); 284262306a36Sopenharmony_ci spin_unlock_irq(&log->stripe_in_journal_lock); 284362306a36Sopenharmony_ci sh->log_start = MaxSector; 284462306a36Sopenharmony_ci 284562306a36Sopenharmony_ci atomic_dec(&log->stripe_in_journal_count); 284662306a36Sopenharmony_ci r5c_update_log_state(log); 284762306a36Sopenharmony_ci 284862306a36Sopenharmony_ci /* stop counting this stripe in big_stripe_tree */ 284962306a36Sopenharmony_ci if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || 285062306a36Sopenharmony_ci test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 285162306a36Sopenharmony_ci tree_index = r5c_tree_index(conf, sh->sector); 285262306a36Sopenharmony_ci spin_lock(&log->tree_lock); 285362306a36Sopenharmony_ci pslot = radix_tree_lookup_slot(&log->big_stripe_tree, 285462306a36Sopenharmony_ci tree_index); 285562306a36Sopenharmony_ci BUG_ON(pslot == NULL); 285662306a36Sopenharmony_ci refcount = (uintptr_t)radix_tree_deref_slot_protected( 285762306a36Sopenharmony_ci pslot, &log->tree_lock) >> 285862306a36Sopenharmony_ci R5C_RADIX_COUNT_SHIFT; 285962306a36Sopenharmony_ci if (refcount == 1) 
286062306a36Sopenharmony_ci radix_tree_delete(&log->big_stripe_tree, tree_index); 286162306a36Sopenharmony_ci else 286262306a36Sopenharmony_ci radix_tree_replace_slot( 286362306a36Sopenharmony_ci &log->big_stripe_tree, pslot, 286462306a36Sopenharmony_ci (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); 286562306a36Sopenharmony_ci spin_unlock(&log->tree_lock); 286662306a36Sopenharmony_ci } 286762306a36Sopenharmony_ci 286862306a36Sopenharmony_ci if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { 286962306a36Sopenharmony_ci BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); 287062306a36Sopenharmony_ci atomic_dec(&conf->r5c_flushing_partial_stripes); 287162306a36Sopenharmony_ci atomic_dec(&conf->r5c_cached_partial_stripes); 287262306a36Sopenharmony_ci } 287362306a36Sopenharmony_ci 287462306a36Sopenharmony_ci if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 287562306a36Sopenharmony_ci BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); 287662306a36Sopenharmony_ci atomic_dec(&conf->r5c_flushing_full_stripes); 287762306a36Sopenharmony_ci atomic_dec(&conf->r5c_cached_full_stripes); 287862306a36Sopenharmony_ci } 287962306a36Sopenharmony_ci 288062306a36Sopenharmony_ci r5l_append_flush_payload(log, sh->sector); 288162306a36Sopenharmony_ci /* stripe is flused to raid disks, we can do resync now */ 288262306a36Sopenharmony_ci if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) 288362306a36Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 288462306a36Sopenharmony_ci} 288562306a36Sopenharmony_ci 288662306a36Sopenharmony_ciint r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) 288762306a36Sopenharmony_ci{ 288862306a36Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 288962306a36Sopenharmony_ci int pages = 0; 289062306a36Sopenharmony_ci int reserve; 289162306a36Sopenharmony_ci int i; 289262306a36Sopenharmony_ci int ret = 0; 289362306a36Sopenharmony_ci 289462306a36Sopenharmony_ci BUG_ON(!log); 289562306a36Sopenharmony_ci 
289662306a36Sopenharmony_ci for (i = 0; i < sh->disks; i++) { 289762306a36Sopenharmony_ci void *addr; 289862306a36Sopenharmony_ci 289962306a36Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) 290062306a36Sopenharmony_ci continue; 290162306a36Sopenharmony_ci addr = kmap_atomic(sh->dev[i].page); 290262306a36Sopenharmony_ci sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, 290362306a36Sopenharmony_ci addr, PAGE_SIZE); 290462306a36Sopenharmony_ci kunmap_atomic(addr); 290562306a36Sopenharmony_ci pages++; 290662306a36Sopenharmony_ci } 290762306a36Sopenharmony_ci WARN_ON(pages == 0); 290862306a36Sopenharmony_ci 290962306a36Sopenharmony_ci /* 291062306a36Sopenharmony_ci * The stripe must enter state machine again to call endio, so 291162306a36Sopenharmony_ci * don't delay. 291262306a36Sopenharmony_ci */ 291362306a36Sopenharmony_ci clear_bit(STRIPE_DELAYED, &sh->state); 291462306a36Sopenharmony_ci atomic_inc(&sh->count); 291562306a36Sopenharmony_ci 291662306a36Sopenharmony_ci mutex_lock(&log->io_mutex); 291762306a36Sopenharmony_ci /* meta + data */ 291862306a36Sopenharmony_ci reserve = (1 + pages) << (PAGE_SHIFT - 9); 291962306a36Sopenharmony_ci 292062306a36Sopenharmony_ci if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 292162306a36Sopenharmony_ci sh->log_start == MaxSector) 292262306a36Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 292362306a36Sopenharmony_ci else if (!r5l_has_free_space(log, reserve)) { 292462306a36Sopenharmony_ci if (sh->log_start == log->last_checkpoint) 292562306a36Sopenharmony_ci BUG(); 292662306a36Sopenharmony_ci else 292762306a36Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 292862306a36Sopenharmony_ci } else { 292962306a36Sopenharmony_ci ret = r5l_log_stripe(log, sh, pages, 0); 293062306a36Sopenharmony_ci if (ret) { 293162306a36Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 293262306a36Sopenharmony_ci list_add_tail(&sh->log_list, &log->no_mem_stripes); 293362306a36Sopenharmony_ci 
spin_unlock_irq(&log->io_list_lock); 293462306a36Sopenharmony_ci } 293562306a36Sopenharmony_ci } 293662306a36Sopenharmony_ci 293762306a36Sopenharmony_ci mutex_unlock(&log->io_mutex); 293862306a36Sopenharmony_ci return 0; 293962306a36Sopenharmony_ci} 294062306a36Sopenharmony_ci 294162306a36Sopenharmony_ci/* check whether this big stripe is in write back cache. */ 294262306a36Sopenharmony_cibool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) 294362306a36Sopenharmony_ci{ 294462306a36Sopenharmony_ci struct r5l_log *log = conf->log; 294562306a36Sopenharmony_ci sector_t tree_index; 294662306a36Sopenharmony_ci void *slot; 294762306a36Sopenharmony_ci 294862306a36Sopenharmony_ci if (!log) 294962306a36Sopenharmony_ci return false; 295062306a36Sopenharmony_ci 295162306a36Sopenharmony_ci WARN_ON_ONCE(!rcu_read_lock_held()); 295262306a36Sopenharmony_ci tree_index = r5c_tree_index(conf, sect); 295362306a36Sopenharmony_ci slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); 295462306a36Sopenharmony_ci return slot != NULL; 295562306a36Sopenharmony_ci} 295662306a36Sopenharmony_ci 295762306a36Sopenharmony_cistatic int r5l_load_log(struct r5l_log *log) 295862306a36Sopenharmony_ci{ 295962306a36Sopenharmony_ci struct md_rdev *rdev = log->rdev; 296062306a36Sopenharmony_ci struct page *page; 296162306a36Sopenharmony_ci struct r5l_meta_block *mb; 296262306a36Sopenharmony_ci sector_t cp = log->rdev->journal_tail; 296362306a36Sopenharmony_ci u32 stored_crc, expected_crc; 296462306a36Sopenharmony_ci bool create_super = false; 296562306a36Sopenharmony_ci int ret = 0; 296662306a36Sopenharmony_ci 296762306a36Sopenharmony_ci /* Make sure it's valid */ 296862306a36Sopenharmony_ci if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 296962306a36Sopenharmony_ci cp = 0; 297062306a36Sopenharmony_ci page = alloc_page(GFP_KERNEL); 297162306a36Sopenharmony_ci if (!page) 297262306a36Sopenharmony_ci return -ENOMEM; 297362306a36Sopenharmony_ci 297462306a36Sopenharmony_ci if 
(!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, false)) { 297562306a36Sopenharmony_ci ret = -EIO; 297662306a36Sopenharmony_ci goto ioerr; 297762306a36Sopenharmony_ci } 297862306a36Sopenharmony_ci mb = page_address(page); 297962306a36Sopenharmony_ci 298062306a36Sopenharmony_ci if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 298162306a36Sopenharmony_ci mb->version != R5LOG_VERSION) { 298262306a36Sopenharmony_ci create_super = true; 298362306a36Sopenharmony_ci goto create; 298462306a36Sopenharmony_ci } 298562306a36Sopenharmony_ci stored_crc = le32_to_cpu(mb->checksum); 298662306a36Sopenharmony_ci mb->checksum = 0; 298762306a36Sopenharmony_ci expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 298862306a36Sopenharmony_ci if (stored_crc != expected_crc) { 298962306a36Sopenharmony_ci create_super = true; 299062306a36Sopenharmony_ci goto create; 299162306a36Sopenharmony_ci } 299262306a36Sopenharmony_ci if (le64_to_cpu(mb->position) != cp) { 299362306a36Sopenharmony_ci create_super = true; 299462306a36Sopenharmony_ci goto create; 299562306a36Sopenharmony_ci } 299662306a36Sopenharmony_cicreate: 299762306a36Sopenharmony_ci if (create_super) { 299862306a36Sopenharmony_ci log->last_cp_seq = get_random_u32(); 299962306a36Sopenharmony_ci cp = 0; 300062306a36Sopenharmony_ci r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 300162306a36Sopenharmony_ci /* 300262306a36Sopenharmony_ci * Make sure super points to correct address. Log might have 300362306a36Sopenharmony_ci * data very soon. 
If super hasn't correct log tail address, 300462306a36Sopenharmony_ci * recovery can't find the log 300562306a36Sopenharmony_ci */ 300662306a36Sopenharmony_ci r5l_write_super(log, cp); 300762306a36Sopenharmony_ci } else 300862306a36Sopenharmony_ci log->last_cp_seq = le64_to_cpu(mb->seq); 300962306a36Sopenharmony_ci 301062306a36Sopenharmony_ci log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 301162306a36Sopenharmony_ci log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 301262306a36Sopenharmony_ci if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 301362306a36Sopenharmony_ci log->max_free_space = RECLAIM_MAX_FREE_SPACE; 301462306a36Sopenharmony_ci log->last_checkpoint = cp; 301562306a36Sopenharmony_ci 301662306a36Sopenharmony_ci __free_page(page); 301762306a36Sopenharmony_ci 301862306a36Sopenharmony_ci if (create_super) { 301962306a36Sopenharmony_ci log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); 302062306a36Sopenharmony_ci log->seq = log->last_cp_seq + 1; 302162306a36Sopenharmony_ci log->next_checkpoint = cp; 302262306a36Sopenharmony_ci } else 302362306a36Sopenharmony_ci ret = r5l_recovery_log(log); 302462306a36Sopenharmony_ci 302562306a36Sopenharmony_ci r5c_update_log_state(log); 302662306a36Sopenharmony_ci return ret; 302762306a36Sopenharmony_ciioerr: 302862306a36Sopenharmony_ci __free_page(page); 302962306a36Sopenharmony_ci return ret; 303062306a36Sopenharmony_ci} 303162306a36Sopenharmony_ci 303262306a36Sopenharmony_ciint r5l_start(struct r5l_log *log) 303362306a36Sopenharmony_ci{ 303462306a36Sopenharmony_ci int ret; 303562306a36Sopenharmony_ci 303662306a36Sopenharmony_ci if (!log) 303762306a36Sopenharmony_ci return 0; 303862306a36Sopenharmony_ci 303962306a36Sopenharmony_ci ret = r5l_load_log(log); 304062306a36Sopenharmony_ci if (ret) { 304162306a36Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 304262306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 304362306a36Sopenharmony_ci 304462306a36Sopenharmony_ci 
r5l_exit_log(conf); 304562306a36Sopenharmony_ci } 304662306a36Sopenharmony_ci return ret; 304762306a36Sopenharmony_ci} 304862306a36Sopenharmony_ci 304962306a36Sopenharmony_civoid r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) 305062306a36Sopenharmony_ci{ 305162306a36Sopenharmony_ci struct r5conf *conf = mddev->private; 305262306a36Sopenharmony_ci struct r5l_log *log = conf->log; 305362306a36Sopenharmony_ci 305462306a36Sopenharmony_ci if (!log) 305562306a36Sopenharmony_ci return; 305662306a36Sopenharmony_ci 305762306a36Sopenharmony_ci if ((raid5_calc_degraded(conf) > 0 || 305862306a36Sopenharmony_ci test_bit(Journal, &rdev->flags)) && 305962306a36Sopenharmony_ci conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) 306062306a36Sopenharmony_ci schedule_work(&log->disable_writeback_work); 306162306a36Sopenharmony_ci} 306262306a36Sopenharmony_ci 306362306a36Sopenharmony_ciint r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 306462306a36Sopenharmony_ci{ 306562306a36Sopenharmony_ci struct r5l_log *log; 306662306a36Sopenharmony_ci struct md_thread *thread; 306762306a36Sopenharmony_ci int ret; 306862306a36Sopenharmony_ci 306962306a36Sopenharmony_ci pr_debug("md/raid:%s: using device %pg as journal\n", 307062306a36Sopenharmony_ci mdname(conf->mddev), rdev->bdev); 307162306a36Sopenharmony_ci 307262306a36Sopenharmony_ci if (PAGE_SIZE != 4096) 307362306a36Sopenharmony_ci return -EINVAL; 307462306a36Sopenharmony_ci 307562306a36Sopenharmony_ci /* 307662306a36Sopenharmony_ci * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 307762306a36Sopenharmony_ci * raid_disks r5l_payload_data_parity. 
307862306a36Sopenharmony_ci * 307962306a36Sopenharmony_ci * Write journal and cache does not work for very big array 308062306a36Sopenharmony_ci * (raid_disks > 203) 308162306a36Sopenharmony_ci */ 308262306a36Sopenharmony_ci if (sizeof(struct r5l_meta_block) + 308362306a36Sopenharmony_ci ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 308462306a36Sopenharmony_ci conf->raid_disks) > PAGE_SIZE) { 308562306a36Sopenharmony_ci pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 308662306a36Sopenharmony_ci mdname(conf->mddev), conf->raid_disks); 308762306a36Sopenharmony_ci return -EINVAL; 308862306a36Sopenharmony_ci } 308962306a36Sopenharmony_ci 309062306a36Sopenharmony_ci log = kzalloc(sizeof(*log), GFP_KERNEL); 309162306a36Sopenharmony_ci if (!log) 309262306a36Sopenharmony_ci return -ENOMEM; 309362306a36Sopenharmony_ci log->rdev = rdev; 309462306a36Sopenharmony_ci log->need_cache_flush = bdev_write_cache(rdev->bdev); 309562306a36Sopenharmony_ci log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 309662306a36Sopenharmony_ci sizeof(rdev->mddev->uuid)); 309762306a36Sopenharmony_ci 309862306a36Sopenharmony_ci mutex_init(&log->io_mutex); 309962306a36Sopenharmony_ci 310062306a36Sopenharmony_ci spin_lock_init(&log->io_list_lock); 310162306a36Sopenharmony_ci INIT_LIST_HEAD(&log->running_ios); 310262306a36Sopenharmony_ci INIT_LIST_HEAD(&log->io_end_ios); 310362306a36Sopenharmony_ci INIT_LIST_HEAD(&log->flushing_ios); 310462306a36Sopenharmony_ci INIT_LIST_HEAD(&log->finished_ios); 310562306a36Sopenharmony_ci 310662306a36Sopenharmony_ci log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 310762306a36Sopenharmony_ci if (!log->io_kc) 310862306a36Sopenharmony_ci goto io_kc; 310962306a36Sopenharmony_ci 311062306a36Sopenharmony_ci ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc); 311162306a36Sopenharmony_ci if (ret) 311262306a36Sopenharmony_ci goto io_pool; 311362306a36Sopenharmony_ci 311462306a36Sopenharmony_ci ret = 
bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); 311562306a36Sopenharmony_ci if (ret) 311662306a36Sopenharmony_ci goto io_bs; 311762306a36Sopenharmony_ci 311862306a36Sopenharmony_ci ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0); 311962306a36Sopenharmony_ci if (ret) 312062306a36Sopenharmony_ci goto out_mempool; 312162306a36Sopenharmony_ci 312262306a36Sopenharmony_ci spin_lock_init(&log->tree_lock); 312362306a36Sopenharmony_ci INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); 312462306a36Sopenharmony_ci 312562306a36Sopenharmony_ci thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, 312662306a36Sopenharmony_ci "reclaim"); 312762306a36Sopenharmony_ci if (!thread) 312862306a36Sopenharmony_ci goto reclaim_thread; 312962306a36Sopenharmony_ci 313062306a36Sopenharmony_ci thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 313162306a36Sopenharmony_ci rcu_assign_pointer(log->reclaim_thread, thread); 313262306a36Sopenharmony_ci 313362306a36Sopenharmony_ci init_waitqueue_head(&log->iounit_wait); 313462306a36Sopenharmony_ci 313562306a36Sopenharmony_ci INIT_LIST_HEAD(&log->no_mem_stripes); 313662306a36Sopenharmony_ci 313762306a36Sopenharmony_ci INIT_LIST_HEAD(&log->no_space_stripes); 313862306a36Sopenharmony_ci spin_lock_init(&log->no_space_stripes_lock); 313962306a36Sopenharmony_ci 314062306a36Sopenharmony_ci INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 314162306a36Sopenharmony_ci INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); 314262306a36Sopenharmony_ci 314362306a36Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 314462306a36Sopenharmony_ci INIT_LIST_HEAD(&log->stripe_in_journal_list); 314562306a36Sopenharmony_ci spin_lock_init(&log->stripe_in_journal_lock); 314662306a36Sopenharmony_ci atomic_set(&log->stripe_in_journal_count, 0); 314762306a36Sopenharmony_ci 314862306a36Sopenharmony_ci conf->log = log; 314962306a36Sopenharmony_ci 315062306a36Sopenharmony_ci 
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 315162306a36Sopenharmony_ci return 0; 315262306a36Sopenharmony_ci 315362306a36Sopenharmony_cireclaim_thread: 315462306a36Sopenharmony_ci mempool_exit(&log->meta_pool); 315562306a36Sopenharmony_ciout_mempool: 315662306a36Sopenharmony_ci bioset_exit(&log->bs); 315762306a36Sopenharmony_ciio_bs: 315862306a36Sopenharmony_ci mempool_exit(&log->io_pool); 315962306a36Sopenharmony_ciio_pool: 316062306a36Sopenharmony_ci kmem_cache_destroy(log->io_kc); 316162306a36Sopenharmony_ciio_kc: 316262306a36Sopenharmony_ci kfree(log); 316362306a36Sopenharmony_ci return -EINVAL; 316462306a36Sopenharmony_ci} 316562306a36Sopenharmony_ci 316662306a36Sopenharmony_civoid r5l_exit_log(struct r5conf *conf) 316762306a36Sopenharmony_ci{ 316862306a36Sopenharmony_ci struct r5l_log *log = conf->log; 316962306a36Sopenharmony_ci 317062306a36Sopenharmony_ci md_unregister_thread(conf->mddev, &log->reclaim_thread); 317162306a36Sopenharmony_ci 317262306a36Sopenharmony_ci /* 317362306a36Sopenharmony_ci * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to 317462306a36Sopenharmony_ci * ensure disable_writeback_work wakes up and exits. 317562306a36Sopenharmony_ci */ 317662306a36Sopenharmony_ci conf->log = NULL; 317762306a36Sopenharmony_ci wake_up(&conf->mddev->sb_wait); 317862306a36Sopenharmony_ci flush_work(&log->disable_writeback_work); 317962306a36Sopenharmony_ci 318062306a36Sopenharmony_ci mempool_exit(&log->meta_pool); 318162306a36Sopenharmony_ci bioset_exit(&log->bs); 318262306a36Sopenharmony_ci mempool_exit(&log->io_pool); 318362306a36Sopenharmony_ci kmem_cache_destroy(log->io_kc); 318462306a36Sopenharmony_ci kfree(log); 318562306a36Sopenharmony_ci} 3186