// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 Shaohua Li <shli@fb.com>
 * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
 */
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "md-bitmap.h"
#include "raid5-log.h"

/*
 * metadata/data are stored on disk in 4k-sized units (blocks), regardless
 * of the underlying hardware sector size. This only works with
 * PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)
#define BLOCK_SECTOR_SHIFT (3)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write through mode, the reclaim runs every log->max_free_space.
 * This prevents recovery from having to scan too much of the log.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/* start flush with these full stripes */
#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE	4

static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};
/*
 * raid5 cache state machine
 *
 * With the RAID cache, each stripe works in two phases:
 *	- caching phase
 *	- writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *	- write to log device
 *	- return IO
 *
 * Stripes in writing-out phase handle writes as:
 *	- calculate parity
 *	- write pending data and parity to journal
 *	- write data and parity to raid disks
 *	- return IO for pending writes
 */

struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t io_pool;
	struct bio_set bs;
	mempool_t meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed.
					 * If it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e. reclaim
					 * doesn't wait for a specific io_unit
					 * to switch to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write back in degraded mode */
	struct work_struct disable_writeback_work;

	/* for chunk_aligned_read in writeback mode, details below */
	spinlock_t tree_lock;
	struct radix_tree_root big_stripe_tree;
};

/*
 * Enable chunk_aligned_read() with write back cache.
 *
 * Each chunk may contain more than one stripe (for example, a 256kB
 * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
 * chunk_aligned_read, these stripes are grouped into one "big_stripe".
 * For each big_stripe, we count how many stripes of this big_stripe
 * are in the write back cache. These data are tracked in a radix tree
 * (big_stripe_tree). We use the radix_tree item pointer as the counter.
 * r5c_tree_index() is used to calculate keys for the radix tree.
 *
 * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
 * big_stripe of each chunk in the tree. If this big_stripe is in the
 * tree, chunk_aligned_read() aborts. This look up is protected by
 * rcu_read_lock().
 *
 * It is necessary to remember whether a stripe is counted in
 * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
 * two flags is set, the stripe is counted in big_stripe_tree.
 This
 * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
 * r5c_try_caching_write(); and moving clear_bit of
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
 * r5c_finish_stripe_write_out().
 */

/*
 * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00.
 * So it is necessary to left shift the counter by 2 bits before using it
 * as the data pointer of the tree.
 */
#define R5C_RADIX_COUNT_SHIFT 2

/*
 * calculate key for big_stripe_tree
 *
 * sect: align_bi->bi_iter.bi_sector or sh->sector
 */
static inline sector_t r5c_tree_index(struct r5conf *conf,
				      sector_t sect)
{
	sector_div(sect, conf->chunk_sectors);
	return sect;
}

/*
 * An IO range starts from a meta data block and ends at the next meta data
 * block. The io unit's meta data block tracks the data/parity that follows
 * it. The io unit is written to the log disk with a normal write; as we
 * always flush the log disk first and then start moving data to the raid
 * disks, there is no requirement to write the io unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;
	bool need_split_bio;
	struct bio *split_bio;

	unsigned int has_flush:1;		/* include flush request */
	unsigned int has_fua:1;			/* include fua request */
	unsigned int has_null_flush:1;		/* include null flush request */
	unsigned int has_flush_payload:1;	/* include flush payload */
	/*
	 * io isn't sent yet, flush/fua request can only be submitted until
	 * it's the first IO in running_ios list
	 */
	unsigned int io_deferred:1;

	struct bio_list flush_barriers;   /* size == 0 flush bios */
};

/* r5l_io_unit state */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio started writing to log,
				 * don't accept new bio */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
};

bool r5c_is_writeback(struct r5l_log *log)
{
	return (log != NULL &&
		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
}

static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
	start += inc;
	if (start >= log->device_size)
		start = start - log->device_size;
	return start;
}

static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
				  sector_t end)
{
	if (end >= start)
		return end - start;
	else
		return end + log->device_size - start;
}

static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
	sector_t used_size;

	used_size = r5l_ring_distance(log, log->last_checkpoint,
				      log->log_start);

	return log->device_size > used_size + size;
}

static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
				    enum r5l_io_unit_state state)
{
	if (WARN_ON(io->state >= state))
		return;
	io->state = state;
}

static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
{
	struct bio *wbi, *wbi2;

	wbi = dev->written;
	dev->written = NULL;
	while (wbi && wbi->bi_iter.bi_sector <
	       dev->sector + RAID5_STRIPE_SECTORS(conf)) {
		wbi2 = r5_next_bio(conf, wbi, dev->sector);
		md_write_end(conf->mddev);
		bio_endio(wbi);
		wbi = wbi2;
	}
}

void r5c_handle_cached_data_endio(struct r5conf *conf,
				  struct stripe_head *sh, int disks)
{
	int i;

	for (i = sh->disks; i--; ) {
		if (sh->dev[i].written) {
			set_bit(R5_UPTODATE, &sh->dev[i].flags);
			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
					   RAID5_STRIPE_SECTORS(conf),
					   !test_bit(STRIPE_DEGRADED, &sh->state),
					   0);
		}
	}
}

void r5l_wake_reclaim(struct r5l_log *log, sector_t space);

/* Check whether we should flush some stripes to free up stripe cache */
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
	int total_cached;

	if (!r5c_is_writeback(conf->log))
		return;

	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
		atomic_read(&conf->r5c_cached_full_stripes);

	/*
	 * The following condition is true for either of the following:
	 * - stripe cache pressure high:
	 *   total_cached > 3/4 min_nr_stripes ||
	 *   empty_inactive_list_nr > 0
	 * - stripe cache pressure moderate:
	 *   total_cached > 1/2 min_nr_stripes
	 */
	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
	    atomic_read(&conf->empty_inactive_list_nr) > 0)
		r5l_wake_reclaim(conf->log, 0);
}

/*
 * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
 * stripes in the cache
 */
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
	if (!r5c_is_writeback(conf->log))
		return;

	/*
	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
	 * or a full stripe (chunk size / 4k stripes).
	 */
	if (atomic_read(&conf->r5c_cached_full_stripes) >=
	    min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
		conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
		r5l_wake_reclaim(conf->log, 0);
}

/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * To avoid deadlock due to log space, it is necessary to reserve log
 * space to flush critical stripes (stripes that occupy log space near
 * last_checkpoint).
 This function helps check how much log space is
 * required to flush all cached stripes.
 *
 * To reduce log space requirements, two mechanisms are used to give cache
 * flush higher priority:
 * 1. In handle_stripe_dirtying() and schedule_reconstruction(),
 *    stripes ALREADY in journal can be flushed w/o pending writes;
 * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
 *    can be delayed (r5l_add_no_space_stripe).
 *
 * In cache flush, the stripe goes through 1 and then 2. For a stripe that
 * already passed 1, flushing it requires at most (conf->max_degraded + 1)
 * pages of journal space. For a stripe that has not passed 1, flushing it
 * requires (conf->raid_disks + 1) pages of journal space. There are at
 * most (conf->group_cnt + 1) stripes that passed 1. So the total journal
 * space required to flush all cached stripes (in pages) is:
 *
 *   (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
 *   (group_cnt + 1) * (raid_disks + 1)
 * or
 *   (stripe_in_journal_count) * (max_degraded + 1) +
 *   (group_cnt + 1) * (raid_disks - max_degraded)
 */
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;

	if (!r5c_is_writeback(log))
		return 0;

	return BLOCK_SECTORS *
		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
}

/*
 * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
 *
 * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
 * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
 * device is less than 2x of reclaim_required_space.
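 *
 * As a purely illustrative example (numbers are hypothetical): if
 * r5c_log_required_to_flush_cache() currently evaluates to 1000 sectors,
 * R5C_LOG_CRITICAL is set while free space on the log is below 2000
 * sectors and R5C_LOG_TIGHT while it is below 3000 sectors; the bits are
 * cleared again once free space grows past those thresholds.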
 */
static inline void r5c_update_log_state(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	sector_t free_space;
	sector_t reclaim_space;
	bool wake_reclaim = false;

	if (!r5c_is_writeback(log))
		return;

	free_space = r5l_ring_distance(log, log->log_start,
				       log->last_checkpoint);
	reclaim_space = r5c_log_required_to_flush_cache(conf);
	if (free_space < 2 * reclaim_space)
		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	else {
		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
			wake_reclaim = true;
		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
	}
	if (free_space < 3 * reclaim_space)
		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
	else
		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);

	if (wake_reclaim)
		r5l_wake_reclaim(log, 0);
}

/*
 * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
 * This function should only be called in write-back mode.
 */
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5l_log *log = conf->log;

	BUG_ON(!r5c_is_writeback(log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);

	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);
}

static void r5c_handle_data_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			set_bit(R5_InJournal, &sh->dev[i].flags);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
		}
	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
}

/*
 * this journal write must contain full parity,
 * it may also contain some data pages
 */
static void r5c_handle_parity_cached(struct stripe_head *sh)
{
	int i;

	for (i = sh->disks; i--; )
		if (test_bit(R5_InJournal, &sh->dev[i].flags))
			set_bit(R5_Wantwrite, &sh->dev[i].flags);
}

/*
 * Setting proper flags after writing (or flushing) data and/or parity to the
 * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
 */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		r5c_handle_data_cached(sh);
	} else {
		r5c_handle_parity_cached(sh);
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	}
}

static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		r5c_finish_cache_stripe(sh);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}

static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}

static void r5l_move_to_end_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;
		list_move_tail(&io->log_sibling, &log->io_end_ios);
	}
}

static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
static void r5l_log_endio(struct bio *bio)
{
	struct r5l_io_unit *io = bio->bi_private;
	struct r5l_io_unit *io_deferred;
	struct r5l_log *log = io->log;
	unsigned long flags;
	bool has_null_flush;
	bool has_flush_payload;

	if (bio->bi_status)
		md_error(log->rdev->mddev, log->rdev);

	bio_put(bio);
	mempool_free(io->meta_page, &log->meta_pool);

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);

	/*
	 * if the io does not have null_flush or flush payload,
	 * it is not safe to access it after releasing io_list_lock.
	 * Therefore, it is necessary to check the condition with
	 * the lock held.
	 */
	has_null_flush = io->has_null_flush;
	has_flush_payload = io->has_flush_payload;

	if (log->need_cache_flush && !list_empty(&io->stripe_list))
		r5l_move_to_end_ios(log);
	else
		r5l_log_run_stripes(log);
	if (!list_empty(&log->running_ios)) {
		/*
		 * FLUSH/FUA io_unit is deferred because of ordering, now we
		 * can dispatch it
		 */
		io_deferred = list_first_entry(&log->running_ios,
					       struct r5l_io_unit, log_sibling);
		if (io_deferred->io_deferred)
			schedule_work(&log->deferred_io_work);
	}

	spin_unlock_irqrestore(&log->io_list_lock, flags);

	if (log->need_cache_flush)
		md_wakeup_thread(log->rdev->mddev->thread);

	/* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
	if (has_null_flush) {
		struct bio *bi;

		WARN_ON(bio_list_empty(&io->flush_barriers));
		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
			bio_endio(bi);
			if (atomic_dec_and_test(&io->pending_stripe)) {
				__r5l_stripe_write_finished(io);
				return;
			}
		}
	}
	/* decrease pending_stripe for flush payload */
	if (has_flush_payload)
		if (atomic_dec_and_test(&io->pending_stripe))
			__r5l_stripe_write_finished(io);
}

static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
{
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
	spin_unlock_irqrestore(&log->io_list_lock, flags);

	/*
	 * In case of journal device failures, submit_bio will get an error
	 * and call endio, then active stripes will continue the write
	 * process. Therefore, it is not necessary to check the Faulty bit
	 * of the journal device here.
	 *
	 * We can't check split_bio after current_bio is submitted. If
	 * io->split_bio is null, after current_bio is submitted, current_bio
	 * might already be completed and the io_unit is freed. We submit
	 * split_bio first to avoid the issue.
	 */
	if (io->split_bio) {
		if (io->has_flush)
			io->split_bio->bi_opf |= REQ_PREFLUSH;
		if (io->has_fua)
			io->split_bio->bi_opf |= REQ_FUA;
		submit_bio(io->split_bio);
	}

	if (io->has_flush)
		io->current_bio->bi_opf |= REQ_PREFLUSH;
	if (io->has_fua)
		io->current_bio->bi_opf |= REQ_FUA;
	submit_bio(io->current_bio);
}

/* deferred io_unit will be dispatched here */
static void r5l_submit_io_async(struct work_struct *work)
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   deferred_io_work);
	struct r5l_io_unit *io = NULL;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	if (!list_empty(&log->running_ios)) {
		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
				      log_sibling);
		if (!io->io_deferred)
			io = NULL;
		else
			io->io_deferred = 0;
	}
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	if (io)
		r5l_do_submit_io(log, io);
}

static void r5c_disable_writeback_async(struct work_struct *work)
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   disable_writeback_work);
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	int locked = 0;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;
	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
		mdname(mddev));

	/* wait superblock change before suspend */
	wait_event(mddev->sb_wait,
		   conf->log == NULL ||
		   (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
		    (locked = mddev_trylock(mddev))));
	if (locked) {
		mddev_suspend(mddev);
		log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
		mddev_resume(mddev);
		mddev_unlock(mddev);
	}
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_meta_block *block;
	unsigned long flags;
	u32 crc;
	bool do_submit = true;

	if (!io)
		return;

	block = page_address(io->meta_page);
	block->meta_size = cpu_to_le32(io->meta_offset);
	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
	block->checksum = cpu_to_le32(crc);

	log->current_io = NULL;
	spin_lock_irqsave(&log->io_list_lock, flags);
	if (io->has_flush || io->has_fua) {
		if (io != list_first_entry(&log->running_ios,
					   struct r5l_io_unit, log_sibling)) {
			io->io_deferred = 1;
			do_submit = false;
		}
	}
	spin_unlock_irqrestore(&log->io_list_lock, flags);
	if (do_submit)
		r5l_do_submit_io(log, io);
}

static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);

	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
	bio_set_dev(bio, log->rdev->bdev);
	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;

	return bio;
}

static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);

	r5c_update_log_state(log);
	/*
	 * If we filled up the log device, start from the beginning again,
	 * which will require a new bio.
	 *
	 * Note: for this to work properly the log size needs to be a multiple
	 * of BLOCK_SECTORS.
	 */
	if (log->log_start == 0)
		io->need_split_bio = true;

	io->log_end = log->log_start;
}

static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
	struct r5l_io_unit *io;
	struct r5l_meta_block *block;

	io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
	if (!io)
		return NULL;
	memset(io, 0, sizeof(*io));

	io->log = log;
	INIT_LIST_HEAD(&io->log_sibling);
	INIT_LIST_HEAD(&io->stripe_list);
	bio_list_init(&io->flush_barriers);
	io->state = IO_UNIT_RUNNING;

	io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
	block = page_address(io->meta_page);
	clear_page(block);
	block->magic = cpu_to_le32(R5LOG_MAGIC);
	block->version = R5LOG_VERSION;
	block->seq = cpu_to_le64(log->seq);
	block->position = cpu_to_le64(log->log_start);

	io->log_start = log->log_start;
	io->meta_offset = sizeof(struct r5l_meta_block);
	io->seq = log->seq++;

	io->current_bio = r5l_bio_alloc(log);
	io->current_bio->bi_end_io = r5l_log_endio;
	io->current_bio->bi_private = io;
	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);

	r5_reserve_log_entry(log, io);

	spin_lock_irq(&log->io_list_lock);
	list_add_tail(&io->log_sibling, &log->running_ios);
	spin_unlock_irq(&log->io_list_lock);

	return io;
}

static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
	if (log->current_io &&
	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
		r5l_submit_current_io(log);

	if (!log->current_io) {
		log->current_io = r5l_new_meta(log);
		if (!log->current_io)
			return -ENOMEM;
	}

	return 0;
}
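
/*
 * Illustrative note (restating what the helpers around here already do,
 * not new behaviour): a data page payload consumes
 * sizeof(struct r5l_payload_data_parity) + sizeof(__le32) bytes of the
 * meta page (one checksum per page), and a RAID6 parity payload consumes
 * sizeof(struct r5l_payload_data_parity) + 2 * sizeof(__le32) bytes (two
 * checksums). r5l_get_meta() above closes the current io_unit and opens
 * a new one whenever the next payload would push meta_offset past
 * PAGE_SIZE.
 */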

static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
				    sector_t location,
				    u32 checksum1, u32 checksum2,
				    bool checksum2_valid)
{
	struct r5l_io_unit *io = log->current_io;
	struct r5l_payload_data_parity *payload;

	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(type);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
				    (PAGE_SHIFT - 9));
	payload->location = cpu_to_le64(location);
	payload->checksum[0] = cpu_to_le32(checksum1);
	if (checksum2_valid)
		payload->checksum[1] = cpu_to_le32(checksum2);

	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * (1 + !!checksum2_valid);
}

static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
	struct r5l_io_unit *io = log->current_io;

	if (io->need_split_bio) {
		BUG_ON(io->split_bio);
		io->split_bio = io->current_bio;
		io->current_bio = r5l_bio_alloc(log);
		bio_chain(io->current_bio, io->split_bio);
		io->need_split_bio = false;
	}

	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
		BUG();

	r5_reserve_log_entry(log, io);
}

static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_io_unit *io;
	struct r5l_payload_flush *payload;
	int meta_size;

	/*
	 * payload_flush requires extra writes to the journal.
	 * To avoid handling the extra IO in quiesce, just skip
	 * flush_payload
	 */
	if (conf->quiesce)
		return;

	mutex_lock(&log->io_mutex);
	meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);

	if (r5l_get_meta(log, meta_size)) {
		mutex_unlock(&log->io_mutex);
		return;
	}

	/* current implementation is one stripe per flush payload */
	io = log->current_io;
	payload = page_address(io->meta_page) + io->meta_offset;
	payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
	payload->header.flags = cpu_to_le16(0);
	payload->size = cpu_to_le32(sizeof(__le64));
	payload->flush_stripes[0] = cpu_to_le64(sect);
	io->meta_offset += meta_size;
	/* multiple flush payloads count as one pending_stripe */
	if (!io->has_flush_payload) {
		io->has_flush_payload = 1;
		atomic_inc(&io->pending_stripe);
	}
	mutex_unlock(&log->io_mutex);
}

static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
			  int data_pages, int parity_pages)
{
	int i;
	int meta_size;
	int ret;
	struct r5l_io_unit *io;

	meta_size =
		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
		 * data_pages) +
		sizeof(struct r5l_payload_data_parity) +
		sizeof(__le32) * parity_pages;

	ret = r5l_get_meta(log, meta_size);
	if (ret)
		return ret;

	io = log->current_io;

	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
		io->has_flush = 1;

	for (i = 0; i < sh->disks; i++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;
		if (i == sh->pd_idx || i == sh->qd_idx)
			continue;
		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
			io->has_fua = 1;
			/*
			 * we need to flush journal to make sure recovery can
			 * reach the data with fua flag
			 */
			io->has_flush = 1;
		}
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
					raid5_compute_blocknr(sh, i, 0),
					sh->dev[i].log_checksum, 0, false);
		r5l_append_payload_page(log, sh->dev[i].page);
	}

	if (parity_pages == 2) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					sh->dev[sh->qd_idx].log_checksum, true);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
	} else if (parity_pages == 1) {
		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
					sh->sector, sh->dev[sh->pd_idx].log_checksum,
					0, false);
		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
	} else /* Just writing data, not parity, in caching phase */
		BUG_ON(parity_pages != 0);

	list_add_tail(&sh->log_list, &io->stripe_list);
	atomic_inc(&io->pending_stripe);
	sh->log_io = io;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return 0;

	if (sh->log_start == MaxSector) {
		BUG_ON(!list_empty(&sh->r5c));
		sh->log_start = io->log_start;
		spin_lock_irq(&log->stripe_in_journal_lock);
		list_add_tail(&sh->r5c,
			      &log->stripe_in_journal_list);
		spin_unlock_irq(&log->stripe_in_journal_lock);
		atomic_inc(&log->stripe_in_journal_count);
	}
	return 0;
}

/* add stripe to no_space_stripes, and then wake up reclaim */
static inline void r5l_add_no_space_stripe(struct r5l_log *log,
					   struct stripe_head *sh)
{
	spin_lock(&log->no_space_stripes_lock);
	list_add_tail(&sh->log_list, &log->no_space_stripes);
	spin_unlock(&log->no_space_stripes_lock);
}

/*
 * running in raid5d, where reclaim could wait for raid5d too (when it flushes
 * data from log to raid disks), so we shouldn't wait for reclaim here
 */
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int write_disks = 0;
	int data_pages, parity_pages;
	int reserve;
	int i;
	int ret = 0;
	bool wake_reclaim = false;

	if (!log)
		return -EAGAIN;
	/* Don't support stripe batch */
	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
	    test_bit(STRIPE_SYNCING, &sh->state)) {
		/* the stripe is written to log, we start writing it to raid */
		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
		return -EAGAIN;
	}

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
		    test_bit(R5_InJournal, &sh->dev[i].flags))
			continue;

		write_disks++;
		/* checksum is already calculated in last run */
		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
	}
	parity_pages = 1 + !!(sh->qd_idx >= 0);
	data_pages = write_disks - parity_pages;

	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	/*
	 * The stripe must enter state machine again to finish the write, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		if (!r5l_has_free_space(log, reserve)) {
			r5l_add_no_space_stripe(log, sh);
			wake_reclaim = true;
		} else {
			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
			if (ret) {
				spin_lock_irq(&log->io_list_lock);
				list_add_tail(&sh->log_list,
					      &log->no_mem_stripes);
				spin_unlock_irq(&log->io_list_lock);
			}
		}
	} else {	/* R5C_JOURNAL_MODE_WRITE_BACK */
		/*
		 * log space critical, do not process stripes that are
		 * not in cache yet (sh->log_start == MaxSector).
10628c2ecf20Sopenharmony_ci */ 10638c2ecf20Sopenharmony_ci if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 10648c2ecf20Sopenharmony_ci sh->log_start == MaxSector) { 10658c2ecf20Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 10668c2ecf20Sopenharmony_ci wake_reclaim = true; 10678c2ecf20Sopenharmony_ci reserve = 0; 10688c2ecf20Sopenharmony_ci } else if (!r5l_has_free_space(log, reserve)) { 10698c2ecf20Sopenharmony_ci if (sh->log_start == log->last_checkpoint) 10708c2ecf20Sopenharmony_ci BUG(); 10718c2ecf20Sopenharmony_ci else 10728c2ecf20Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 10738c2ecf20Sopenharmony_ci } else { 10748c2ecf20Sopenharmony_ci ret = r5l_log_stripe(log, sh, data_pages, parity_pages); 10758c2ecf20Sopenharmony_ci if (ret) { 10768c2ecf20Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 10778c2ecf20Sopenharmony_ci list_add_tail(&sh->log_list, 10788c2ecf20Sopenharmony_ci &log->no_mem_stripes); 10798c2ecf20Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 10808c2ecf20Sopenharmony_ci } 10818c2ecf20Sopenharmony_ci } 10828c2ecf20Sopenharmony_ci } 10838c2ecf20Sopenharmony_ci 10848c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 10858c2ecf20Sopenharmony_ci if (wake_reclaim) 10868c2ecf20Sopenharmony_ci r5l_wake_reclaim(log, reserve); 10878c2ecf20Sopenharmony_ci return 0; 10888c2ecf20Sopenharmony_ci} 10898c2ecf20Sopenharmony_ci 10908c2ecf20Sopenharmony_civoid r5l_write_stripe_run(struct r5l_log *log) 10918c2ecf20Sopenharmony_ci{ 10928c2ecf20Sopenharmony_ci if (!log) 10938c2ecf20Sopenharmony_ci return; 10948c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 10958c2ecf20Sopenharmony_ci r5l_submit_current_io(log); 10968c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 10978c2ecf20Sopenharmony_ci} 10988c2ecf20Sopenharmony_ci 10998c2ecf20Sopenharmony_ciint r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) 11008c2ecf20Sopenharmony_ci{ 11018c2ecf20Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { 11028c2ecf20Sopenharmony_ci /* 11038c2ecf20Sopenharmony_ci * in write through (journal only) 11048c2ecf20Sopenharmony_ci * we flush log disk cache first, then write stripe data to 11058c2ecf20Sopenharmony_ci * raid disks. So if bio is finished, the log disk cache is 11068c2ecf20Sopenharmony_ci * flushed already. 
The recovery guarantees we can recovery 11078c2ecf20Sopenharmony_ci * the bio from log disk, so we don't need to flush again 11088c2ecf20Sopenharmony_ci */ 11098c2ecf20Sopenharmony_ci if (bio->bi_iter.bi_size == 0) { 11108c2ecf20Sopenharmony_ci bio_endio(bio); 11118c2ecf20Sopenharmony_ci return 0; 11128c2ecf20Sopenharmony_ci } 11138c2ecf20Sopenharmony_ci bio->bi_opf &= ~REQ_PREFLUSH; 11148c2ecf20Sopenharmony_ci } else { 11158c2ecf20Sopenharmony_ci /* write back (with cache) */ 11168c2ecf20Sopenharmony_ci if (bio->bi_iter.bi_size == 0) { 11178c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 11188c2ecf20Sopenharmony_ci r5l_get_meta(log, 0); 11198c2ecf20Sopenharmony_ci bio_list_add(&log->current_io->flush_barriers, bio); 11208c2ecf20Sopenharmony_ci log->current_io->has_flush = 1; 11218c2ecf20Sopenharmony_ci log->current_io->has_null_flush = 1; 11228c2ecf20Sopenharmony_ci atomic_inc(&log->current_io->pending_stripe); 11238c2ecf20Sopenharmony_ci r5l_submit_current_io(log); 11248c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 11258c2ecf20Sopenharmony_ci return 0; 11268c2ecf20Sopenharmony_ci } 11278c2ecf20Sopenharmony_ci } 11288c2ecf20Sopenharmony_ci return -EAGAIN; 11298c2ecf20Sopenharmony_ci} 11308c2ecf20Sopenharmony_ci 11318c2ecf20Sopenharmony_ci/* This will run after log space is reclaimed */ 11328c2ecf20Sopenharmony_cistatic void r5l_run_no_space_stripes(struct r5l_log *log) 11338c2ecf20Sopenharmony_ci{ 11348c2ecf20Sopenharmony_ci struct stripe_head *sh; 11358c2ecf20Sopenharmony_ci 11368c2ecf20Sopenharmony_ci spin_lock(&log->no_space_stripes_lock); 11378c2ecf20Sopenharmony_ci while (!list_empty(&log->no_space_stripes)) { 11388c2ecf20Sopenharmony_ci sh = list_first_entry(&log->no_space_stripes, 11398c2ecf20Sopenharmony_ci struct stripe_head, log_list); 11408c2ecf20Sopenharmony_ci list_del_init(&sh->log_list); 11418c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 11428c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 11438c2ecf20Sopenharmony_ci } 11448c2ecf20Sopenharmony_ci spin_unlock(&log->no_space_stripes_lock); 11458c2ecf20Sopenharmony_ci} 11468c2ecf20Sopenharmony_ci 11478c2ecf20Sopenharmony_ci/* 11488c2ecf20Sopenharmony_ci * calculate new last_checkpoint 11498c2ecf20Sopenharmony_ci * for write through mode, returns log->next_checkpoint 11508c2ecf20Sopenharmony_ci * for write back, returns log_start of first sh in stripe_in_journal_list 11518c2ecf20Sopenharmony_ci */ 11528c2ecf20Sopenharmony_cistatic sector_t r5c_calculate_new_cp(struct r5conf *conf) 11538c2ecf20Sopenharmony_ci{ 11548c2ecf20Sopenharmony_ci struct stripe_head *sh; 11558c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 11568c2ecf20Sopenharmony_ci sector_t new_cp; 11578c2ecf20Sopenharmony_ci unsigned long flags; 11588c2ecf20Sopenharmony_ci 11598c2ecf20Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 11608c2ecf20Sopenharmony_ci return log->next_checkpoint; 11618c2ecf20Sopenharmony_ci 11628c2ecf20Sopenharmony_ci spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 11638c2ecf20Sopenharmony_ci if (list_empty(&conf->log->stripe_in_journal_list)) { 11648c2ecf20Sopenharmony_ci /* all stripes flushed */ 11658c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 11668c2ecf20Sopenharmony_ci return log->next_checkpoint; 11678c2ecf20Sopenharmony_ci } 11688c2ecf20Sopenharmony_ci sh = list_first_entry(&conf->log->stripe_in_journal_list, 11698c2ecf20Sopenharmony_ci struct stripe_head, r5c); 11708c2ecf20Sopenharmony_ci new_cp = sh->log_start; 
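	/*
	 * The list is appended to in journal order, so the first entry is the
	 * oldest log_start still in use and bounds the new checkpoint.
	 */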
11718c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 11728c2ecf20Sopenharmony_ci return new_cp; 11738c2ecf20Sopenharmony_ci} 11748c2ecf20Sopenharmony_ci 11758c2ecf20Sopenharmony_cistatic sector_t r5l_reclaimable_space(struct r5l_log *log) 11768c2ecf20Sopenharmony_ci{ 11778c2ecf20Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 11788c2ecf20Sopenharmony_ci 11798c2ecf20Sopenharmony_ci return r5l_ring_distance(log, log->last_checkpoint, 11808c2ecf20Sopenharmony_ci r5c_calculate_new_cp(conf)); 11818c2ecf20Sopenharmony_ci} 11828c2ecf20Sopenharmony_ci 11838c2ecf20Sopenharmony_cistatic void r5l_run_no_mem_stripe(struct r5l_log *log) 11848c2ecf20Sopenharmony_ci{ 11858c2ecf20Sopenharmony_ci struct stripe_head *sh; 11868c2ecf20Sopenharmony_ci 11878c2ecf20Sopenharmony_ci lockdep_assert_held(&log->io_list_lock); 11888c2ecf20Sopenharmony_ci 11898c2ecf20Sopenharmony_ci if (!list_empty(&log->no_mem_stripes)) { 11908c2ecf20Sopenharmony_ci sh = list_first_entry(&log->no_mem_stripes, 11918c2ecf20Sopenharmony_ci struct stripe_head, log_list); 11928c2ecf20Sopenharmony_ci list_del_init(&sh->log_list); 11938c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 11948c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 11958c2ecf20Sopenharmony_ci } 11968c2ecf20Sopenharmony_ci} 11978c2ecf20Sopenharmony_ci 11988c2ecf20Sopenharmony_cistatic bool r5l_complete_finished_ios(struct r5l_log *log) 11998c2ecf20Sopenharmony_ci{ 12008c2ecf20Sopenharmony_ci struct r5l_io_unit *io, *next; 12018c2ecf20Sopenharmony_ci bool found = false; 12028c2ecf20Sopenharmony_ci 12038c2ecf20Sopenharmony_ci lockdep_assert_held(&log->io_list_lock); 12048c2ecf20Sopenharmony_ci 12058c2ecf20Sopenharmony_ci list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { 12068c2ecf20Sopenharmony_ci /* don't change list order */ 12078c2ecf20Sopenharmony_ci if (io->state < IO_UNIT_STRIPE_END) 12088c2ecf20Sopenharmony_ci break; 12098c2ecf20Sopenharmony_ci 12108c2ecf20Sopenharmony_ci log->next_checkpoint = io->log_start; 12118c2ecf20Sopenharmony_ci 12128c2ecf20Sopenharmony_ci list_del(&io->log_sibling); 12138c2ecf20Sopenharmony_ci mempool_free(io, &log->io_pool); 12148c2ecf20Sopenharmony_ci r5l_run_no_mem_stripe(log); 12158c2ecf20Sopenharmony_ci 12168c2ecf20Sopenharmony_ci found = true; 12178c2ecf20Sopenharmony_ci } 12188c2ecf20Sopenharmony_ci 12198c2ecf20Sopenharmony_ci return found; 12208c2ecf20Sopenharmony_ci} 12218c2ecf20Sopenharmony_ci 12228c2ecf20Sopenharmony_cistatic void __r5l_stripe_write_finished(struct r5l_io_unit *io) 12238c2ecf20Sopenharmony_ci{ 12248c2ecf20Sopenharmony_ci struct r5l_log *log = io->log; 12258c2ecf20Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 12268c2ecf20Sopenharmony_ci unsigned long flags; 12278c2ecf20Sopenharmony_ci 12288c2ecf20Sopenharmony_ci spin_lock_irqsave(&log->io_list_lock, flags); 12298c2ecf20Sopenharmony_ci __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci if (!r5l_complete_finished_ios(log)) { 12328c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 12338c2ecf20Sopenharmony_ci return; 12348c2ecf20Sopenharmony_ci } 12358c2ecf20Sopenharmony_ci 12368c2ecf20Sopenharmony_ci if (r5l_reclaimable_space(log) > log->max_free_space || 12378c2ecf20Sopenharmony_ci test_bit(R5C_LOG_TIGHT, &conf->cache_state)) 12388c2ecf20Sopenharmony_ci r5l_wake_reclaim(log, 0); 12398c2ecf20Sopenharmony_ci 12408c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&log->io_list_lock, flags); 
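	/*
	 * wake up r5l_do_reclaim(), which may be sleeping on iounit_wait
	 * until more reclaimable space shows up
	 */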
12418c2ecf20Sopenharmony_ci	wake_up(&log->iounit_wait);
12428c2ecf20Sopenharmony_ci}
12438c2ecf20Sopenharmony_ci
12448c2ecf20Sopenharmony_civoid r5l_stripe_write_finished(struct stripe_head *sh)
12458c2ecf20Sopenharmony_ci{
12468c2ecf20Sopenharmony_ci	struct r5l_io_unit *io;
12478c2ecf20Sopenharmony_ci
12488c2ecf20Sopenharmony_ci	io = sh->log_io;
12498c2ecf20Sopenharmony_ci	sh->log_io = NULL;
12508c2ecf20Sopenharmony_ci
12518c2ecf20Sopenharmony_ci	if (io && atomic_dec_and_test(&io->pending_stripe))
12528c2ecf20Sopenharmony_ci		__r5l_stripe_write_finished(io);
12538c2ecf20Sopenharmony_ci}
12548c2ecf20Sopenharmony_ci
12558c2ecf20Sopenharmony_cistatic void r5l_log_flush_endio(struct bio *bio)
12568c2ecf20Sopenharmony_ci{
12578c2ecf20Sopenharmony_ci	struct r5l_log *log = container_of(bio, struct r5l_log,
12588c2ecf20Sopenharmony_ci		flush_bio);
12598c2ecf20Sopenharmony_ci	unsigned long flags;
12608c2ecf20Sopenharmony_ci	struct r5l_io_unit *io;
12618c2ecf20Sopenharmony_ci
12628c2ecf20Sopenharmony_ci	if (bio->bi_status)
12638c2ecf20Sopenharmony_ci		md_error(log->rdev->mddev, log->rdev);
12648c2ecf20Sopenharmony_ci
12658c2ecf20Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
12668c2ecf20Sopenharmony_ci	list_for_each_entry(io, &log->flushing_ios, log_sibling)
12678c2ecf20Sopenharmony_ci		r5l_io_run_stripes(io);
12688c2ecf20Sopenharmony_ci	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
12698c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
12708c2ecf20Sopenharmony_ci}
12718c2ecf20Sopenharmony_ci
12728c2ecf20Sopenharmony_ci/*
12738c2ecf20Sopenharmony_ci * Starting dispatch IO to raid.
12748c2ecf20Sopenharmony_ci * io_unit(meta) consists of a log. There is one situation we want to avoid: a
12758c2ecf20Sopenharmony_ci * broken meta in the middle of a log prevents recovery from finding the meta at
12768c2ecf20Sopenharmony_ci * the head of the log. If operations require the meta at the head to be persistent
12778c2ecf20Sopenharmony_ci * in the log, we must make sure the meta before it is persistent in the log too. A case is:
12788c2ecf20Sopenharmony_ci *
12798c2ecf20Sopenharmony_ci * stripe data/parity is in the log and we start writing the stripe to raid disks.
12808c2ecf20Sopenharmony_ci * The stripe data/parity must be persistent in the log before we write to the raid disks.
12818c2ecf20Sopenharmony_ci *
12828c2ecf20Sopenharmony_ci * The solution is that we strictly maintain io_unit list order. In this case, we
12838c2ecf20Sopenharmony_ci * only write stripes of an io_unit to the raid disks until the io_unit is the first
12848c2ecf20Sopenharmony_ci * one whose data/parity is in the log.
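 *
 * For example: if io_unit N is still being written to the log while io_unit
 * N+1 has already completed, stripes of N+1 are not dispatched to the raid
 * disks yet. Recovery scans the log sequentially, so N+1's meta only becomes
 * usable once N's meta is persistent; dispatching N+1 early would break that
 * assumption.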
12858c2ecf20Sopenharmony_ci */
12868c2ecf20Sopenharmony_civoid r5l_flush_stripe_to_raid(struct r5l_log *log)
12878c2ecf20Sopenharmony_ci{
12888c2ecf20Sopenharmony_ci	bool do_flush;
12898c2ecf20Sopenharmony_ci
12908c2ecf20Sopenharmony_ci	if (!log || !log->need_cache_flush)
12918c2ecf20Sopenharmony_ci		return;
12928c2ecf20Sopenharmony_ci
12938c2ecf20Sopenharmony_ci	spin_lock_irq(&log->io_list_lock);
12948c2ecf20Sopenharmony_ci	/* flush bio is running */
12958c2ecf20Sopenharmony_ci	if (!list_empty(&log->flushing_ios)) {
12968c2ecf20Sopenharmony_ci		spin_unlock_irq(&log->io_list_lock);
12978c2ecf20Sopenharmony_ci		return;
12988c2ecf20Sopenharmony_ci	}
12998c2ecf20Sopenharmony_ci	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
13008c2ecf20Sopenharmony_ci	do_flush = !list_empty(&log->flushing_ios);
13018c2ecf20Sopenharmony_ci	spin_unlock_irq(&log->io_list_lock);
13028c2ecf20Sopenharmony_ci
13038c2ecf20Sopenharmony_ci	if (!do_flush)
13048c2ecf20Sopenharmony_ci		return;
13058c2ecf20Sopenharmony_ci	bio_reset(&log->flush_bio);
13068c2ecf20Sopenharmony_ci	bio_set_dev(&log->flush_bio, log->rdev->bdev);
13078c2ecf20Sopenharmony_ci	log->flush_bio.bi_end_io = r5l_log_flush_endio;
13088c2ecf20Sopenharmony_ci	log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
13098c2ecf20Sopenharmony_ci	submit_bio(&log->flush_bio);
13108c2ecf20Sopenharmony_ci}
13118c2ecf20Sopenharmony_ci
13128c2ecf20Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp);
13138c2ecf20Sopenharmony_cistatic void r5l_write_super_and_discard_space(struct r5l_log *log,
13148c2ecf20Sopenharmony_ci	sector_t end)
13158c2ecf20Sopenharmony_ci{
13168c2ecf20Sopenharmony_ci	struct block_device *bdev = log->rdev->bdev;
13178c2ecf20Sopenharmony_ci	struct mddev *mddev;
13188c2ecf20Sopenharmony_ci
13198c2ecf20Sopenharmony_ci	r5l_write_super(log, end);
13208c2ecf20Sopenharmony_ci
13218c2ecf20Sopenharmony_ci	if (!blk_queue_discard(bdev_get_queue(bdev)))
13228c2ecf20Sopenharmony_ci		return;
13238c2ecf20Sopenharmony_ci
13248c2ecf20Sopenharmony_ci	mddev = log->rdev->mddev;
13258c2ecf20Sopenharmony_ci	/*
13268c2ecf20Sopenharmony_ci	 * Discard could zero data, so before discard we must make sure the
13278c2ecf20Sopenharmony_ci	 * superblock is updated to the new log tail. Updating the superblock (either
13288c2ecf20Sopenharmony_ci	 * calling md_update_sb() directly or depending on the md thread) must hold the
13298c2ecf20Sopenharmony_ci	 * reconfig mutex. On the other hand, raid5_quiesce() is called with the
13308c2ecf20Sopenharmony_ci	 * reconfig_mutex held. The first step of raid5_quiesce() is waiting
13318c2ecf20Sopenharmony_ci	 * for all IO to finish, hence waiting for the reclaim thread, while the
13328c2ecf20Sopenharmony_ci	 * reclaim thread is calling this function and waiting for the reconfig
13338c2ecf20Sopenharmony_ci	 * mutex. So there is a deadlock. We work around this issue with a trylock.
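	 *
	 * Roughly, the cycle being avoided is:
	 *   reclaim thread: md_update_sb() -> needs the reconfig mutex
	 *   raid5_quiesce(): holds the reconfig mutex -> waits for IO, i.e. for reclaim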
13348c2ecf20Sopenharmony_ci * FIXME: we could miss discard if we can't take reconfig mutex 13358c2ecf20Sopenharmony_ci */ 13368c2ecf20Sopenharmony_ci set_mask_bits(&mddev->sb_flags, 0, 13378c2ecf20Sopenharmony_ci BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING)); 13388c2ecf20Sopenharmony_ci if (!mddev_trylock(mddev)) 13398c2ecf20Sopenharmony_ci return; 13408c2ecf20Sopenharmony_ci md_update_sb(mddev, 1); 13418c2ecf20Sopenharmony_ci mddev_unlock(mddev); 13428c2ecf20Sopenharmony_ci 13438c2ecf20Sopenharmony_ci /* discard IO error really doesn't matter, ignore it */ 13448c2ecf20Sopenharmony_ci if (log->last_checkpoint < end) { 13458c2ecf20Sopenharmony_ci blkdev_issue_discard(bdev, 13468c2ecf20Sopenharmony_ci log->last_checkpoint + log->rdev->data_offset, 13478c2ecf20Sopenharmony_ci end - log->last_checkpoint, GFP_NOIO, 0); 13488c2ecf20Sopenharmony_ci } else { 13498c2ecf20Sopenharmony_ci blkdev_issue_discard(bdev, 13508c2ecf20Sopenharmony_ci log->last_checkpoint + log->rdev->data_offset, 13518c2ecf20Sopenharmony_ci log->device_size - log->last_checkpoint, 13528c2ecf20Sopenharmony_ci GFP_NOIO, 0); 13538c2ecf20Sopenharmony_ci blkdev_issue_discard(bdev, log->rdev->data_offset, end, 13548c2ecf20Sopenharmony_ci GFP_NOIO, 0); 13558c2ecf20Sopenharmony_ci } 13568c2ecf20Sopenharmony_ci} 13578c2ecf20Sopenharmony_ci 13588c2ecf20Sopenharmony_ci/* 13598c2ecf20Sopenharmony_ci * r5c_flush_stripe moves stripe from cached list to handle_list. When called, 13608c2ecf20Sopenharmony_ci * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes. 13618c2ecf20Sopenharmony_ci * 13628c2ecf20Sopenharmony_ci * must hold conf->device_lock 13638c2ecf20Sopenharmony_ci */ 13648c2ecf20Sopenharmony_cistatic void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) 13658c2ecf20Sopenharmony_ci{ 13668c2ecf20Sopenharmony_ci BUG_ON(list_empty(&sh->lru)); 13678c2ecf20Sopenharmony_ci BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 13688c2ecf20Sopenharmony_ci BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 13698c2ecf20Sopenharmony_ci 13708c2ecf20Sopenharmony_ci /* 13718c2ecf20Sopenharmony_ci * The stripe is not ON_RELEASE_LIST, so it is safe to call 13728c2ecf20Sopenharmony_ci * raid5_release_stripe() while holding conf->device_lock 13738c2ecf20Sopenharmony_ci */ 13748c2ecf20Sopenharmony_ci BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); 13758c2ecf20Sopenharmony_ci lockdep_assert_held(&conf->device_lock); 13768c2ecf20Sopenharmony_ci 13778c2ecf20Sopenharmony_ci list_del_init(&sh->lru); 13788c2ecf20Sopenharmony_ci atomic_inc(&sh->count); 13798c2ecf20Sopenharmony_ci 13808c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 13818c2ecf20Sopenharmony_ci atomic_inc(&conf->active_stripes); 13828c2ecf20Sopenharmony_ci r5c_make_stripe_write_out(sh); 13838c2ecf20Sopenharmony_ci 13848c2ecf20Sopenharmony_ci if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) 13858c2ecf20Sopenharmony_ci atomic_inc(&conf->r5c_flushing_partial_stripes); 13868c2ecf20Sopenharmony_ci else 13878c2ecf20Sopenharmony_ci atomic_inc(&conf->r5c_flushing_full_stripes); 13888c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 13898c2ecf20Sopenharmony_ci} 13908c2ecf20Sopenharmony_ci 13918c2ecf20Sopenharmony_ci/* 13928c2ecf20Sopenharmony_ci * if num == 0, flush all full stripes 13938c2ecf20Sopenharmony_ci * if num > 0, flush all full stripes. If less than num full stripes are 13948c2ecf20Sopenharmony_ci * flushed, flush some partial stripes until totally num stripes are 13958c2ecf20Sopenharmony_ci * flushed or there is no more cached stripes. 
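 *
 * For example, r5c_do_reclaim() below passes R5C_RECLAIM_STRIPE_GROUP when
 * stripe cache pressure is high (all full stripes plus some partial stripes
 * get flushed) and 0 when the pressure is moderate (only full stripes).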
13968c2ecf20Sopenharmony_ci */ 13978c2ecf20Sopenharmony_civoid r5c_flush_cache(struct r5conf *conf, int num) 13988c2ecf20Sopenharmony_ci{ 13998c2ecf20Sopenharmony_ci int count; 14008c2ecf20Sopenharmony_ci struct stripe_head *sh, *next; 14018c2ecf20Sopenharmony_ci 14028c2ecf20Sopenharmony_ci lockdep_assert_held(&conf->device_lock); 14038c2ecf20Sopenharmony_ci if (!conf->log) 14048c2ecf20Sopenharmony_ci return; 14058c2ecf20Sopenharmony_ci 14068c2ecf20Sopenharmony_ci count = 0; 14078c2ecf20Sopenharmony_ci list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) { 14088c2ecf20Sopenharmony_ci r5c_flush_stripe(conf, sh); 14098c2ecf20Sopenharmony_ci count++; 14108c2ecf20Sopenharmony_ci } 14118c2ecf20Sopenharmony_ci 14128c2ecf20Sopenharmony_ci if (count >= num) 14138c2ecf20Sopenharmony_ci return; 14148c2ecf20Sopenharmony_ci list_for_each_entry_safe(sh, next, 14158c2ecf20Sopenharmony_ci &conf->r5c_partial_stripe_list, lru) { 14168c2ecf20Sopenharmony_ci r5c_flush_stripe(conf, sh); 14178c2ecf20Sopenharmony_ci if (++count >= num) 14188c2ecf20Sopenharmony_ci break; 14198c2ecf20Sopenharmony_ci } 14208c2ecf20Sopenharmony_ci} 14218c2ecf20Sopenharmony_ci 14228c2ecf20Sopenharmony_cistatic void r5c_do_reclaim(struct r5conf *conf) 14238c2ecf20Sopenharmony_ci{ 14248c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 14258c2ecf20Sopenharmony_ci struct stripe_head *sh; 14268c2ecf20Sopenharmony_ci int count = 0; 14278c2ecf20Sopenharmony_ci unsigned long flags; 14288c2ecf20Sopenharmony_ci int total_cached; 14298c2ecf20Sopenharmony_ci int stripes_to_flush; 14308c2ecf20Sopenharmony_ci int flushing_partial, flushing_full; 14318c2ecf20Sopenharmony_ci 14328c2ecf20Sopenharmony_ci if (!r5c_is_writeback(log)) 14338c2ecf20Sopenharmony_ci return; 14348c2ecf20Sopenharmony_ci 14358c2ecf20Sopenharmony_ci flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); 14368c2ecf20Sopenharmony_ci flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); 14378c2ecf20Sopenharmony_ci total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + 14388c2ecf20Sopenharmony_ci atomic_read(&conf->r5c_cached_full_stripes) - 14398c2ecf20Sopenharmony_ci flushing_full - flushing_partial; 14408c2ecf20Sopenharmony_ci 14418c2ecf20Sopenharmony_ci if (total_cached > conf->min_nr_stripes * 3 / 4 || 14428c2ecf20Sopenharmony_ci atomic_read(&conf->empty_inactive_list_nr) > 0) 14438c2ecf20Sopenharmony_ci /* 14448c2ecf20Sopenharmony_ci * if stripe cache pressure high, flush all full stripes and 14458c2ecf20Sopenharmony_ci * some partial stripes 14468c2ecf20Sopenharmony_ci */ 14478c2ecf20Sopenharmony_ci stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; 14488c2ecf20Sopenharmony_ci else if (total_cached > conf->min_nr_stripes * 1 / 2 || 14498c2ecf20Sopenharmony_ci atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > 14508c2ecf20Sopenharmony_ci R5C_FULL_STRIPE_FLUSH_BATCH(conf)) 14518c2ecf20Sopenharmony_ci /* 14528c2ecf20Sopenharmony_ci * if stripe cache pressure moderate, or if there is many full 14538c2ecf20Sopenharmony_ci * stripes,flush all full stripes 14548c2ecf20Sopenharmony_ci */ 14558c2ecf20Sopenharmony_ci stripes_to_flush = 0; 14568c2ecf20Sopenharmony_ci else 14578c2ecf20Sopenharmony_ci /* no need to flush */ 14588c2ecf20Sopenharmony_ci stripes_to_flush = -1; 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ci if (stripes_to_flush >= 0) { 14618c2ecf20Sopenharmony_ci spin_lock_irqsave(&conf->device_lock, flags); 14628c2ecf20Sopenharmony_ci r5c_flush_cache(conf, stripes_to_flush); 14638c2ecf20Sopenharmony_ci 
spin_unlock_irqrestore(&conf->device_lock, flags); 14648c2ecf20Sopenharmony_ci } 14658c2ecf20Sopenharmony_ci 14668c2ecf20Sopenharmony_ci /* if log space is tight, flush stripes on stripe_in_journal_list */ 14678c2ecf20Sopenharmony_ci if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) { 14688c2ecf20Sopenharmony_ci spin_lock_irqsave(&log->stripe_in_journal_lock, flags); 14698c2ecf20Sopenharmony_ci spin_lock(&conf->device_lock); 14708c2ecf20Sopenharmony_ci list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) { 14718c2ecf20Sopenharmony_ci /* 14728c2ecf20Sopenharmony_ci * stripes on stripe_in_journal_list could be in any 14738c2ecf20Sopenharmony_ci * state of the stripe_cache state machine. In this 14748c2ecf20Sopenharmony_ci * case, we only want to flush stripe on 14758c2ecf20Sopenharmony_ci * r5c_cached_full/partial_stripes. The following 14768c2ecf20Sopenharmony_ci * condition makes sure the stripe is on one of the 14778c2ecf20Sopenharmony_ci * two lists. 14788c2ecf20Sopenharmony_ci */ 14798c2ecf20Sopenharmony_ci if (!list_empty(&sh->lru) && 14808c2ecf20Sopenharmony_ci !test_bit(STRIPE_HANDLE, &sh->state) && 14818c2ecf20Sopenharmony_ci atomic_read(&sh->count) == 0) { 14828c2ecf20Sopenharmony_ci r5c_flush_stripe(conf, sh); 14838c2ecf20Sopenharmony_ci if (count++ >= R5C_RECLAIM_STRIPE_GROUP) 14848c2ecf20Sopenharmony_ci break; 14858c2ecf20Sopenharmony_ci } 14868c2ecf20Sopenharmony_ci } 14878c2ecf20Sopenharmony_ci spin_unlock(&conf->device_lock); 14888c2ecf20Sopenharmony_ci spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); 14898c2ecf20Sopenharmony_ci } 14908c2ecf20Sopenharmony_ci 14918c2ecf20Sopenharmony_ci if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) 14928c2ecf20Sopenharmony_ci r5l_run_no_space_stripes(log); 14938c2ecf20Sopenharmony_ci 14948c2ecf20Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 14958c2ecf20Sopenharmony_ci} 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_cistatic void r5l_do_reclaim(struct r5l_log *log) 14988c2ecf20Sopenharmony_ci{ 14998c2ecf20Sopenharmony_ci struct r5conf *conf = log->rdev->mddev->private; 15008c2ecf20Sopenharmony_ci sector_t reclaim_target = xchg(&log->reclaim_target, 0); 15018c2ecf20Sopenharmony_ci sector_t reclaimable; 15028c2ecf20Sopenharmony_ci sector_t next_checkpoint; 15038c2ecf20Sopenharmony_ci bool write_super; 15048c2ecf20Sopenharmony_ci 15058c2ecf20Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 15068c2ecf20Sopenharmony_ci write_super = r5l_reclaimable_space(log) > log->max_free_space || 15078c2ecf20Sopenharmony_ci reclaim_target != 0 || !list_empty(&log->no_space_stripes); 15088c2ecf20Sopenharmony_ci /* 15098c2ecf20Sopenharmony_ci * move proper io_unit to reclaim list. We should not change the order. 
15108c2ecf20Sopenharmony_ci * reclaimable/unreclaimable io_unit can be mixed in the list, we 15118c2ecf20Sopenharmony_ci * shouldn't reuse space of an unreclaimable io_unit 15128c2ecf20Sopenharmony_ci */ 15138c2ecf20Sopenharmony_ci while (1) { 15148c2ecf20Sopenharmony_ci reclaimable = r5l_reclaimable_space(log); 15158c2ecf20Sopenharmony_ci if (reclaimable >= reclaim_target || 15168c2ecf20Sopenharmony_ci (list_empty(&log->running_ios) && 15178c2ecf20Sopenharmony_ci list_empty(&log->io_end_ios) && 15188c2ecf20Sopenharmony_ci list_empty(&log->flushing_ios) && 15198c2ecf20Sopenharmony_ci list_empty(&log->finished_ios))) 15208c2ecf20Sopenharmony_ci break; 15218c2ecf20Sopenharmony_ci 15228c2ecf20Sopenharmony_ci md_wakeup_thread(log->rdev->mddev->thread); 15238c2ecf20Sopenharmony_ci wait_event_lock_irq(log->iounit_wait, 15248c2ecf20Sopenharmony_ci r5l_reclaimable_space(log) > reclaimable, 15258c2ecf20Sopenharmony_ci log->io_list_lock); 15268c2ecf20Sopenharmony_ci } 15278c2ecf20Sopenharmony_ci 15288c2ecf20Sopenharmony_ci next_checkpoint = r5c_calculate_new_cp(conf); 15298c2ecf20Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 15308c2ecf20Sopenharmony_ci 15318c2ecf20Sopenharmony_ci if (reclaimable == 0 || !write_super) 15328c2ecf20Sopenharmony_ci return; 15338c2ecf20Sopenharmony_ci 15348c2ecf20Sopenharmony_ci /* 15358c2ecf20Sopenharmony_ci * write_super will flush cache of each raid disk. We must write super 15368c2ecf20Sopenharmony_ci * here, because the log area might be reused soon and we don't want to 15378c2ecf20Sopenharmony_ci * confuse recovery 15388c2ecf20Sopenharmony_ci */ 15398c2ecf20Sopenharmony_ci r5l_write_super_and_discard_space(log, next_checkpoint); 15408c2ecf20Sopenharmony_ci 15418c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 15428c2ecf20Sopenharmony_ci log->last_checkpoint = next_checkpoint; 15438c2ecf20Sopenharmony_ci r5c_update_log_state(log); 15448c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 15458c2ecf20Sopenharmony_ci 15468c2ecf20Sopenharmony_ci r5l_run_no_space_stripes(log); 15478c2ecf20Sopenharmony_ci} 15488c2ecf20Sopenharmony_ci 15498c2ecf20Sopenharmony_cistatic void r5l_reclaim_thread(struct md_thread *thread) 15508c2ecf20Sopenharmony_ci{ 15518c2ecf20Sopenharmony_ci struct mddev *mddev = thread->mddev; 15528c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 15538c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 15548c2ecf20Sopenharmony_ci 15558c2ecf20Sopenharmony_ci if (!log) 15568c2ecf20Sopenharmony_ci return; 15578c2ecf20Sopenharmony_ci r5c_do_reclaim(conf); 15588c2ecf20Sopenharmony_ci r5l_do_reclaim(log); 15598c2ecf20Sopenharmony_ci} 15608c2ecf20Sopenharmony_ci 15618c2ecf20Sopenharmony_civoid r5l_wake_reclaim(struct r5l_log *log, sector_t space) 15628c2ecf20Sopenharmony_ci{ 15638c2ecf20Sopenharmony_ci unsigned long target; 15648c2ecf20Sopenharmony_ci unsigned long new = (unsigned long)space; /* overflow in theory */ 15658c2ecf20Sopenharmony_ci 15668c2ecf20Sopenharmony_ci if (!log) 15678c2ecf20Sopenharmony_ci return; 15688c2ecf20Sopenharmony_ci do { 15698c2ecf20Sopenharmony_ci target = log->reclaim_target; 15708c2ecf20Sopenharmony_ci if (new < target) 15718c2ecf20Sopenharmony_ci return; 15728c2ecf20Sopenharmony_ci } while (cmpxchg(&log->reclaim_target, target, new) != target); 15738c2ecf20Sopenharmony_ci md_wakeup_thread(log->reclaim_thread); 15748c2ecf20Sopenharmony_ci} 15758c2ecf20Sopenharmony_ci 15768c2ecf20Sopenharmony_civoid r5l_quiesce(struct r5l_log *log, int quiesce) 15778c2ecf20Sopenharmony_ci{ 15788c2ecf20Sopenharmony_ci struct 
mddev *mddev; 15798c2ecf20Sopenharmony_ci 15808c2ecf20Sopenharmony_ci if (quiesce) { 15818c2ecf20Sopenharmony_ci /* make sure r5l_write_super_and_discard_space exits */ 15828c2ecf20Sopenharmony_ci mddev = log->rdev->mddev; 15838c2ecf20Sopenharmony_ci wake_up(&mddev->sb_wait); 15848c2ecf20Sopenharmony_ci kthread_park(log->reclaim_thread->tsk); 15858c2ecf20Sopenharmony_ci r5l_wake_reclaim(log, MaxSector); 15868c2ecf20Sopenharmony_ci r5l_do_reclaim(log); 15878c2ecf20Sopenharmony_ci } else 15888c2ecf20Sopenharmony_ci kthread_unpark(log->reclaim_thread->tsk); 15898c2ecf20Sopenharmony_ci} 15908c2ecf20Sopenharmony_ci 15918c2ecf20Sopenharmony_cibool r5l_log_disk_error(struct r5conf *conf) 15928c2ecf20Sopenharmony_ci{ 15938c2ecf20Sopenharmony_ci struct r5l_log *log; 15948c2ecf20Sopenharmony_ci bool ret; 15958c2ecf20Sopenharmony_ci /* don't allow write if journal disk is missing */ 15968c2ecf20Sopenharmony_ci rcu_read_lock(); 15978c2ecf20Sopenharmony_ci log = rcu_dereference(conf->log); 15988c2ecf20Sopenharmony_ci 15998c2ecf20Sopenharmony_ci if (!log) 16008c2ecf20Sopenharmony_ci ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 16018c2ecf20Sopenharmony_ci else 16028c2ecf20Sopenharmony_ci ret = test_bit(Faulty, &log->rdev->flags); 16038c2ecf20Sopenharmony_ci rcu_read_unlock(); 16048c2ecf20Sopenharmony_ci return ret; 16058c2ecf20Sopenharmony_ci} 16068c2ecf20Sopenharmony_ci 16078c2ecf20Sopenharmony_ci#define R5L_RECOVERY_PAGE_POOL_SIZE 256 16088c2ecf20Sopenharmony_ci 16098c2ecf20Sopenharmony_cistruct r5l_recovery_ctx { 16108c2ecf20Sopenharmony_ci struct page *meta_page; /* current meta */ 16118c2ecf20Sopenharmony_ci sector_t meta_total_blocks; /* total size of current meta and data */ 16128c2ecf20Sopenharmony_ci sector_t pos; /* recovery position */ 16138c2ecf20Sopenharmony_ci u64 seq; /* recovery position seq */ 16148c2ecf20Sopenharmony_ci int data_parity_stripes; /* number of data_parity stripes */ 16158c2ecf20Sopenharmony_ci int data_only_stripes; /* number of data_only stripes */ 16168c2ecf20Sopenharmony_ci struct list_head cached_list; 16178c2ecf20Sopenharmony_ci 16188c2ecf20Sopenharmony_ci /* 16198c2ecf20Sopenharmony_ci * read ahead page pool (ra_pool) 16208c2ecf20Sopenharmony_ci * in recovery, log is read sequentially. It is not efficient to 16218c2ecf20Sopenharmony_ci * read every page with sync_page_io(). The read ahead page pool 16228c2ecf20Sopenharmony_ci * reads multiple pages with one IO, so further log read can 16238c2ecf20Sopenharmony_ci * just copy data from the pool. 
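	 *
	 * For example, with R5L_RECOVERY_PAGE_POOL_SIZE of 256 and 4K pages,
	 * one read ahead covers up to 1MB of the log, and the per-block reads
	 * below are then served by a memcpy() from the pool.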
16248c2ecf20Sopenharmony_ci */ 16258c2ecf20Sopenharmony_ci struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE]; 16268c2ecf20Sopenharmony_ci sector_t pool_offset; /* offset of first page in the pool */ 16278c2ecf20Sopenharmony_ci int total_pages; /* total allocated pages */ 16288c2ecf20Sopenharmony_ci int valid_pages; /* pages with valid data */ 16298c2ecf20Sopenharmony_ci struct bio *ra_bio; /* bio to do the read ahead */ 16308c2ecf20Sopenharmony_ci}; 16318c2ecf20Sopenharmony_ci 16328c2ecf20Sopenharmony_cistatic int r5l_recovery_allocate_ra_pool(struct r5l_log *log, 16338c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 16348c2ecf20Sopenharmony_ci{ 16358c2ecf20Sopenharmony_ci struct page *page; 16368c2ecf20Sopenharmony_ci 16378c2ecf20Sopenharmony_ci ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs); 16388c2ecf20Sopenharmony_ci if (!ctx->ra_bio) 16398c2ecf20Sopenharmony_ci return -ENOMEM; 16408c2ecf20Sopenharmony_ci 16418c2ecf20Sopenharmony_ci ctx->valid_pages = 0; 16428c2ecf20Sopenharmony_ci ctx->total_pages = 0; 16438c2ecf20Sopenharmony_ci while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) { 16448c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 16458c2ecf20Sopenharmony_ci 16468c2ecf20Sopenharmony_ci if (!page) 16478c2ecf20Sopenharmony_ci break; 16488c2ecf20Sopenharmony_ci ctx->ra_pool[ctx->total_pages] = page; 16498c2ecf20Sopenharmony_ci ctx->total_pages += 1; 16508c2ecf20Sopenharmony_ci } 16518c2ecf20Sopenharmony_ci 16528c2ecf20Sopenharmony_ci if (ctx->total_pages == 0) { 16538c2ecf20Sopenharmony_ci bio_put(ctx->ra_bio); 16548c2ecf20Sopenharmony_ci return -ENOMEM; 16558c2ecf20Sopenharmony_ci } 16568c2ecf20Sopenharmony_ci 16578c2ecf20Sopenharmony_ci ctx->pool_offset = 0; 16588c2ecf20Sopenharmony_ci return 0; 16598c2ecf20Sopenharmony_ci} 16608c2ecf20Sopenharmony_ci 16618c2ecf20Sopenharmony_cistatic void r5l_recovery_free_ra_pool(struct r5l_log *log, 16628c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 16638c2ecf20Sopenharmony_ci{ 16648c2ecf20Sopenharmony_ci int i; 16658c2ecf20Sopenharmony_ci 16668c2ecf20Sopenharmony_ci for (i = 0; i < ctx->total_pages; ++i) 16678c2ecf20Sopenharmony_ci put_page(ctx->ra_pool[i]); 16688c2ecf20Sopenharmony_ci bio_put(ctx->ra_bio); 16698c2ecf20Sopenharmony_ci} 16708c2ecf20Sopenharmony_ci 16718c2ecf20Sopenharmony_ci/* 16728c2ecf20Sopenharmony_ci * fetch ctx->valid_pages pages from offset 16738c2ecf20Sopenharmony_ci * In normal cases, ctx->valid_pages == ctx->total_pages after the call. 
16748c2ecf20Sopenharmony_ci * However, if the offset is close to the end of the journal device, 16758c2ecf20Sopenharmony_ci * ctx->valid_pages could be smaller than ctx->total_pages 16768c2ecf20Sopenharmony_ci */ 16778c2ecf20Sopenharmony_cistatic int r5l_recovery_fetch_ra_pool(struct r5l_log *log, 16788c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx, 16798c2ecf20Sopenharmony_ci sector_t offset) 16808c2ecf20Sopenharmony_ci{ 16818c2ecf20Sopenharmony_ci bio_reset(ctx->ra_bio); 16828c2ecf20Sopenharmony_ci bio_set_dev(ctx->ra_bio, log->rdev->bdev); 16838c2ecf20Sopenharmony_ci bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0); 16848c2ecf20Sopenharmony_ci ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset; 16858c2ecf20Sopenharmony_ci 16868c2ecf20Sopenharmony_ci ctx->valid_pages = 0; 16878c2ecf20Sopenharmony_ci ctx->pool_offset = offset; 16888c2ecf20Sopenharmony_ci 16898c2ecf20Sopenharmony_ci while (ctx->valid_pages < ctx->total_pages) { 16908c2ecf20Sopenharmony_ci bio_add_page(ctx->ra_bio, 16918c2ecf20Sopenharmony_ci ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0); 16928c2ecf20Sopenharmony_ci ctx->valid_pages += 1; 16938c2ecf20Sopenharmony_ci 16948c2ecf20Sopenharmony_ci offset = r5l_ring_add(log, offset, BLOCK_SECTORS); 16958c2ecf20Sopenharmony_ci 16968c2ecf20Sopenharmony_ci if (offset == 0) /* reached end of the device */ 16978c2ecf20Sopenharmony_ci break; 16988c2ecf20Sopenharmony_ci } 16998c2ecf20Sopenharmony_ci 17008c2ecf20Sopenharmony_ci return submit_bio_wait(ctx->ra_bio); 17018c2ecf20Sopenharmony_ci} 17028c2ecf20Sopenharmony_ci 17038c2ecf20Sopenharmony_ci/* 17048c2ecf20Sopenharmony_ci * try read a page from the read ahead page pool, if the page is not in the 17058c2ecf20Sopenharmony_ci * pool, call r5l_recovery_fetch_ra_pool 17068c2ecf20Sopenharmony_ci */ 17078c2ecf20Sopenharmony_cistatic int r5l_recovery_read_page(struct r5l_log *log, 17088c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx, 17098c2ecf20Sopenharmony_ci struct page *page, 17108c2ecf20Sopenharmony_ci sector_t offset) 17118c2ecf20Sopenharmony_ci{ 17128c2ecf20Sopenharmony_ci int ret; 17138c2ecf20Sopenharmony_ci 17148c2ecf20Sopenharmony_ci if (offset < ctx->pool_offset || 17158c2ecf20Sopenharmony_ci offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) { 17168c2ecf20Sopenharmony_ci ret = r5l_recovery_fetch_ra_pool(log, ctx, offset); 17178c2ecf20Sopenharmony_ci if (ret) 17188c2ecf20Sopenharmony_ci return ret; 17198c2ecf20Sopenharmony_ci } 17208c2ecf20Sopenharmony_ci 17218c2ecf20Sopenharmony_ci BUG_ON(offset < ctx->pool_offset || 17228c2ecf20Sopenharmony_ci offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS); 17238c2ecf20Sopenharmony_ci 17248c2ecf20Sopenharmony_ci memcpy(page_address(page), 17258c2ecf20Sopenharmony_ci page_address(ctx->ra_pool[(offset - ctx->pool_offset) >> 17268c2ecf20Sopenharmony_ci BLOCK_SECTOR_SHIFT]), 17278c2ecf20Sopenharmony_ci PAGE_SIZE); 17288c2ecf20Sopenharmony_ci return 0; 17298c2ecf20Sopenharmony_ci} 17308c2ecf20Sopenharmony_ci 17318c2ecf20Sopenharmony_cistatic int r5l_recovery_read_meta_block(struct r5l_log *log, 17328c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 17338c2ecf20Sopenharmony_ci{ 17348c2ecf20Sopenharmony_ci struct page *page = ctx->meta_page; 17358c2ecf20Sopenharmony_ci struct r5l_meta_block *mb; 17368c2ecf20Sopenharmony_ci u32 crc, stored_crc; 17378c2ecf20Sopenharmony_ci int ret; 17388c2ecf20Sopenharmony_ci 17398c2ecf20Sopenharmony_ci ret = r5l_recovery_read_page(log, ctx, page, ctx->pos); 17408c2ecf20Sopenharmony_ci if (ret != 0) 
17418c2ecf20Sopenharmony_ci return ret; 17428c2ecf20Sopenharmony_ci 17438c2ecf20Sopenharmony_ci mb = page_address(page); 17448c2ecf20Sopenharmony_ci stored_crc = le32_to_cpu(mb->checksum); 17458c2ecf20Sopenharmony_ci mb->checksum = 0; 17468c2ecf20Sopenharmony_ci 17478c2ecf20Sopenharmony_ci if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 17488c2ecf20Sopenharmony_ci le64_to_cpu(mb->seq) != ctx->seq || 17498c2ecf20Sopenharmony_ci mb->version != R5LOG_VERSION || 17508c2ecf20Sopenharmony_ci le64_to_cpu(mb->position) != ctx->pos) 17518c2ecf20Sopenharmony_ci return -EINVAL; 17528c2ecf20Sopenharmony_ci 17538c2ecf20Sopenharmony_ci crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 17548c2ecf20Sopenharmony_ci if (stored_crc != crc) 17558c2ecf20Sopenharmony_ci return -EINVAL; 17568c2ecf20Sopenharmony_ci 17578c2ecf20Sopenharmony_ci if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) 17588c2ecf20Sopenharmony_ci return -EINVAL; 17598c2ecf20Sopenharmony_ci 17608c2ecf20Sopenharmony_ci ctx->meta_total_blocks = BLOCK_SECTORS; 17618c2ecf20Sopenharmony_ci 17628c2ecf20Sopenharmony_ci return 0; 17638c2ecf20Sopenharmony_ci} 17648c2ecf20Sopenharmony_ci 17658c2ecf20Sopenharmony_cistatic void 17668c2ecf20Sopenharmony_cir5l_recovery_create_empty_meta_block(struct r5l_log *log, 17678c2ecf20Sopenharmony_ci struct page *page, 17688c2ecf20Sopenharmony_ci sector_t pos, u64 seq) 17698c2ecf20Sopenharmony_ci{ 17708c2ecf20Sopenharmony_ci struct r5l_meta_block *mb; 17718c2ecf20Sopenharmony_ci 17728c2ecf20Sopenharmony_ci mb = page_address(page); 17738c2ecf20Sopenharmony_ci clear_page(mb); 17748c2ecf20Sopenharmony_ci mb->magic = cpu_to_le32(R5LOG_MAGIC); 17758c2ecf20Sopenharmony_ci mb->version = R5LOG_VERSION; 17768c2ecf20Sopenharmony_ci mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); 17778c2ecf20Sopenharmony_ci mb->seq = cpu_to_le64(seq); 17788c2ecf20Sopenharmony_ci mb->position = cpu_to_le64(pos); 17798c2ecf20Sopenharmony_ci} 17808c2ecf20Sopenharmony_ci 17818c2ecf20Sopenharmony_cistatic int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, 17828c2ecf20Sopenharmony_ci u64 seq) 17838c2ecf20Sopenharmony_ci{ 17848c2ecf20Sopenharmony_ci struct page *page; 17858c2ecf20Sopenharmony_ci struct r5l_meta_block *mb; 17868c2ecf20Sopenharmony_ci 17878c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 17888c2ecf20Sopenharmony_ci if (!page) 17898c2ecf20Sopenharmony_ci return -ENOMEM; 17908c2ecf20Sopenharmony_ci r5l_recovery_create_empty_meta_block(log, page, pos, seq); 17918c2ecf20Sopenharmony_ci mb = page_address(page); 17928c2ecf20Sopenharmony_ci mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 17938c2ecf20Sopenharmony_ci mb, PAGE_SIZE)); 17948c2ecf20Sopenharmony_ci if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, 17958c2ecf20Sopenharmony_ci REQ_SYNC | REQ_FUA, false)) { 17968c2ecf20Sopenharmony_ci __free_page(page); 17978c2ecf20Sopenharmony_ci return -EIO; 17988c2ecf20Sopenharmony_ci } 17998c2ecf20Sopenharmony_ci __free_page(page); 18008c2ecf20Sopenharmony_ci return 0; 18018c2ecf20Sopenharmony_ci} 18028c2ecf20Sopenharmony_ci 18038c2ecf20Sopenharmony_ci/* 18048c2ecf20Sopenharmony_ci * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite 18058c2ecf20Sopenharmony_ci * to mark valid (potentially not flushed) data in the journal. 18068c2ecf20Sopenharmony_ci * 18078c2ecf20Sopenharmony_ci * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, 18088c2ecf20Sopenharmony_ci * so there should not be any mismatch here. 
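 *
 * Note that loading a data payload puts the stripe into the caching phase
 * (STRIPE_R5C_CACHING is set), while loading parity moves it to the
 * writing-out phase, matching the state machine described at the top of
 * this file.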
18098c2ecf20Sopenharmony_ci */ 18108c2ecf20Sopenharmony_cistatic void r5l_recovery_load_data(struct r5l_log *log, 18118c2ecf20Sopenharmony_ci struct stripe_head *sh, 18128c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx, 18138c2ecf20Sopenharmony_ci struct r5l_payload_data_parity *payload, 18148c2ecf20Sopenharmony_ci sector_t log_offset) 18158c2ecf20Sopenharmony_ci{ 18168c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 18178c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 18188c2ecf20Sopenharmony_ci int dd_idx; 18198c2ecf20Sopenharmony_ci 18208c2ecf20Sopenharmony_ci raid5_compute_sector(conf, 18218c2ecf20Sopenharmony_ci le64_to_cpu(payload->location), 0, 18228c2ecf20Sopenharmony_ci &dd_idx, sh); 18238c2ecf20Sopenharmony_ci r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset); 18248c2ecf20Sopenharmony_ci sh->dev[dd_idx].log_checksum = 18258c2ecf20Sopenharmony_ci le32_to_cpu(payload->checksum[0]); 18268c2ecf20Sopenharmony_ci ctx->meta_total_blocks += BLOCK_SECTORS; 18278c2ecf20Sopenharmony_ci 18288c2ecf20Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); 18298c2ecf20Sopenharmony_ci set_bit(STRIPE_R5C_CACHING, &sh->state); 18308c2ecf20Sopenharmony_ci} 18318c2ecf20Sopenharmony_ci 18328c2ecf20Sopenharmony_cistatic void r5l_recovery_load_parity(struct r5l_log *log, 18338c2ecf20Sopenharmony_ci struct stripe_head *sh, 18348c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx, 18358c2ecf20Sopenharmony_ci struct r5l_payload_data_parity *payload, 18368c2ecf20Sopenharmony_ci sector_t log_offset) 18378c2ecf20Sopenharmony_ci{ 18388c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 18398c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 18408c2ecf20Sopenharmony_ci 18418c2ecf20Sopenharmony_ci ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; 18428c2ecf20Sopenharmony_ci r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset); 18438c2ecf20Sopenharmony_ci sh->dev[sh->pd_idx].log_checksum = 18448c2ecf20Sopenharmony_ci le32_to_cpu(payload->checksum[0]); 18458c2ecf20Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); 18468c2ecf20Sopenharmony_ci 18478c2ecf20Sopenharmony_ci if (sh->qd_idx >= 0) { 18488c2ecf20Sopenharmony_ci r5l_recovery_read_page( 18498c2ecf20Sopenharmony_ci log, ctx, sh->dev[sh->qd_idx].page, 18508c2ecf20Sopenharmony_ci r5l_ring_add(log, log_offset, BLOCK_SECTORS)); 18518c2ecf20Sopenharmony_ci sh->dev[sh->qd_idx].log_checksum = 18528c2ecf20Sopenharmony_ci le32_to_cpu(payload->checksum[1]); 18538c2ecf20Sopenharmony_ci set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); 18548c2ecf20Sopenharmony_ci } 18558c2ecf20Sopenharmony_ci clear_bit(STRIPE_R5C_CACHING, &sh->state); 18568c2ecf20Sopenharmony_ci} 18578c2ecf20Sopenharmony_ci 18588c2ecf20Sopenharmony_cistatic void r5l_recovery_reset_stripe(struct stripe_head *sh) 18598c2ecf20Sopenharmony_ci{ 18608c2ecf20Sopenharmony_ci int i; 18618c2ecf20Sopenharmony_ci 18628c2ecf20Sopenharmony_ci sh->state = 0; 18638c2ecf20Sopenharmony_ci sh->log_start = MaxSector; 18648c2ecf20Sopenharmony_ci for (i = sh->disks; i--; ) 18658c2ecf20Sopenharmony_ci sh->dev[i].flags = 0; 18668c2ecf20Sopenharmony_ci} 18678c2ecf20Sopenharmony_ci 18688c2ecf20Sopenharmony_cistatic void 18698c2ecf20Sopenharmony_cir5l_recovery_replay_one_stripe(struct r5conf *conf, 18708c2ecf20Sopenharmony_ci struct stripe_head *sh, 18718c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 18728c2ecf20Sopenharmony_ci{ 18738c2ecf20Sopenharmony_ci struct md_rdev *rdev, *rrdev; 
18748c2ecf20Sopenharmony_ci int disk_index; 18758c2ecf20Sopenharmony_ci int data_count = 0; 18768c2ecf20Sopenharmony_ci 18778c2ecf20Sopenharmony_ci for (disk_index = 0; disk_index < sh->disks; disk_index++) { 18788c2ecf20Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 18798c2ecf20Sopenharmony_ci continue; 18808c2ecf20Sopenharmony_ci if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) 18818c2ecf20Sopenharmony_ci continue; 18828c2ecf20Sopenharmony_ci data_count++; 18838c2ecf20Sopenharmony_ci } 18848c2ecf20Sopenharmony_ci 18858c2ecf20Sopenharmony_ci /* 18868c2ecf20Sopenharmony_ci * stripes that only have parity must have been flushed 18878c2ecf20Sopenharmony_ci * before the crash that we are now recovering from, so 18888c2ecf20Sopenharmony_ci * there is nothing more to recovery. 18898c2ecf20Sopenharmony_ci */ 18908c2ecf20Sopenharmony_ci if (data_count == 0) 18918c2ecf20Sopenharmony_ci goto out; 18928c2ecf20Sopenharmony_ci 18938c2ecf20Sopenharmony_ci for (disk_index = 0; disk_index < sh->disks; disk_index++) { 18948c2ecf20Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) 18958c2ecf20Sopenharmony_ci continue; 18968c2ecf20Sopenharmony_ci 18978c2ecf20Sopenharmony_ci /* in case device is broken */ 18988c2ecf20Sopenharmony_ci rcu_read_lock(); 18998c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->disks[disk_index].rdev); 19008c2ecf20Sopenharmony_ci if (rdev) { 19018c2ecf20Sopenharmony_ci atomic_inc(&rdev->nr_pending); 19028c2ecf20Sopenharmony_ci rcu_read_unlock(); 19038c2ecf20Sopenharmony_ci sync_page_io(rdev, sh->sector, PAGE_SIZE, 19048c2ecf20Sopenharmony_ci sh->dev[disk_index].page, REQ_OP_WRITE, 0, 19058c2ecf20Sopenharmony_ci false); 19068c2ecf20Sopenharmony_ci rdev_dec_pending(rdev, rdev->mddev); 19078c2ecf20Sopenharmony_ci rcu_read_lock(); 19088c2ecf20Sopenharmony_ci } 19098c2ecf20Sopenharmony_ci rrdev = rcu_dereference(conf->disks[disk_index].replacement); 19108c2ecf20Sopenharmony_ci if (rrdev) { 19118c2ecf20Sopenharmony_ci atomic_inc(&rrdev->nr_pending); 19128c2ecf20Sopenharmony_ci rcu_read_unlock(); 19138c2ecf20Sopenharmony_ci sync_page_io(rrdev, sh->sector, PAGE_SIZE, 19148c2ecf20Sopenharmony_ci sh->dev[disk_index].page, REQ_OP_WRITE, 0, 19158c2ecf20Sopenharmony_ci false); 19168c2ecf20Sopenharmony_ci rdev_dec_pending(rrdev, rrdev->mddev); 19178c2ecf20Sopenharmony_ci rcu_read_lock(); 19188c2ecf20Sopenharmony_ci } 19198c2ecf20Sopenharmony_ci rcu_read_unlock(); 19208c2ecf20Sopenharmony_ci } 19218c2ecf20Sopenharmony_ci ctx->data_parity_stripes++; 19228c2ecf20Sopenharmony_ciout: 19238c2ecf20Sopenharmony_ci r5l_recovery_reset_stripe(sh); 19248c2ecf20Sopenharmony_ci} 19258c2ecf20Sopenharmony_ci 19268c2ecf20Sopenharmony_cistatic struct stripe_head * 19278c2ecf20Sopenharmony_cir5c_recovery_alloc_stripe( 19288c2ecf20Sopenharmony_ci struct r5conf *conf, 19298c2ecf20Sopenharmony_ci sector_t stripe_sect, 19308c2ecf20Sopenharmony_ci int noblock) 19318c2ecf20Sopenharmony_ci{ 19328c2ecf20Sopenharmony_ci struct stripe_head *sh; 19338c2ecf20Sopenharmony_ci 19348c2ecf20Sopenharmony_ci sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); 19358c2ecf20Sopenharmony_ci if (!sh) 19368c2ecf20Sopenharmony_ci return NULL; /* no more stripe available */ 19378c2ecf20Sopenharmony_ci 19388c2ecf20Sopenharmony_ci r5l_recovery_reset_stripe(sh); 19398c2ecf20Sopenharmony_ci 19408c2ecf20Sopenharmony_ci return sh; 19418c2ecf20Sopenharmony_ci} 19428c2ecf20Sopenharmony_ci 19438c2ecf20Sopenharmony_cistatic struct stripe_head * 
19448c2ecf20Sopenharmony_cir5c_recovery_lookup_stripe(struct list_head *list, sector_t sect) 19458c2ecf20Sopenharmony_ci{ 19468c2ecf20Sopenharmony_ci struct stripe_head *sh; 19478c2ecf20Sopenharmony_ci 19488c2ecf20Sopenharmony_ci list_for_each_entry(sh, list, lru) 19498c2ecf20Sopenharmony_ci if (sh->sector == sect) 19508c2ecf20Sopenharmony_ci return sh; 19518c2ecf20Sopenharmony_ci return NULL; 19528c2ecf20Sopenharmony_ci} 19538c2ecf20Sopenharmony_ci 19548c2ecf20Sopenharmony_cistatic void 19558c2ecf20Sopenharmony_cir5c_recovery_drop_stripes(struct list_head *cached_stripe_list, 19568c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 19578c2ecf20Sopenharmony_ci{ 19588c2ecf20Sopenharmony_ci struct stripe_head *sh, *next; 19598c2ecf20Sopenharmony_ci 19608c2ecf20Sopenharmony_ci list_for_each_entry_safe(sh, next, cached_stripe_list, lru) { 19618c2ecf20Sopenharmony_ci r5l_recovery_reset_stripe(sh); 19628c2ecf20Sopenharmony_ci list_del_init(&sh->lru); 19638c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 19648c2ecf20Sopenharmony_ci } 19658c2ecf20Sopenharmony_ci} 19668c2ecf20Sopenharmony_ci 19678c2ecf20Sopenharmony_cistatic void 19688c2ecf20Sopenharmony_cir5c_recovery_replay_stripes(struct list_head *cached_stripe_list, 19698c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 19708c2ecf20Sopenharmony_ci{ 19718c2ecf20Sopenharmony_ci struct stripe_head *sh, *next; 19728c2ecf20Sopenharmony_ci 19738c2ecf20Sopenharmony_ci list_for_each_entry_safe(sh, next, cached_stripe_list, lru) 19748c2ecf20Sopenharmony_ci if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) { 19758c2ecf20Sopenharmony_ci r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx); 19768c2ecf20Sopenharmony_ci list_del_init(&sh->lru); 19778c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 19788c2ecf20Sopenharmony_ci } 19798c2ecf20Sopenharmony_ci} 19808c2ecf20Sopenharmony_ci 19818c2ecf20Sopenharmony_ci/* if matches return 0; otherwise return -EINVAL */ 19828c2ecf20Sopenharmony_cistatic int 19838c2ecf20Sopenharmony_cir5l_recovery_verify_data_checksum(struct r5l_log *log, 19848c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx, 19858c2ecf20Sopenharmony_ci struct page *page, 19868c2ecf20Sopenharmony_ci sector_t log_offset, __le32 log_checksum) 19878c2ecf20Sopenharmony_ci{ 19888c2ecf20Sopenharmony_ci void *addr; 19898c2ecf20Sopenharmony_ci u32 checksum; 19908c2ecf20Sopenharmony_ci 19918c2ecf20Sopenharmony_ci r5l_recovery_read_page(log, ctx, page, log_offset); 19928c2ecf20Sopenharmony_ci addr = kmap_atomic(page); 19938c2ecf20Sopenharmony_ci checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); 19948c2ecf20Sopenharmony_ci kunmap_atomic(addr); 19958c2ecf20Sopenharmony_ci return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; 19968c2ecf20Sopenharmony_ci} 19978c2ecf20Sopenharmony_ci 19988c2ecf20Sopenharmony_ci/* 19998c2ecf20Sopenharmony_ci * before loading data to stripe cache, we need verify checksum for all data, 20008c2ecf20Sopenharmony_ci * if there is mismatch for any data page, we drop all data in the mata block 20018c2ecf20Sopenharmony_ci */ 20028c2ecf20Sopenharmony_cistatic int 20038c2ecf20Sopenharmony_cir5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, 20048c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 20058c2ecf20Sopenharmony_ci{ 20068c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 20078c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 20088c2ecf20Sopenharmony_ci struct r5l_meta_block *mb = page_address(ctx->meta_page); 20098c2ecf20Sopenharmony_ci sector_t mb_offset = sizeof(struct r5l_meta_block); 20108c2ecf20Sopenharmony_ci sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 20118c2ecf20Sopenharmony_ci struct page *page; 20128c2ecf20Sopenharmony_ci struct r5l_payload_data_parity *payload; 20138c2ecf20Sopenharmony_ci struct r5l_payload_flush *payload_flush; 20148c2ecf20Sopenharmony_ci 20158c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 20168c2ecf20Sopenharmony_ci if (!page) 20178c2ecf20Sopenharmony_ci return -ENOMEM; 20188c2ecf20Sopenharmony_ci 20198c2ecf20Sopenharmony_ci while (mb_offset < le32_to_cpu(mb->meta_size)) { 20208c2ecf20Sopenharmony_ci payload = (void *)mb + mb_offset; 20218c2ecf20Sopenharmony_ci payload_flush = (void *)mb + mb_offset; 20228c2ecf20Sopenharmony_ci 20238c2ecf20Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 20248c2ecf20Sopenharmony_ci if (r5l_recovery_verify_data_checksum( 20258c2ecf20Sopenharmony_ci log, ctx, page, log_offset, 20268c2ecf20Sopenharmony_ci payload->checksum[0]) < 0) 20278c2ecf20Sopenharmony_ci goto mismatch; 20288c2ecf20Sopenharmony_ci } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { 20298c2ecf20Sopenharmony_ci if (r5l_recovery_verify_data_checksum( 20308c2ecf20Sopenharmony_ci log, ctx, page, log_offset, 20318c2ecf20Sopenharmony_ci payload->checksum[0]) < 0) 20328c2ecf20Sopenharmony_ci goto mismatch; 20338c2ecf20Sopenharmony_ci if (conf->max_degraded == 2 && /* q for RAID 6 */ 20348c2ecf20Sopenharmony_ci r5l_recovery_verify_data_checksum( 20358c2ecf20Sopenharmony_ci log, ctx, page, 20368c2ecf20Sopenharmony_ci r5l_ring_add(log, log_offset, 20378c2ecf20Sopenharmony_ci BLOCK_SECTORS), 20388c2ecf20Sopenharmony_ci payload->checksum[1]) < 0) 20398c2ecf20Sopenharmony_ci goto mismatch; 20408c2ecf20Sopenharmony_ci } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { 20418c2ecf20Sopenharmony_ci /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ 20428c2ecf20Sopenharmony_ci } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ 20438c2ecf20Sopenharmony_ci goto mismatch; 20448c2ecf20Sopenharmony_ci 20458c2ecf20Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { 20468c2ecf20Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_flush) + 20478c2ecf20Sopenharmony_ci le32_to_cpu(payload_flush->size); 20488c2ecf20Sopenharmony_ci } else { 20498c2ecf20Sopenharmony_ci /* DATA or PARITY payload */ 20508c2ecf20Sopenharmony_ci log_offset = r5l_ring_add(log, log_offset, 20518c2ecf20Sopenharmony_ci le32_to_cpu(payload->size)); 20528c2ecf20Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_data_parity) + 20538c2ecf20Sopenharmony_ci sizeof(__le32) * 20548c2ecf20Sopenharmony_ci (le32_to_cpu(payload->size) >> 
(PAGE_SHIFT - 9));
20558c2ecf20Sopenharmony_ci		}
20568c2ecf20Sopenharmony_ci
20578c2ecf20Sopenharmony_ci	}
20588c2ecf20Sopenharmony_ci
20598c2ecf20Sopenharmony_ci	put_page(page);
20608c2ecf20Sopenharmony_ci	return 0;
20618c2ecf20Sopenharmony_ci
20628c2ecf20Sopenharmony_cimismatch:
20638c2ecf20Sopenharmony_ci	put_page(page);
20648c2ecf20Sopenharmony_ci	return -EINVAL;
20658c2ecf20Sopenharmony_ci}
20668c2ecf20Sopenharmony_ci
20678c2ecf20Sopenharmony_ci/*
20688c2ecf20Sopenharmony_ci * Analyze all data/parity pages in one meta block
20698c2ecf20Sopenharmony_ci * Returns:
20708c2ecf20Sopenharmony_ci * 0 for success
20718c2ecf20Sopenharmony_ci * -EINVAL for unknown payload type
20728c2ecf20Sopenharmony_ci * -EAGAIN for checksum mismatch of data page
20738c2ecf20Sopenharmony_ci * -ENOMEM for running out of memory (alloc_page failed or we ran out of stripes)
20748c2ecf20Sopenharmony_ci */
20758c2ecf20Sopenharmony_cistatic int
20768c2ecf20Sopenharmony_cir5c_recovery_analyze_meta_block(struct r5l_log *log,
20778c2ecf20Sopenharmony_ci				struct r5l_recovery_ctx *ctx,
20788c2ecf20Sopenharmony_ci				struct list_head *cached_stripe_list)
20798c2ecf20Sopenharmony_ci{
20808c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
20818c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
20828c2ecf20Sopenharmony_ci	struct r5l_meta_block *mb;
20838c2ecf20Sopenharmony_ci	struct r5l_payload_data_parity *payload;
20848c2ecf20Sopenharmony_ci	struct r5l_payload_flush *payload_flush;
20858c2ecf20Sopenharmony_ci	int mb_offset;
20868c2ecf20Sopenharmony_ci	sector_t log_offset;
20878c2ecf20Sopenharmony_ci	sector_t stripe_sect;
20888c2ecf20Sopenharmony_ci	struct stripe_head *sh;
20898c2ecf20Sopenharmony_ci	int ret;
20908c2ecf20Sopenharmony_ci
20918c2ecf20Sopenharmony_ci	/*
20928c2ecf20Sopenharmony_ci	 * for a mismatch in data blocks, we will drop all data in this mb, but
20938c2ecf20Sopenharmony_ci	 * we will still read the next mb for other data with the FLUSH flag, as
20948c2ecf20Sopenharmony_ci	 * io_units could finish out of order.
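	 *
	 * The checksum helper below returns -EINVAL on a mismatch; this is
	 * translated into -EAGAIN so the caller knows it can keep scanning
	 * subsequent meta blocks.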
20958c2ecf20Sopenharmony_ci	 */
20968c2ecf20Sopenharmony_ci	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
20978c2ecf20Sopenharmony_ci	if (ret == -EINVAL)
20988c2ecf20Sopenharmony_ci		return -EAGAIN;
20998c2ecf20Sopenharmony_ci	else if (ret)
21008c2ecf20Sopenharmony_ci		return ret;	/* -ENOMEM due to alloc_page() failure */
21018c2ecf20Sopenharmony_ci
21028c2ecf20Sopenharmony_ci	mb = page_address(ctx->meta_page);
21038c2ecf20Sopenharmony_ci	mb_offset = sizeof(struct r5l_meta_block);
21048c2ecf20Sopenharmony_ci	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
21058c2ecf20Sopenharmony_ci
21068c2ecf20Sopenharmony_ci	while (mb_offset < le32_to_cpu(mb->meta_size)) {
21078c2ecf20Sopenharmony_ci		int dd;
21088c2ecf20Sopenharmony_ci
21098c2ecf20Sopenharmony_ci		payload = (void *)mb + mb_offset;
21108c2ecf20Sopenharmony_ci		payload_flush = (void *)mb + mb_offset;
21118c2ecf20Sopenharmony_ci
21128c2ecf20Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
21138c2ecf20Sopenharmony_ci			int i, count;
21148c2ecf20Sopenharmony_ci
21158c2ecf20Sopenharmony_ci			count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
21168c2ecf20Sopenharmony_ci			for (i = 0; i < count; ++i) {
21178c2ecf20Sopenharmony_ci				stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
21188c2ecf20Sopenharmony_ci				sh = r5c_recovery_lookup_stripe(cached_stripe_list,
21198c2ecf20Sopenharmony_ci								stripe_sect);
21208c2ecf20Sopenharmony_ci				if (sh) {
21218c2ecf20Sopenharmony_ci					WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
21228c2ecf20Sopenharmony_ci					r5l_recovery_reset_stripe(sh);
21238c2ecf20Sopenharmony_ci					list_del_init(&sh->lru);
21248c2ecf20Sopenharmony_ci					raid5_release_stripe(sh);
21258c2ecf20Sopenharmony_ci				}
21268c2ecf20Sopenharmony_ci			}
21278c2ecf20Sopenharmony_ci
21288c2ecf20Sopenharmony_ci			mb_offset += sizeof(struct r5l_payload_flush) +
21298c2ecf20Sopenharmony_ci				le32_to_cpu(payload_flush->size);
21308c2ecf20Sopenharmony_ci			continue;
21318c2ecf20Sopenharmony_ci		}
21328c2ecf20Sopenharmony_ci
21338c2ecf20Sopenharmony_ci		/* DATA or PARITY payload */
21348c2ecf20Sopenharmony_ci		stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
21358c2ecf20Sopenharmony_ci raid5_compute_sector( 21368c2ecf20Sopenharmony_ci conf, le64_to_cpu(payload->location), 0, &dd, 21378c2ecf20Sopenharmony_ci NULL) 21388c2ecf20Sopenharmony_ci : le64_to_cpu(payload->location); 21398c2ecf20Sopenharmony_ci 21408c2ecf20Sopenharmony_ci sh = r5c_recovery_lookup_stripe(cached_stripe_list, 21418c2ecf20Sopenharmony_ci stripe_sect); 21428c2ecf20Sopenharmony_ci 21438c2ecf20Sopenharmony_ci if (!sh) { 21448c2ecf20Sopenharmony_ci sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1); 21458c2ecf20Sopenharmony_ci /* 21468c2ecf20Sopenharmony_ci * cannot get stripe from raid5_get_active_stripe 21478c2ecf20Sopenharmony_ci * try replay some stripes 21488c2ecf20Sopenharmony_ci */ 21498c2ecf20Sopenharmony_ci if (!sh) { 21508c2ecf20Sopenharmony_ci r5c_recovery_replay_stripes( 21518c2ecf20Sopenharmony_ci cached_stripe_list, ctx); 21528c2ecf20Sopenharmony_ci sh = r5c_recovery_alloc_stripe( 21538c2ecf20Sopenharmony_ci conf, stripe_sect, 1); 21548c2ecf20Sopenharmony_ci } 21558c2ecf20Sopenharmony_ci if (!sh) { 21568c2ecf20Sopenharmony_ci int new_size = conf->min_nr_stripes * 2; 21578c2ecf20Sopenharmony_ci pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", 21588c2ecf20Sopenharmony_ci mdname(mddev), 21598c2ecf20Sopenharmony_ci new_size); 21608c2ecf20Sopenharmony_ci ret = raid5_set_cache_size(mddev, new_size); 21618c2ecf20Sopenharmony_ci if (conf->min_nr_stripes <= new_size / 2) { 21628c2ecf20Sopenharmony_ci pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n", 21638c2ecf20Sopenharmony_ci mdname(mddev), 21648c2ecf20Sopenharmony_ci ret, 21658c2ecf20Sopenharmony_ci new_size, 21668c2ecf20Sopenharmony_ci conf->min_nr_stripes, 21678c2ecf20Sopenharmony_ci conf->max_nr_stripes); 21688c2ecf20Sopenharmony_ci return -ENOMEM; 21698c2ecf20Sopenharmony_ci } 21708c2ecf20Sopenharmony_ci sh = r5c_recovery_alloc_stripe( 21718c2ecf20Sopenharmony_ci conf, stripe_sect, 0); 21728c2ecf20Sopenharmony_ci } 21738c2ecf20Sopenharmony_ci if (!sh) { 21748c2ecf20Sopenharmony_ci pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. 
Recovery failed.\n", 21758c2ecf20Sopenharmony_ci mdname(mddev)); 21768c2ecf20Sopenharmony_ci return -ENOMEM; 21778c2ecf20Sopenharmony_ci } 21788c2ecf20Sopenharmony_ci list_add_tail(&sh->lru, cached_stripe_list); 21798c2ecf20Sopenharmony_ci } 21808c2ecf20Sopenharmony_ci 21818c2ecf20Sopenharmony_ci if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { 21828c2ecf20Sopenharmony_ci if (!test_bit(STRIPE_R5C_CACHING, &sh->state) && 21838c2ecf20Sopenharmony_ci test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) { 21848c2ecf20Sopenharmony_ci r5l_recovery_replay_one_stripe(conf, sh, ctx); 21858c2ecf20Sopenharmony_ci list_move_tail(&sh->lru, cached_stripe_list); 21868c2ecf20Sopenharmony_ci } 21878c2ecf20Sopenharmony_ci r5l_recovery_load_data(log, sh, ctx, payload, 21888c2ecf20Sopenharmony_ci log_offset); 21898c2ecf20Sopenharmony_ci } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) 21908c2ecf20Sopenharmony_ci r5l_recovery_load_parity(log, sh, ctx, payload, 21918c2ecf20Sopenharmony_ci log_offset); 21928c2ecf20Sopenharmony_ci else 21938c2ecf20Sopenharmony_ci return -EINVAL; 21948c2ecf20Sopenharmony_ci 21958c2ecf20Sopenharmony_ci log_offset = r5l_ring_add(log, log_offset, 21968c2ecf20Sopenharmony_ci le32_to_cpu(payload->size)); 21978c2ecf20Sopenharmony_ci 21988c2ecf20Sopenharmony_ci mb_offset += sizeof(struct r5l_payload_data_parity) + 21998c2ecf20Sopenharmony_ci sizeof(__le32) * 22008c2ecf20Sopenharmony_ci (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); 22018c2ecf20Sopenharmony_ci } 22028c2ecf20Sopenharmony_ci 22038c2ecf20Sopenharmony_ci return 0; 22048c2ecf20Sopenharmony_ci} 22058c2ecf20Sopenharmony_ci 22068c2ecf20Sopenharmony_ci/* 22078c2ecf20Sopenharmony_ci * Load the stripe into cache. The stripe will be written out later by 22088c2ecf20Sopenharmony_ci * the stripe cache state machine. 22098c2ecf20Sopenharmony_ci */ 22108c2ecf20Sopenharmony_cistatic void r5c_recovery_load_one_stripe(struct r5l_log *log, 22118c2ecf20Sopenharmony_ci struct stripe_head *sh) 22128c2ecf20Sopenharmony_ci{ 22138c2ecf20Sopenharmony_ci struct r5dev *dev; 22148c2ecf20Sopenharmony_ci int i; 22158c2ecf20Sopenharmony_ci 22168c2ecf20Sopenharmony_ci for (i = sh->disks; i--; ) { 22178c2ecf20Sopenharmony_ci dev = sh->dev + i; 22188c2ecf20Sopenharmony_ci if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) { 22198c2ecf20Sopenharmony_ci set_bit(R5_InJournal, &dev->flags); 22208c2ecf20Sopenharmony_ci set_bit(R5_UPTODATE, &dev->flags); 22218c2ecf20Sopenharmony_ci } 22228c2ecf20Sopenharmony_ci } 22238c2ecf20Sopenharmony_ci} 22248c2ecf20Sopenharmony_ci 22258c2ecf20Sopenharmony_ci/* 22268c2ecf20Sopenharmony_ci * Scan through the log for all to-be-flushed data 22278c2ecf20Sopenharmony_ci * 22288c2ecf20Sopenharmony_ci * For stripes with data and parity, namely Data-Parity stripe 22298c2ecf20Sopenharmony_ci * (STRIPE_R5C_CACHING == 0), we simply replay all the writes. 22308c2ecf20Sopenharmony_ci * 22318c2ecf20Sopenharmony_ci * For stripes with only data, namely Data-Only stripe 22328c2ecf20Sopenharmony_ci * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine. 22338c2ecf20Sopenharmony_ci * 22348c2ecf20Sopenharmony_ci * For a stripe, if we see data after parity, we should discard all previous 22358c2ecf20Sopenharmony_ci * data and parity for this stripe, as these data are already flushed to 22368c2ecf20Sopenharmony_ci * the array. 
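 * For example, if the journal holds data D1, parity P1 and then data D2 for
 * the same stripe, the earlier D1/P1 are treated as already written to the
 * array by the time D2 was logged, so they are discarded and only D2 is
 * kept as cached data.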
22378c2ecf20Sopenharmony_ci *
22388c2ecf20Sopenharmony_ci * At the end of the scan, we return the new journal_tail, which points to
22398c2ecf20Sopenharmony_ci * first data-only stripe on the journal device, or next invalid meta block.
22408c2ecf20Sopenharmony_ci */
22418c2ecf20Sopenharmony_cistatic int r5c_recovery_flush_log(struct r5l_log *log,
22428c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx)
22438c2ecf20Sopenharmony_ci{
22448c2ecf20Sopenharmony_ci struct stripe_head *sh;
22458c2ecf20Sopenharmony_ci int ret = 0;
22468c2ecf20Sopenharmony_ci
22478c2ecf20Sopenharmony_ci /* scan through the log */
22488c2ecf20Sopenharmony_ci while (1) {
22498c2ecf20Sopenharmony_ci if (r5l_recovery_read_meta_block(log, ctx))
22508c2ecf20Sopenharmony_ci break;
22518c2ecf20Sopenharmony_ci
22528c2ecf20Sopenharmony_ci ret = r5c_recovery_analyze_meta_block(log, ctx,
22538c2ecf20Sopenharmony_ci &ctx->cached_list);
22548c2ecf20Sopenharmony_ci /*
22558c2ecf20Sopenharmony_ci * -EAGAIN means mismatch in data block, in this case, we still
22568c2ecf20Sopenharmony_ci * try scanning the next meta block
22578c2ecf20Sopenharmony_ci */
22588c2ecf20Sopenharmony_ci if (ret && ret != -EAGAIN)
22598c2ecf20Sopenharmony_ci break; /* ret == -EINVAL or -ENOMEM */
22608c2ecf20Sopenharmony_ci ctx->seq++;
22618c2ecf20Sopenharmony_ci ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
22628c2ecf20Sopenharmony_ci }
22638c2ecf20Sopenharmony_ci
22648c2ecf20Sopenharmony_ci if (ret == -ENOMEM) {
22658c2ecf20Sopenharmony_ci r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
22668c2ecf20Sopenharmony_ci return ret;
22678c2ecf20Sopenharmony_ci }
22688c2ecf20Sopenharmony_ci
22698c2ecf20Sopenharmony_ci /* replay data-parity stripes */
22708c2ecf20Sopenharmony_ci r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
22718c2ecf20Sopenharmony_ci
22728c2ecf20Sopenharmony_ci /* load data-only stripes to stripe cache */
22738c2ecf20Sopenharmony_ci list_for_each_entry(sh, &ctx->cached_list, lru) {
22748c2ecf20Sopenharmony_ci WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
22758c2ecf20Sopenharmony_ci r5c_recovery_load_one_stripe(log, sh);
22768c2ecf20Sopenharmony_ci ctx->data_only_stripes++;
22778c2ecf20Sopenharmony_ci }
22788c2ecf20Sopenharmony_ci
22798c2ecf20Sopenharmony_ci return 0;
22808c2ecf20Sopenharmony_ci}
22818c2ecf20Sopenharmony_ci
22828c2ecf20Sopenharmony_ci/*
22838c2ecf20Sopenharmony_ci * we did a recovery. Now ctx.pos points to an invalid meta block. New
22848c2ecf20Sopenharmony_ci * log will start here. But we can't let the superblock point to the last
22858c2ecf20Sopenharmony_ci * valid meta block. The log might look like:
22868c2ecf20Sopenharmony_ci * | meta 1| meta 2| meta 3|
22878c2ecf20Sopenharmony_ci * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
22888c2ecf20Sopenharmony_ci * superblock points to meta 1, we write a new valid meta 2n. If a crash
22898c2ecf20Sopenharmony_ci * happens again, new recovery will start from meta 1. Since meta 2n is
22908c2ecf20Sopenharmony_ci * valid now, recovery will think meta 3 is valid, which is wrong.
22918c2ecf20Sopenharmony_ci * The solution is we create a new meta in meta2 with its seq == meta
22928c2ecf20Sopenharmony_ci * 1's seq + 10000 and let the superblock point to meta2. The same recovery
22938c2ecf20Sopenharmony_ci * will not think meta 3 is a valid meta, because its seq doesn't match
22948c2ecf20Sopenharmony_ci */
22958c2ecf20Sopenharmony_ci
22968c2ecf20Sopenharmony_ci/*
22978c2ecf20Sopenharmony_ci * Before recovery, the log looks like the following
22988c2ecf20Sopenharmony_ci *
22998c2ecf20Sopenharmony_ci * ---------------------------------------------
23008c2ecf20Sopenharmony_ci * | valid log | invalid log |
23018c2ecf20Sopenharmony_ci * ---------------------------------------------
23028c2ecf20Sopenharmony_ci * ^
23038c2ecf20Sopenharmony_ci * |- log->last_checkpoint
23048c2ecf20Sopenharmony_ci * |- log->last_cp_seq
23058c2ecf20Sopenharmony_ci *
23068c2ecf20Sopenharmony_ci * Now we scan through the log until we see invalid entry
23078c2ecf20Sopenharmony_ci *
23088c2ecf20Sopenharmony_ci * ---------------------------------------------
23098c2ecf20Sopenharmony_ci * | valid log | invalid log |
23108c2ecf20Sopenharmony_ci * ---------------------------------------------
23118c2ecf20Sopenharmony_ci * ^ ^
23128c2ecf20Sopenharmony_ci * |- log->last_checkpoint |- ctx->pos
23138c2ecf20Sopenharmony_ci * |- log->last_cp_seq |- ctx->seq
23148c2ecf20Sopenharmony_ci *
23158c2ecf20Sopenharmony_ci * From this point, we need to increase seq number by 10000 to avoid
23168c2ecf20Sopenharmony_ci * confusing next recovery.
23178c2ecf20Sopenharmony_ci *
23188c2ecf20Sopenharmony_ci * ---------------------------------------------
23198c2ecf20Sopenharmony_ci * | valid log | invalid log |
23208c2ecf20Sopenharmony_ci * ---------------------------------------------
23218c2ecf20Sopenharmony_ci * ^ ^
23228c2ecf20Sopenharmony_ci * |- log->last_checkpoint |- ctx->pos+1
23238c2ecf20Sopenharmony_ci * |- log->last_cp_seq |- ctx->seq+10001
23248c2ecf20Sopenharmony_ci *
23258c2ecf20Sopenharmony_ci * However, it is not safe to start the state machine yet, because data only
23268c2ecf20Sopenharmony_ci * parities are not yet secured in RAID. To save these data only parities, we
23278c2ecf20Sopenharmony_ci * rewrite them from seq+10001.
23288c2ecf20Sopenharmony_ci *
23298c2ecf20Sopenharmony_ci * -----------------------------------------------------------------
23308c2ecf20Sopenharmony_ci * | valid log | data only stripes | invalid log |
23318c2ecf20Sopenharmony_ci * -----------------------------------------------------------------
23328c2ecf20Sopenharmony_ci * ^ ^
23338c2ecf20Sopenharmony_ci * |- log->last_checkpoint |- ctx->pos+n
23348c2ecf20Sopenharmony_ci * |- log->last_cp_seq |- ctx->seq+10000+n
23358c2ecf20Sopenharmony_ci *
23368c2ecf20Sopenharmony_ci * If failure happens again during this process, the recovery can safely start
23378c2ecf20Sopenharmony_ci * again from log->last_checkpoint.
23388c2ecf20Sopenharmony_ci *
23398c2ecf20Sopenharmony_ci * Once data only stripes are rewritten to journal, we move log_tail
23408c2ecf20Sopenharmony_ci *
23418c2ecf20Sopenharmony_ci * -----------------------------------------------------------------
23428c2ecf20Sopenharmony_ci * | old log | data only stripes | invalid log |
23438c2ecf20Sopenharmony_ci * -----------------------------------------------------------------
23448c2ecf20Sopenharmony_ci * ^ ^
23458c2ecf20Sopenharmony_ci * |- log->last_checkpoint |- ctx->pos+n
23468c2ecf20Sopenharmony_ci * |- log->last_cp_seq |- ctx->seq+10000+n
23478c2ecf20Sopenharmony_ci *
23488c2ecf20Sopenharmony_ci * Then we can safely start the state machine. If failure happens from this
23498c2ecf20Sopenharmony_ci * point on, the recovery will start from new log->last_checkpoint.
23508c2ecf20Sopenharmony_ci */ 23518c2ecf20Sopenharmony_cistatic int 23528c2ecf20Sopenharmony_cir5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 23538c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 23548c2ecf20Sopenharmony_ci{ 23558c2ecf20Sopenharmony_ci struct stripe_head *sh; 23568c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 23578c2ecf20Sopenharmony_ci struct page *page; 23588c2ecf20Sopenharmony_ci sector_t next_checkpoint = MaxSector; 23598c2ecf20Sopenharmony_ci 23608c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 23618c2ecf20Sopenharmony_ci if (!page) { 23628c2ecf20Sopenharmony_ci pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", 23638c2ecf20Sopenharmony_ci mdname(mddev)); 23648c2ecf20Sopenharmony_ci return -ENOMEM; 23658c2ecf20Sopenharmony_ci } 23668c2ecf20Sopenharmony_ci 23678c2ecf20Sopenharmony_ci WARN_ON(list_empty(&ctx->cached_list)); 23688c2ecf20Sopenharmony_ci 23698c2ecf20Sopenharmony_ci list_for_each_entry(sh, &ctx->cached_list, lru) { 23708c2ecf20Sopenharmony_ci struct r5l_meta_block *mb; 23718c2ecf20Sopenharmony_ci int i; 23728c2ecf20Sopenharmony_ci int offset; 23738c2ecf20Sopenharmony_ci sector_t write_pos; 23748c2ecf20Sopenharmony_ci 23758c2ecf20Sopenharmony_ci WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); 23768c2ecf20Sopenharmony_ci r5l_recovery_create_empty_meta_block(log, page, 23778c2ecf20Sopenharmony_ci ctx->pos, ctx->seq); 23788c2ecf20Sopenharmony_ci mb = page_address(page); 23798c2ecf20Sopenharmony_ci offset = le32_to_cpu(mb->meta_size); 23808c2ecf20Sopenharmony_ci write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 23818c2ecf20Sopenharmony_ci 23828c2ecf20Sopenharmony_ci for (i = sh->disks; i--; ) { 23838c2ecf20Sopenharmony_ci struct r5dev *dev = &sh->dev[i]; 23848c2ecf20Sopenharmony_ci struct r5l_payload_data_parity *payload; 23858c2ecf20Sopenharmony_ci void *addr; 23868c2ecf20Sopenharmony_ci 23878c2ecf20Sopenharmony_ci if (test_bit(R5_InJournal, &dev->flags)) { 23888c2ecf20Sopenharmony_ci payload = (void *)mb + offset; 23898c2ecf20Sopenharmony_ci payload->header.type = cpu_to_le16( 23908c2ecf20Sopenharmony_ci R5LOG_PAYLOAD_DATA); 23918c2ecf20Sopenharmony_ci payload->size = cpu_to_le32(BLOCK_SECTORS); 23928c2ecf20Sopenharmony_ci payload->location = cpu_to_le64( 23938c2ecf20Sopenharmony_ci raid5_compute_blocknr(sh, i, 0)); 23948c2ecf20Sopenharmony_ci addr = kmap_atomic(dev->page); 23958c2ecf20Sopenharmony_ci payload->checksum[0] = cpu_to_le32( 23968c2ecf20Sopenharmony_ci crc32c_le(log->uuid_checksum, addr, 23978c2ecf20Sopenharmony_ci PAGE_SIZE)); 23988c2ecf20Sopenharmony_ci kunmap_atomic(addr); 23998c2ecf20Sopenharmony_ci sync_page_io(log->rdev, write_pos, PAGE_SIZE, 24008c2ecf20Sopenharmony_ci dev->page, REQ_OP_WRITE, 0, false); 24018c2ecf20Sopenharmony_ci write_pos = r5l_ring_add(log, write_pos, 24028c2ecf20Sopenharmony_ci BLOCK_SECTORS); 24038c2ecf20Sopenharmony_ci offset += sizeof(__le32) + 24048c2ecf20Sopenharmony_ci sizeof(struct r5l_payload_data_parity); 24058c2ecf20Sopenharmony_ci 24068c2ecf20Sopenharmony_ci } 24078c2ecf20Sopenharmony_ci } 24088c2ecf20Sopenharmony_ci mb->meta_size = cpu_to_le32(offset); 24098c2ecf20Sopenharmony_ci mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, 24108c2ecf20Sopenharmony_ci mb, PAGE_SIZE)); 24118c2ecf20Sopenharmony_ci sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, 24128c2ecf20Sopenharmony_ci REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false); 24138c2ecf20Sopenharmony_ci sh->log_start = ctx->pos; 24148c2ecf20Sopenharmony_ci list_add_tail(&sh->r5c, 
&log->stripe_in_journal_list); 24158c2ecf20Sopenharmony_ci atomic_inc(&log->stripe_in_journal_count); 24168c2ecf20Sopenharmony_ci ctx->pos = write_pos; 24178c2ecf20Sopenharmony_ci ctx->seq += 1; 24188c2ecf20Sopenharmony_ci next_checkpoint = sh->log_start; 24198c2ecf20Sopenharmony_ci } 24208c2ecf20Sopenharmony_ci log->next_checkpoint = next_checkpoint; 24218c2ecf20Sopenharmony_ci __free_page(page); 24228c2ecf20Sopenharmony_ci return 0; 24238c2ecf20Sopenharmony_ci} 24248c2ecf20Sopenharmony_ci 24258c2ecf20Sopenharmony_cistatic void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, 24268c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx) 24278c2ecf20Sopenharmony_ci{ 24288c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 24298c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 24308c2ecf20Sopenharmony_ci struct stripe_head *sh, *next; 24318c2ecf20Sopenharmony_ci bool cleared_pending = false; 24328c2ecf20Sopenharmony_ci 24338c2ecf20Sopenharmony_ci if (ctx->data_only_stripes == 0) 24348c2ecf20Sopenharmony_ci return; 24358c2ecf20Sopenharmony_ci 24368c2ecf20Sopenharmony_ci if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { 24378c2ecf20Sopenharmony_ci cleared_pending = true; 24388c2ecf20Sopenharmony_ci clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 24398c2ecf20Sopenharmony_ci } 24408c2ecf20Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; 24418c2ecf20Sopenharmony_ci 24428c2ecf20Sopenharmony_ci list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 24438c2ecf20Sopenharmony_ci r5c_make_stripe_write_out(sh); 24448c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 24458c2ecf20Sopenharmony_ci list_del_init(&sh->lru); 24468c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 24478c2ecf20Sopenharmony_ci } 24488c2ecf20Sopenharmony_ci 24498c2ecf20Sopenharmony_ci /* reuse conf->wait_for_quiescent in recovery */ 24508c2ecf20Sopenharmony_ci wait_event(conf->wait_for_quiescent, 24518c2ecf20Sopenharmony_ci atomic_read(&conf->active_stripes) == 0); 24528c2ecf20Sopenharmony_ci 24538c2ecf20Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 24548c2ecf20Sopenharmony_ci if (cleared_pending) 24558c2ecf20Sopenharmony_ci set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags); 24568c2ecf20Sopenharmony_ci} 24578c2ecf20Sopenharmony_ci 24588c2ecf20Sopenharmony_cistatic int r5l_recovery_log(struct r5l_log *log) 24598c2ecf20Sopenharmony_ci{ 24608c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 24618c2ecf20Sopenharmony_ci struct r5l_recovery_ctx *ctx; 24628c2ecf20Sopenharmony_ci int ret; 24638c2ecf20Sopenharmony_ci sector_t pos; 24648c2ecf20Sopenharmony_ci 24658c2ecf20Sopenharmony_ci ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 24668c2ecf20Sopenharmony_ci if (!ctx) 24678c2ecf20Sopenharmony_ci return -ENOMEM; 24688c2ecf20Sopenharmony_ci 24698c2ecf20Sopenharmony_ci ctx->pos = log->last_checkpoint; 24708c2ecf20Sopenharmony_ci ctx->seq = log->last_cp_seq; 24718c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&ctx->cached_list); 24728c2ecf20Sopenharmony_ci ctx->meta_page = alloc_page(GFP_KERNEL); 24738c2ecf20Sopenharmony_ci 24748c2ecf20Sopenharmony_ci if (!ctx->meta_page) { 24758c2ecf20Sopenharmony_ci ret = -ENOMEM; 24768c2ecf20Sopenharmony_ci goto meta_page; 24778c2ecf20Sopenharmony_ci } 24788c2ecf20Sopenharmony_ci 24798c2ecf20Sopenharmony_ci if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) { 24808c2ecf20Sopenharmony_ci ret = -ENOMEM; 24818c2ecf20Sopenharmony_ci goto ra_pool; 24828c2ecf20Sopenharmony_ci } 24838c2ecf20Sopenharmony_ci 
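 /*
  * Scan the whole journal: data-parity stripes are replayed to the RAID
  * disks, while data-only stripes stay on ctx->cached_list so they can
  * be rewritten to the journal and flushed out below.
  */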
24848c2ecf20Sopenharmony_ci ret = r5c_recovery_flush_log(log, ctx); 24858c2ecf20Sopenharmony_ci 24868c2ecf20Sopenharmony_ci if (ret) 24878c2ecf20Sopenharmony_ci goto error; 24888c2ecf20Sopenharmony_ci 24898c2ecf20Sopenharmony_ci pos = ctx->pos; 24908c2ecf20Sopenharmony_ci ctx->seq += 10000; 24918c2ecf20Sopenharmony_ci 24928c2ecf20Sopenharmony_ci if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) 24938c2ecf20Sopenharmony_ci pr_info("md/raid:%s: starting from clean shutdown\n", 24948c2ecf20Sopenharmony_ci mdname(mddev)); 24958c2ecf20Sopenharmony_ci else 24968c2ecf20Sopenharmony_ci pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 24978c2ecf20Sopenharmony_ci mdname(mddev), ctx->data_only_stripes, 24988c2ecf20Sopenharmony_ci ctx->data_parity_stripes); 24998c2ecf20Sopenharmony_ci 25008c2ecf20Sopenharmony_ci if (ctx->data_only_stripes == 0) { 25018c2ecf20Sopenharmony_ci log->next_checkpoint = ctx->pos; 25028c2ecf20Sopenharmony_ci r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++); 25038c2ecf20Sopenharmony_ci ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); 25048c2ecf20Sopenharmony_ci } else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) { 25058c2ecf20Sopenharmony_ci pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 25068c2ecf20Sopenharmony_ci mdname(mddev)); 25078c2ecf20Sopenharmony_ci ret = -EIO; 25088c2ecf20Sopenharmony_ci goto error; 25098c2ecf20Sopenharmony_ci } 25108c2ecf20Sopenharmony_ci 25118c2ecf20Sopenharmony_ci log->log_start = ctx->pos; 25128c2ecf20Sopenharmony_ci log->seq = ctx->seq; 25138c2ecf20Sopenharmony_ci log->last_checkpoint = pos; 25148c2ecf20Sopenharmony_ci r5l_write_super(log, pos); 25158c2ecf20Sopenharmony_ci 25168c2ecf20Sopenharmony_ci r5c_recovery_flush_data_only_stripes(log, ctx); 25178c2ecf20Sopenharmony_ci ret = 0; 25188c2ecf20Sopenharmony_cierror: 25198c2ecf20Sopenharmony_ci r5l_recovery_free_ra_pool(log, ctx); 25208c2ecf20Sopenharmony_cira_pool: 25218c2ecf20Sopenharmony_ci __free_page(ctx->meta_page); 25228c2ecf20Sopenharmony_cimeta_page: 25238c2ecf20Sopenharmony_ci kfree(ctx); 25248c2ecf20Sopenharmony_ci return ret; 25258c2ecf20Sopenharmony_ci} 25268c2ecf20Sopenharmony_ci 25278c2ecf20Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp) 25288c2ecf20Sopenharmony_ci{ 25298c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 25308c2ecf20Sopenharmony_ci 25318c2ecf20Sopenharmony_ci log->rdev->journal_tail = cp; 25328c2ecf20Sopenharmony_ci set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); 25338c2ecf20Sopenharmony_ci} 25348c2ecf20Sopenharmony_ci 25358c2ecf20Sopenharmony_cistatic ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) 25368c2ecf20Sopenharmony_ci{ 25378c2ecf20Sopenharmony_ci struct r5conf *conf; 25388c2ecf20Sopenharmony_ci int ret; 25398c2ecf20Sopenharmony_ci 25408c2ecf20Sopenharmony_ci spin_lock(&mddev->lock); 25418c2ecf20Sopenharmony_ci conf = mddev->private; 25428c2ecf20Sopenharmony_ci if (!conf || !conf->log) { 25438c2ecf20Sopenharmony_ci spin_unlock(&mddev->lock); 25448c2ecf20Sopenharmony_ci return 0; 25458c2ecf20Sopenharmony_ci } 25468c2ecf20Sopenharmony_ci 25478c2ecf20Sopenharmony_ci switch (conf->log->r5c_journal_mode) { 25488c2ecf20Sopenharmony_ci case R5C_JOURNAL_MODE_WRITE_THROUGH: 25498c2ecf20Sopenharmony_ci ret = snprintf( 25508c2ecf20Sopenharmony_ci page, PAGE_SIZE, "[%s] %s\n", 25518c2ecf20Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 25528c2ecf20Sopenharmony_ci 
r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 25538c2ecf20Sopenharmony_ci break; 25548c2ecf20Sopenharmony_ci case R5C_JOURNAL_MODE_WRITE_BACK: 25558c2ecf20Sopenharmony_ci ret = snprintf( 25568c2ecf20Sopenharmony_ci page, PAGE_SIZE, "%s [%s]\n", 25578c2ecf20Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH], 25588c2ecf20Sopenharmony_ci r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]); 25598c2ecf20Sopenharmony_ci break; 25608c2ecf20Sopenharmony_ci default: 25618c2ecf20Sopenharmony_ci ret = 0; 25628c2ecf20Sopenharmony_ci } 25638c2ecf20Sopenharmony_ci spin_unlock(&mddev->lock); 25648c2ecf20Sopenharmony_ci return ret; 25658c2ecf20Sopenharmony_ci} 25668c2ecf20Sopenharmony_ci 25678c2ecf20Sopenharmony_ci/* 25688c2ecf20Sopenharmony_ci * Set journal cache mode on @mddev (external API initially needed by dm-raid). 25698c2ecf20Sopenharmony_ci * 25708c2ecf20Sopenharmony_ci * @mode as defined in 'enum r5c_journal_mode'. 25718c2ecf20Sopenharmony_ci * 25728c2ecf20Sopenharmony_ci */ 25738c2ecf20Sopenharmony_ciint r5c_journal_mode_set(struct mddev *mddev, int mode) 25748c2ecf20Sopenharmony_ci{ 25758c2ecf20Sopenharmony_ci struct r5conf *conf; 25768c2ecf20Sopenharmony_ci 25778c2ecf20Sopenharmony_ci if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || 25788c2ecf20Sopenharmony_ci mode > R5C_JOURNAL_MODE_WRITE_BACK) 25798c2ecf20Sopenharmony_ci return -EINVAL; 25808c2ecf20Sopenharmony_ci 25818c2ecf20Sopenharmony_ci conf = mddev->private; 25828c2ecf20Sopenharmony_ci if (!conf || !conf->log) 25838c2ecf20Sopenharmony_ci return -ENODEV; 25848c2ecf20Sopenharmony_ci 25858c2ecf20Sopenharmony_ci if (raid5_calc_degraded(conf) > 0 && 25868c2ecf20Sopenharmony_ci mode == R5C_JOURNAL_MODE_WRITE_BACK) 25878c2ecf20Sopenharmony_ci return -EINVAL; 25888c2ecf20Sopenharmony_ci 25898c2ecf20Sopenharmony_ci mddev_suspend(mddev); 25908c2ecf20Sopenharmony_ci conf->log->r5c_journal_mode = mode; 25918c2ecf20Sopenharmony_ci mddev_resume(mddev); 25928c2ecf20Sopenharmony_ci 25938c2ecf20Sopenharmony_ci pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", 25948c2ecf20Sopenharmony_ci mdname(mddev), mode, r5c_journal_mode_str[mode]); 25958c2ecf20Sopenharmony_ci return 0; 25968c2ecf20Sopenharmony_ci} 25978c2ecf20Sopenharmony_ciEXPORT_SYMBOL(r5c_journal_mode_set); 25988c2ecf20Sopenharmony_ci 25998c2ecf20Sopenharmony_cistatic ssize_t r5c_journal_mode_store(struct mddev *mddev, 26008c2ecf20Sopenharmony_ci const char *page, size_t length) 26018c2ecf20Sopenharmony_ci{ 26028c2ecf20Sopenharmony_ci int mode = ARRAY_SIZE(r5c_journal_mode_str); 26038c2ecf20Sopenharmony_ci size_t len = length; 26048c2ecf20Sopenharmony_ci int ret; 26058c2ecf20Sopenharmony_ci 26068c2ecf20Sopenharmony_ci if (len < 2) 26078c2ecf20Sopenharmony_ci return -EINVAL; 26088c2ecf20Sopenharmony_ci 26098c2ecf20Sopenharmony_ci if (page[len - 1] == '\n') 26108c2ecf20Sopenharmony_ci len--; 26118c2ecf20Sopenharmony_ci 26128c2ecf20Sopenharmony_ci while (mode--) 26138c2ecf20Sopenharmony_ci if (strlen(r5c_journal_mode_str[mode]) == len && 26148c2ecf20Sopenharmony_ci !strncmp(page, r5c_journal_mode_str[mode], len)) 26158c2ecf20Sopenharmony_ci break; 26168c2ecf20Sopenharmony_ci ret = mddev_lock(mddev); 26178c2ecf20Sopenharmony_ci if (ret) 26188c2ecf20Sopenharmony_ci return ret; 26198c2ecf20Sopenharmony_ci ret = r5c_journal_mode_set(mddev, mode); 26208c2ecf20Sopenharmony_ci mddev_unlock(mddev); 26218c2ecf20Sopenharmony_ci return ret ?: length; 26228c2ecf20Sopenharmony_ci} 26238c2ecf20Sopenharmony_ci 26248c2ecf20Sopenharmony_cistruct md_sysfs_entry 
26258c2ecf20Sopenharmony_cir5c_journal_mode = __ATTR(journal_mode, 0644,
26268c2ecf20Sopenharmony_ci r5c_journal_mode_show, r5c_journal_mode_store);
26278c2ecf20Sopenharmony_ci
26288c2ecf20Sopenharmony_ci/*
26298c2ecf20Sopenharmony_ci * Try to handle a write operation in the caching phase. This function should
26308c2ecf20Sopenharmony_ci * only be called in write-back mode.
26318c2ecf20Sopenharmony_ci *
26328c2ecf20Sopenharmony_ci * If all outstanding writes can be handled in the caching phase, returns 0.
26338c2ecf20Sopenharmony_ci * If the writes require the write-out phase, calls r5c_make_stripe_write_out()
26348c2ecf20Sopenharmony_ci * and returns -EAGAIN.
26358c2ecf20Sopenharmony_ci */
26368c2ecf20Sopenharmony_ciint r5c_try_caching_write(struct r5conf *conf,
26378c2ecf20Sopenharmony_ci struct stripe_head *sh,
26388c2ecf20Sopenharmony_ci struct stripe_head_state *s,
26398c2ecf20Sopenharmony_ci int disks)
26408c2ecf20Sopenharmony_ci{
26418c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log;
26428c2ecf20Sopenharmony_ci int i;
26438c2ecf20Sopenharmony_ci struct r5dev *dev;
26448c2ecf20Sopenharmony_ci int to_cache = 0;
26458c2ecf20Sopenharmony_ci void **pslot;
26468c2ecf20Sopenharmony_ci sector_t tree_index;
26478c2ecf20Sopenharmony_ci int ret;
26488c2ecf20Sopenharmony_ci uintptr_t refcount;
26498c2ecf20Sopenharmony_ci
26508c2ecf20Sopenharmony_ci BUG_ON(!r5c_is_writeback(log));
26518c2ecf20Sopenharmony_ci
26528c2ecf20Sopenharmony_ci if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
26538c2ecf20Sopenharmony_ci /*
26548c2ecf20Sopenharmony_ci * There are two different scenarios here:
26558c2ecf20Sopenharmony_ci * 1. The stripe has some data cached, and it is sent to
26568c2ecf20Sopenharmony_ci * write-out phase for reclaim
26578c2ecf20Sopenharmony_ci * 2. The stripe is clean, and this is the first write
26588c2ecf20Sopenharmony_ci *
26598c2ecf20Sopenharmony_ci * For 1, return -EAGAIN, so we continue with
26608c2ecf20Sopenharmony_ci * handle_stripe_dirtying().
26618c2ecf20Sopenharmony_ci *
26628c2ecf20Sopenharmony_ci * For 2, set STRIPE_R5C_CACHING and continue with caching
26638c2ecf20Sopenharmony_ci * write.
26648c2ecf20Sopenharmony_ci */
26658c2ecf20Sopenharmony_ci
26668c2ecf20Sopenharmony_ci /* case 1: anything in journal or anything written */
26678c2ecf20Sopenharmony_ci if (s->injournal > 0 || s->written > 0)
26688c2ecf20Sopenharmony_ci return -EAGAIN;
26698c2ecf20Sopenharmony_ci /* case 2 */
26708c2ecf20Sopenharmony_ci set_bit(STRIPE_R5C_CACHING, &sh->state);
26718c2ecf20Sopenharmony_ci }
26728c2ecf20Sopenharmony_ci
26738c2ecf20Sopenharmony_ci /*
26748c2ecf20Sopenharmony_ci * When run in degraded mode, the array is set to write-through mode.
26758c2ecf20Sopenharmony_ci * This check helps drain pending writes safely in the transition to
26768c2ecf20Sopenharmony_ci * write-through mode.
26778c2ecf20Sopenharmony_ci *
26788c2ecf20Sopenharmony_ci * When a stripe is syncing, the write is also handled in write
26798c2ecf20Sopenharmony_ci * through mode.
26808c2ecf20Sopenharmony_ci */ 26818c2ecf20Sopenharmony_ci if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) { 26828c2ecf20Sopenharmony_ci r5c_make_stripe_write_out(sh); 26838c2ecf20Sopenharmony_ci return -EAGAIN; 26848c2ecf20Sopenharmony_ci } 26858c2ecf20Sopenharmony_ci 26868c2ecf20Sopenharmony_ci for (i = disks; i--; ) { 26878c2ecf20Sopenharmony_ci dev = &sh->dev[i]; 26888c2ecf20Sopenharmony_ci /* if non-overwrite, use writing-out phase */ 26898c2ecf20Sopenharmony_ci if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) && 26908c2ecf20Sopenharmony_ci !test_bit(R5_InJournal, &dev->flags)) { 26918c2ecf20Sopenharmony_ci r5c_make_stripe_write_out(sh); 26928c2ecf20Sopenharmony_ci return -EAGAIN; 26938c2ecf20Sopenharmony_ci } 26948c2ecf20Sopenharmony_ci } 26958c2ecf20Sopenharmony_ci 26968c2ecf20Sopenharmony_ci /* if the stripe is not counted in big_stripe_tree, add it now */ 26978c2ecf20Sopenharmony_ci if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && 26988c2ecf20Sopenharmony_ci !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 26998c2ecf20Sopenharmony_ci tree_index = r5c_tree_index(conf, sh->sector); 27008c2ecf20Sopenharmony_ci spin_lock(&log->tree_lock); 27018c2ecf20Sopenharmony_ci pslot = radix_tree_lookup_slot(&log->big_stripe_tree, 27028c2ecf20Sopenharmony_ci tree_index); 27038c2ecf20Sopenharmony_ci if (pslot) { 27048c2ecf20Sopenharmony_ci refcount = (uintptr_t)radix_tree_deref_slot_protected( 27058c2ecf20Sopenharmony_ci pslot, &log->tree_lock) >> 27068c2ecf20Sopenharmony_ci R5C_RADIX_COUNT_SHIFT; 27078c2ecf20Sopenharmony_ci radix_tree_replace_slot( 27088c2ecf20Sopenharmony_ci &log->big_stripe_tree, pslot, 27098c2ecf20Sopenharmony_ci (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); 27108c2ecf20Sopenharmony_ci } else { 27118c2ecf20Sopenharmony_ci /* 27128c2ecf20Sopenharmony_ci * this radix_tree_insert can fail safely, so no 27138c2ecf20Sopenharmony_ci * need to call radix_tree_preload() 27148c2ecf20Sopenharmony_ci */ 27158c2ecf20Sopenharmony_ci ret = radix_tree_insert( 27168c2ecf20Sopenharmony_ci &log->big_stripe_tree, tree_index, 27178c2ecf20Sopenharmony_ci (void *)(1 << R5C_RADIX_COUNT_SHIFT)); 27188c2ecf20Sopenharmony_ci if (ret) { 27198c2ecf20Sopenharmony_ci spin_unlock(&log->tree_lock); 27208c2ecf20Sopenharmony_ci r5c_make_stripe_write_out(sh); 27218c2ecf20Sopenharmony_ci return -EAGAIN; 27228c2ecf20Sopenharmony_ci } 27238c2ecf20Sopenharmony_ci } 27248c2ecf20Sopenharmony_ci spin_unlock(&log->tree_lock); 27258c2ecf20Sopenharmony_ci 27268c2ecf20Sopenharmony_ci /* 27278c2ecf20Sopenharmony_ci * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is 27288c2ecf20Sopenharmony_ci * counted in the radix tree 27298c2ecf20Sopenharmony_ci */ 27308c2ecf20Sopenharmony_ci set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); 27318c2ecf20Sopenharmony_ci atomic_inc(&conf->r5c_cached_partial_stripes); 27328c2ecf20Sopenharmony_ci } 27338c2ecf20Sopenharmony_ci 27348c2ecf20Sopenharmony_ci for (i = disks; i--; ) { 27358c2ecf20Sopenharmony_ci dev = &sh->dev[i]; 27368c2ecf20Sopenharmony_ci if (dev->towrite) { 27378c2ecf20Sopenharmony_ci set_bit(R5_Wantwrite, &dev->flags); 27388c2ecf20Sopenharmony_ci set_bit(R5_Wantdrain, &dev->flags); 27398c2ecf20Sopenharmony_ci set_bit(R5_LOCKED, &dev->flags); 27408c2ecf20Sopenharmony_ci to_cache++; 27418c2ecf20Sopenharmony_ci } 27428c2ecf20Sopenharmony_ci } 27438c2ecf20Sopenharmony_ci 27448c2ecf20Sopenharmony_ci if (to_cache) { 27458c2ecf20Sopenharmony_ci set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 27468c2ecf20Sopenharmony_ci /* 27478c2ecf20Sopenharmony_ci * 
set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data() 27488c2ecf20Sopenharmony_ci * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in 27498c2ecf20Sopenharmony_ci * r5c_handle_data_cached() 27508c2ecf20Sopenharmony_ci */ 27518c2ecf20Sopenharmony_ci set_bit(STRIPE_LOG_TRAPPED, &sh->state); 27528c2ecf20Sopenharmony_ci } 27538c2ecf20Sopenharmony_ci 27548c2ecf20Sopenharmony_ci return 0; 27558c2ecf20Sopenharmony_ci} 27568c2ecf20Sopenharmony_ci 27578c2ecf20Sopenharmony_ci/* 27588c2ecf20Sopenharmony_ci * free extra pages (orig_page) we allocated for prexor 27598c2ecf20Sopenharmony_ci */ 27608c2ecf20Sopenharmony_civoid r5c_release_extra_page(struct stripe_head *sh) 27618c2ecf20Sopenharmony_ci{ 27628c2ecf20Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 27638c2ecf20Sopenharmony_ci int i; 27648c2ecf20Sopenharmony_ci bool using_disk_info_extra_page; 27658c2ecf20Sopenharmony_ci 27668c2ecf20Sopenharmony_ci using_disk_info_extra_page = 27678c2ecf20Sopenharmony_ci sh->dev[0].orig_page == conf->disks[0].extra_page; 27688c2ecf20Sopenharmony_ci 27698c2ecf20Sopenharmony_ci for (i = sh->disks; i--; ) 27708c2ecf20Sopenharmony_ci if (sh->dev[i].page != sh->dev[i].orig_page) { 27718c2ecf20Sopenharmony_ci struct page *p = sh->dev[i].orig_page; 27728c2ecf20Sopenharmony_ci 27738c2ecf20Sopenharmony_ci sh->dev[i].orig_page = sh->dev[i].page; 27748c2ecf20Sopenharmony_ci clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 27758c2ecf20Sopenharmony_ci 27768c2ecf20Sopenharmony_ci if (!using_disk_info_extra_page) 27778c2ecf20Sopenharmony_ci put_page(p); 27788c2ecf20Sopenharmony_ci } 27798c2ecf20Sopenharmony_ci 27808c2ecf20Sopenharmony_ci if (using_disk_info_extra_page) { 27818c2ecf20Sopenharmony_ci clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state); 27828c2ecf20Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 27838c2ecf20Sopenharmony_ci } 27848c2ecf20Sopenharmony_ci} 27858c2ecf20Sopenharmony_ci 27868c2ecf20Sopenharmony_civoid r5c_use_extra_page(struct stripe_head *sh) 27878c2ecf20Sopenharmony_ci{ 27888c2ecf20Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 27898c2ecf20Sopenharmony_ci int i; 27908c2ecf20Sopenharmony_ci struct r5dev *dev; 27918c2ecf20Sopenharmony_ci 27928c2ecf20Sopenharmony_ci for (i = sh->disks; i--; ) { 27938c2ecf20Sopenharmony_ci dev = &sh->dev[i]; 27948c2ecf20Sopenharmony_ci if (dev->orig_page != dev->page) 27958c2ecf20Sopenharmony_ci put_page(dev->orig_page); 27968c2ecf20Sopenharmony_ci dev->orig_page = conf->disks[i].extra_page; 27978c2ecf20Sopenharmony_ci } 27988c2ecf20Sopenharmony_ci} 27998c2ecf20Sopenharmony_ci 28008c2ecf20Sopenharmony_ci/* 28018c2ecf20Sopenharmony_ci * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the 28028c2ecf20Sopenharmony_ci * stripe is committed to RAID disks. 
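 * In write-back mode it also removes the stripe from stripe_in_journal_list,
 * drops its reference in big_stripe_tree and appends a flush payload for the
 * stripe.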
28038c2ecf20Sopenharmony_ci */ 28048c2ecf20Sopenharmony_civoid r5c_finish_stripe_write_out(struct r5conf *conf, 28058c2ecf20Sopenharmony_ci struct stripe_head *sh, 28068c2ecf20Sopenharmony_ci struct stripe_head_state *s) 28078c2ecf20Sopenharmony_ci{ 28088c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 28098c2ecf20Sopenharmony_ci int i; 28108c2ecf20Sopenharmony_ci int do_wakeup = 0; 28118c2ecf20Sopenharmony_ci sector_t tree_index; 28128c2ecf20Sopenharmony_ci void **pslot; 28138c2ecf20Sopenharmony_ci uintptr_t refcount; 28148c2ecf20Sopenharmony_ci 28158c2ecf20Sopenharmony_ci if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) 28168c2ecf20Sopenharmony_ci return; 28178c2ecf20Sopenharmony_ci 28188c2ecf20Sopenharmony_ci WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); 28198c2ecf20Sopenharmony_ci clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); 28208c2ecf20Sopenharmony_ci 28218c2ecf20Sopenharmony_ci if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) 28228c2ecf20Sopenharmony_ci return; 28238c2ecf20Sopenharmony_ci 28248c2ecf20Sopenharmony_ci for (i = sh->disks; i--; ) { 28258c2ecf20Sopenharmony_ci clear_bit(R5_InJournal, &sh->dev[i].flags); 28268c2ecf20Sopenharmony_ci if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 28278c2ecf20Sopenharmony_ci do_wakeup = 1; 28288c2ecf20Sopenharmony_ci } 28298c2ecf20Sopenharmony_ci 28308c2ecf20Sopenharmony_ci /* 28318c2ecf20Sopenharmony_ci * analyse_stripe() runs before r5c_finish_stripe_write_out(), 28328c2ecf20Sopenharmony_ci * We updated R5_InJournal, so we also update s->injournal. 28338c2ecf20Sopenharmony_ci */ 28348c2ecf20Sopenharmony_ci s->injournal = 0; 28358c2ecf20Sopenharmony_ci 28368c2ecf20Sopenharmony_ci if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 28378c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&conf->pending_full_writes)) 28388c2ecf20Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 28398c2ecf20Sopenharmony_ci 28408c2ecf20Sopenharmony_ci if (do_wakeup) 28418c2ecf20Sopenharmony_ci wake_up(&conf->wait_for_overlap); 28428c2ecf20Sopenharmony_ci 28438c2ecf20Sopenharmony_ci spin_lock_irq(&log->stripe_in_journal_lock); 28448c2ecf20Sopenharmony_ci list_del_init(&sh->r5c); 28458c2ecf20Sopenharmony_ci spin_unlock_irq(&log->stripe_in_journal_lock); 28468c2ecf20Sopenharmony_ci sh->log_start = MaxSector; 28478c2ecf20Sopenharmony_ci 28488c2ecf20Sopenharmony_ci atomic_dec(&log->stripe_in_journal_count); 28498c2ecf20Sopenharmony_ci r5c_update_log_state(log); 28508c2ecf20Sopenharmony_ci 28518c2ecf20Sopenharmony_ci /* stop counting this stripe in big_stripe_tree */ 28528c2ecf20Sopenharmony_ci if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || 28538c2ecf20Sopenharmony_ci test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { 28548c2ecf20Sopenharmony_ci tree_index = r5c_tree_index(conf, sh->sector); 28558c2ecf20Sopenharmony_ci spin_lock(&log->tree_lock); 28568c2ecf20Sopenharmony_ci pslot = radix_tree_lookup_slot(&log->big_stripe_tree, 28578c2ecf20Sopenharmony_ci tree_index); 28588c2ecf20Sopenharmony_ci BUG_ON(pslot == NULL); 28598c2ecf20Sopenharmony_ci refcount = (uintptr_t)radix_tree_deref_slot_protected( 28608c2ecf20Sopenharmony_ci pslot, &log->tree_lock) >> 28618c2ecf20Sopenharmony_ci R5C_RADIX_COUNT_SHIFT; 28628c2ecf20Sopenharmony_ci if (refcount == 1) 28638c2ecf20Sopenharmony_ci radix_tree_delete(&log->big_stripe_tree, tree_index); 28648c2ecf20Sopenharmony_ci else 28658c2ecf20Sopenharmony_ci radix_tree_replace_slot( 28668c2ecf20Sopenharmony_ci &log->big_stripe_tree, pslot, 28678c2ecf20Sopenharmony_ci (void 
*)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
28688c2ecf20Sopenharmony_ci spin_unlock(&log->tree_lock);
28698c2ecf20Sopenharmony_ci }
28708c2ecf20Sopenharmony_ci
28718c2ecf20Sopenharmony_ci if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
28728c2ecf20Sopenharmony_ci BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
28738c2ecf20Sopenharmony_ci atomic_dec(&conf->r5c_flushing_partial_stripes);
28748c2ecf20Sopenharmony_ci atomic_dec(&conf->r5c_cached_partial_stripes);
28758c2ecf20Sopenharmony_ci }
28768c2ecf20Sopenharmony_ci
28778c2ecf20Sopenharmony_ci if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
28788c2ecf20Sopenharmony_ci BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
28798c2ecf20Sopenharmony_ci atomic_dec(&conf->r5c_flushing_full_stripes);
28808c2ecf20Sopenharmony_ci atomic_dec(&conf->r5c_cached_full_stripes);
28818c2ecf20Sopenharmony_ci }
28828c2ecf20Sopenharmony_ci
28838c2ecf20Sopenharmony_ci r5l_append_flush_payload(log, sh->sector);
28848c2ecf20Sopenharmony_ci /* stripe is flushed to raid disks, we can do resync now */
28858c2ecf20Sopenharmony_ci if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
28868c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state);
28878c2ecf20Sopenharmony_ci}
28888c2ecf20Sopenharmony_ci
28898c2ecf20Sopenharmony_ciint r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
28908c2ecf20Sopenharmony_ci{
28918c2ecf20Sopenharmony_ci struct r5conf *conf = sh->raid_conf;
28928c2ecf20Sopenharmony_ci int pages = 0;
28938c2ecf20Sopenharmony_ci int reserve;
28948c2ecf20Sopenharmony_ci int i;
28958c2ecf20Sopenharmony_ci int ret = 0;
28968c2ecf20Sopenharmony_ci
28978c2ecf20Sopenharmony_ci BUG_ON(!log);
28988c2ecf20Sopenharmony_ci
28998c2ecf20Sopenharmony_ci for (i = 0; i < sh->disks; i++) {
29008c2ecf20Sopenharmony_ci void *addr;
29018c2ecf20Sopenharmony_ci
29028c2ecf20Sopenharmony_ci if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
29038c2ecf20Sopenharmony_ci continue;
29048c2ecf20Sopenharmony_ci addr = kmap_atomic(sh->dev[i].page);
29058c2ecf20Sopenharmony_ci sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
29068c2ecf20Sopenharmony_ci addr, PAGE_SIZE);
29078c2ecf20Sopenharmony_ci kunmap_atomic(addr);
29088c2ecf20Sopenharmony_ci pages++;
29098c2ecf20Sopenharmony_ci }
29108c2ecf20Sopenharmony_ci WARN_ON(pages == 0);
29118c2ecf20Sopenharmony_ci
29128c2ecf20Sopenharmony_ci /*
29138c2ecf20Sopenharmony_ci * The stripe must enter state machine again to call endio, so
29148c2ecf20Sopenharmony_ci * don't delay.
29158c2ecf20Sopenharmony_ci */ 29168c2ecf20Sopenharmony_ci clear_bit(STRIPE_DELAYED, &sh->state); 29178c2ecf20Sopenharmony_ci atomic_inc(&sh->count); 29188c2ecf20Sopenharmony_ci 29198c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 29208c2ecf20Sopenharmony_ci /* meta + data */ 29218c2ecf20Sopenharmony_ci reserve = (1 + pages) << (PAGE_SHIFT - 9); 29228c2ecf20Sopenharmony_ci 29238c2ecf20Sopenharmony_ci if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && 29248c2ecf20Sopenharmony_ci sh->log_start == MaxSector) 29258c2ecf20Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 29268c2ecf20Sopenharmony_ci else if (!r5l_has_free_space(log, reserve)) { 29278c2ecf20Sopenharmony_ci if (sh->log_start == log->last_checkpoint) 29288c2ecf20Sopenharmony_ci BUG(); 29298c2ecf20Sopenharmony_ci else 29308c2ecf20Sopenharmony_ci r5l_add_no_space_stripe(log, sh); 29318c2ecf20Sopenharmony_ci } else { 29328c2ecf20Sopenharmony_ci ret = r5l_log_stripe(log, sh, pages, 0); 29338c2ecf20Sopenharmony_ci if (ret) { 29348c2ecf20Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 29358c2ecf20Sopenharmony_ci list_add_tail(&sh->log_list, &log->no_mem_stripes); 29368c2ecf20Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 29378c2ecf20Sopenharmony_ci } 29388c2ecf20Sopenharmony_ci } 29398c2ecf20Sopenharmony_ci 29408c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 29418c2ecf20Sopenharmony_ci return 0; 29428c2ecf20Sopenharmony_ci} 29438c2ecf20Sopenharmony_ci 29448c2ecf20Sopenharmony_ci/* check whether this big stripe is in write back cache. */ 29458c2ecf20Sopenharmony_cibool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) 29468c2ecf20Sopenharmony_ci{ 29478c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 29488c2ecf20Sopenharmony_ci sector_t tree_index; 29498c2ecf20Sopenharmony_ci void *slot; 29508c2ecf20Sopenharmony_ci 29518c2ecf20Sopenharmony_ci if (!log) 29528c2ecf20Sopenharmony_ci return false; 29538c2ecf20Sopenharmony_ci 29548c2ecf20Sopenharmony_ci WARN_ON_ONCE(!rcu_read_lock_held()); 29558c2ecf20Sopenharmony_ci tree_index = r5c_tree_index(conf, sect); 29568c2ecf20Sopenharmony_ci slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); 29578c2ecf20Sopenharmony_ci return slot != NULL; 29588c2ecf20Sopenharmony_ci} 29598c2ecf20Sopenharmony_ci 29608c2ecf20Sopenharmony_cistatic int r5l_load_log(struct r5l_log *log) 29618c2ecf20Sopenharmony_ci{ 29628c2ecf20Sopenharmony_ci struct md_rdev *rdev = log->rdev; 29638c2ecf20Sopenharmony_ci struct page *page; 29648c2ecf20Sopenharmony_ci struct r5l_meta_block *mb; 29658c2ecf20Sopenharmony_ci sector_t cp = log->rdev->journal_tail; 29668c2ecf20Sopenharmony_ci u32 stored_crc, expected_crc; 29678c2ecf20Sopenharmony_ci bool create_super = false; 29688c2ecf20Sopenharmony_ci int ret = 0; 29698c2ecf20Sopenharmony_ci 29708c2ecf20Sopenharmony_ci /* Make sure it's valid */ 29718c2ecf20Sopenharmony_ci if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) 29728c2ecf20Sopenharmony_ci cp = 0; 29738c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 29748c2ecf20Sopenharmony_ci if (!page) 29758c2ecf20Sopenharmony_ci return -ENOMEM; 29768c2ecf20Sopenharmony_ci 29778c2ecf20Sopenharmony_ci if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) { 29788c2ecf20Sopenharmony_ci ret = -EIO; 29798c2ecf20Sopenharmony_ci goto ioerr; 29808c2ecf20Sopenharmony_ci } 29818c2ecf20Sopenharmony_ci mb = page_address(page); 29828c2ecf20Sopenharmony_ci 29838c2ecf20Sopenharmony_ci if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || 29848c2ecf20Sopenharmony_ci mb->version != 
R5LOG_VERSION) { 29858c2ecf20Sopenharmony_ci create_super = true; 29868c2ecf20Sopenharmony_ci goto create; 29878c2ecf20Sopenharmony_ci } 29888c2ecf20Sopenharmony_ci stored_crc = le32_to_cpu(mb->checksum); 29898c2ecf20Sopenharmony_ci mb->checksum = 0; 29908c2ecf20Sopenharmony_ci expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); 29918c2ecf20Sopenharmony_ci if (stored_crc != expected_crc) { 29928c2ecf20Sopenharmony_ci create_super = true; 29938c2ecf20Sopenharmony_ci goto create; 29948c2ecf20Sopenharmony_ci } 29958c2ecf20Sopenharmony_ci if (le64_to_cpu(mb->position) != cp) { 29968c2ecf20Sopenharmony_ci create_super = true; 29978c2ecf20Sopenharmony_ci goto create; 29988c2ecf20Sopenharmony_ci } 29998c2ecf20Sopenharmony_cicreate: 30008c2ecf20Sopenharmony_ci if (create_super) { 30018c2ecf20Sopenharmony_ci log->last_cp_seq = prandom_u32(); 30028c2ecf20Sopenharmony_ci cp = 0; 30038c2ecf20Sopenharmony_ci r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq); 30048c2ecf20Sopenharmony_ci /* 30058c2ecf20Sopenharmony_ci * Make sure super points to correct address. Log might have 30068c2ecf20Sopenharmony_ci * data very soon. If super hasn't correct log tail address, 30078c2ecf20Sopenharmony_ci * recovery can't find the log 30088c2ecf20Sopenharmony_ci */ 30098c2ecf20Sopenharmony_ci r5l_write_super(log, cp); 30108c2ecf20Sopenharmony_ci } else 30118c2ecf20Sopenharmony_ci log->last_cp_seq = le64_to_cpu(mb->seq); 30128c2ecf20Sopenharmony_ci 30138c2ecf20Sopenharmony_ci log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); 30148c2ecf20Sopenharmony_ci log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; 30158c2ecf20Sopenharmony_ci if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) 30168c2ecf20Sopenharmony_ci log->max_free_space = RECLAIM_MAX_FREE_SPACE; 30178c2ecf20Sopenharmony_ci log->last_checkpoint = cp; 30188c2ecf20Sopenharmony_ci 30198c2ecf20Sopenharmony_ci __free_page(page); 30208c2ecf20Sopenharmony_ci 30218c2ecf20Sopenharmony_ci if (create_super) { 30228c2ecf20Sopenharmony_ci log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS); 30238c2ecf20Sopenharmony_ci log->seq = log->last_cp_seq + 1; 30248c2ecf20Sopenharmony_ci log->next_checkpoint = cp; 30258c2ecf20Sopenharmony_ci } else 30268c2ecf20Sopenharmony_ci ret = r5l_recovery_log(log); 30278c2ecf20Sopenharmony_ci 30288c2ecf20Sopenharmony_ci r5c_update_log_state(log); 30298c2ecf20Sopenharmony_ci return ret; 30308c2ecf20Sopenharmony_ciioerr: 30318c2ecf20Sopenharmony_ci __free_page(page); 30328c2ecf20Sopenharmony_ci return ret; 30338c2ecf20Sopenharmony_ci} 30348c2ecf20Sopenharmony_ci 30358c2ecf20Sopenharmony_ciint r5l_start(struct r5l_log *log) 30368c2ecf20Sopenharmony_ci{ 30378c2ecf20Sopenharmony_ci int ret; 30388c2ecf20Sopenharmony_ci 30398c2ecf20Sopenharmony_ci if (!log) 30408c2ecf20Sopenharmony_ci return 0; 30418c2ecf20Sopenharmony_ci 30428c2ecf20Sopenharmony_ci ret = r5l_load_log(log); 30438c2ecf20Sopenharmony_ci if (ret) { 30448c2ecf20Sopenharmony_ci struct mddev *mddev = log->rdev->mddev; 30458c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 30468c2ecf20Sopenharmony_ci 30478c2ecf20Sopenharmony_ci r5l_exit_log(conf); 30488c2ecf20Sopenharmony_ci } 30498c2ecf20Sopenharmony_ci return ret; 30508c2ecf20Sopenharmony_ci} 30518c2ecf20Sopenharmony_ci 30528c2ecf20Sopenharmony_civoid r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) 30538c2ecf20Sopenharmony_ci{ 30548c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 30558c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 
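 /*
  * Write-back caching is not safe once the array is degraded or the
  * journal device itself has failed, so ask disable_writeback_work to
  * switch the log back to write-through.
  */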
30568c2ecf20Sopenharmony_ci 30578c2ecf20Sopenharmony_ci if (!log) 30588c2ecf20Sopenharmony_ci return; 30598c2ecf20Sopenharmony_ci 30608c2ecf20Sopenharmony_ci if ((raid5_calc_degraded(conf) > 0 || 30618c2ecf20Sopenharmony_ci test_bit(Journal, &rdev->flags)) && 30628c2ecf20Sopenharmony_ci conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) 30638c2ecf20Sopenharmony_ci schedule_work(&log->disable_writeback_work); 30648c2ecf20Sopenharmony_ci} 30658c2ecf20Sopenharmony_ci 30668c2ecf20Sopenharmony_ciint r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) 30678c2ecf20Sopenharmony_ci{ 30688c2ecf20Sopenharmony_ci struct request_queue *q = bdev_get_queue(rdev->bdev); 30698c2ecf20Sopenharmony_ci struct r5l_log *log; 30708c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 30718c2ecf20Sopenharmony_ci int ret; 30728c2ecf20Sopenharmony_ci 30738c2ecf20Sopenharmony_ci pr_debug("md/raid:%s: using device %s as journal\n", 30748c2ecf20Sopenharmony_ci mdname(conf->mddev), bdevname(rdev->bdev, b)); 30758c2ecf20Sopenharmony_ci 30768c2ecf20Sopenharmony_ci if (PAGE_SIZE != 4096) 30778c2ecf20Sopenharmony_ci return -EINVAL; 30788c2ecf20Sopenharmony_ci 30798c2ecf20Sopenharmony_ci /* 30808c2ecf20Sopenharmony_ci * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and 30818c2ecf20Sopenharmony_ci * raid_disks r5l_payload_data_parity. 30828c2ecf20Sopenharmony_ci * 30838c2ecf20Sopenharmony_ci * Write journal and cache does not work for very big array 30848c2ecf20Sopenharmony_ci * (raid_disks > 203) 30858c2ecf20Sopenharmony_ci */ 30868c2ecf20Sopenharmony_ci if (sizeof(struct r5l_meta_block) + 30878c2ecf20Sopenharmony_ci ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) * 30888c2ecf20Sopenharmony_ci conf->raid_disks) > PAGE_SIZE) { 30898c2ecf20Sopenharmony_ci pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n", 30908c2ecf20Sopenharmony_ci mdname(conf->mddev), conf->raid_disks); 30918c2ecf20Sopenharmony_ci return -EINVAL; 30928c2ecf20Sopenharmony_ci } 30938c2ecf20Sopenharmony_ci 30948c2ecf20Sopenharmony_ci log = kzalloc(sizeof(*log), GFP_KERNEL); 30958c2ecf20Sopenharmony_ci if (!log) 30968c2ecf20Sopenharmony_ci return -ENOMEM; 30978c2ecf20Sopenharmony_ci log->rdev = rdev; 30988c2ecf20Sopenharmony_ci 30998c2ecf20Sopenharmony_ci log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0; 31008c2ecf20Sopenharmony_ci 31018c2ecf20Sopenharmony_ci log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, 31028c2ecf20Sopenharmony_ci sizeof(rdev->mddev->uuid)); 31038c2ecf20Sopenharmony_ci 31048c2ecf20Sopenharmony_ci mutex_init(&log->io_mutex); 31058c2ecf20Sopenharmony_ci 31068c2ecf20Sopenharmony_ci spin_lock_init(&log->io_list_lock); 31078c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->running_ios); 31088c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->io_end_ios); 31098c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->flushing_ios); 31108c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->finished_ios); 31118c2ecf20Sopenharmony_ci bio_init(&log->flush_bio, NULL, 0); 31128c2ecf20Sopenharmony_ci 31138c2ecf20Sopenharmony_ci log->io_kc = KMEM_CACHE(r5l_io_unit, 0); 31148c2ecf20Sopenharmony_ci if (!log->io_kc) 31158c2ecf20Sopenharmony_ci goto io_kc; 31168c2ecf20Sopenharmony_ci 31178c2ecf20Sopenharmony_ci ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc); 31188c2ecf20Sopenharmony_ci if (ret) 31198c2ecf20Sopenharmony_ci goto io_pool; 31208c2ecf20Sopenharmony_ci 31218c2ecf20Sopenharmony_ci ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS); 31228c2ecf20Sopenharmony_ci if 
(ret) 31238c2ecf20Sopenharmony_ci goto io_bs; 31248c2ecf20Sopenharmony_ci 31258c2ecf20Sopenharmony_ci ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0); 31268c2ecf20Sopenharmony_ci if (ret) 31278c2ecf20Sopenharmony_ci goto out_mempool; 31288c2ecf20Sopenharmony_ci 31298c2ecf20Sopenharmony_ci spin_lock_init(&log->tree_lock); 31308c2ecf20Sopenharmony_ci INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); 31318c2ecf20Sopenharmony_ci 31328c2ecf20Sopenharmony_ci log->reclaim_thread = md_register_thread(r5l_reclaim_thread, 31338c2ecf20Sopenharmony_ci log->rdev->mddev, "reclaim"); 31348c2ecf20Sopenharmony_ci if (!log->reclaim_thread) 31358c2ecf20Sopenharmony_ci goto reclaim_thread; 31368c2ecf20Sopenharmony_ci log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; 31378c2ecf20Sopenharmony_ci 31388c2ecf20Sopenharmony_ci init_waitqueue_head(&log->iounit_wait); 31398c2ecf20Sopenharmony_ci 31408c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->no_mem_stripes); 31418c2ecf20Sopenharmony_ci 31428c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->no_space_stripes); 31438c2ecf20Sopenharmony_ci spin_lock_init(&log->no_space_stripes_lock); 31448c2ecf20Sopenharmony_ci 31458c2ecf20Sopenharmony_ci INIT_WORK(&log->deferred_io_work, r5l_submit_io_async); 31468c2ecf20Sopenharmony_ci INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async); 31478c2ecf20Sopenharmony_ci 31488c2ecf20Sopenharmony_ci log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 31498c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->stripe_in_journal_list); 31508c2ecf20Sopenharmony_ci spin_lock_init(&log->stripe_in_journal_lock); 31518c2ecf20Sopenharmony_ci atomic_set(&log->stripe_in_journal_count, 0); 31528c2ecf20Sopenharmony_ci 31538c2ecf20Sopenharmony_ci rcu_assign_pointer(conf->log, log); 31548c2ecf20Sopenharmony_ci 31558c2ecf20Sopenharmony_ci set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); 31568c2ecf20Sopenharmony_ci return 0; 31578c2ecf20Sopenharmony_ci 31588c2ecf20Sopenharmony_cireclaim_thread: 31598c2ecf20Sopenharmony_ci mempool_exit(&log->meta_pool); 31608c2ecf20Sopenharmony_ciout_mempool: 31618c2ecf20Sopenharmony_ci bioset_exit(&log->bs); 31628c2ecf20Sopenharmony_ciio_bs: 31638c2ecf20Sopenharmony_ci mempool_exit(&log->io_pool); 31648c2ecf20Sopenharmony_ciio_pool: 31658c2ecf20Sopenharmony_ci kmem_cache_destroy(log->io_kc); 31668c2ecf20Sopenharmony_ciio_kc: 31678c2ecf20Sopenharmony_ci kfree(log); 31688c2ecf20Sopenharmony_ci return -EINVAL; 31698c2ecf20Sopenharmony_ci} 31708c2ecf20Sopenharmony_ci 31718c2ecf20Sopenharmony_civoid r5l_exit_log(struct r5conf *conf) 31728c2ecf20Sopenharmony_ci{ 31738c2ecf20Sopenharmony_ci struct r5l_log *log = conf->log; 31748c2ecf20Sopenharmony_ci 31758c2ecf20Sopenharmony_ci conf->log = NULL; 31768c2ecf20Sopenharmony_ci synchronize_rcu(); 31778c2ecf20Sopenharmony_ci 31788c2ecf20Sopenharmony_ci /* Ensure disable_writeback_work wakes up and exits */ 31798c2ecf20Sopenharmony_ci wake_up(&conf->mddev->sb_wait); 31808c2ecf20Sopenharmony_ci flush_work(&log->disable_writeback_work); 31818c2ecf20Sopenharmony_ci md_unregister_thread(&log->reclaim_thread); 31828c2ecf20Sopenharmony_ci mempool_exit(&log->meta_pool); 31838c2ecf20Sopenharmony_ci bioset_exit(&log->bs); 31848c2ecf20Sopenharmony_ci mempool_exit(&log->io_pool); 31858c2ecf20Sopenharmony_ci kmem_cache_destroy(log->io_kc); 31868c2ecf20Sopenharmony_ci kfree(log); 31878c2ecf20Sopenharmony_ci} 3188