18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Partial Parity Log for closing the RAID5 write hole 48c2ecf20Sopenharmony_ci * Copyright (c) 2017, Intel Corporation. 58c2ecf20Sopenharmony_ci */ 68c2ecf20Sopenharmony_ci 78c2ecf20Sopenharmony_ci#include <linux/kernel.h> 88c2ecf20Sopenharmony_ci#include <linux/blkdev.h> 98c2ecf20Sopenharmony_ci#include <linux/slab.h> 108c2ecf20Sopenharmony_ci#include <linux/crc32c.h> 118c2ecf20Sopenharmony_ci#include <linux/async_tx.h> 128c2ecf20Sopenharmony_ci#include <linux/raid/md_p.h> 138c2ecf20Sopenharmony_ci#include "md.h" 148c2ecf20Sopenharmony_ci#include "raid5.h" 158c2ecf20Sopenharmony_ci#include "raid5-log.h" 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ci/* 188c2ecf20Sopenharmony_ci * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for 198c2ecf20Sopenharmony_ci * partial parity data. The header contains an array of entries 208c2ecf20Sopenharmony_ci * (struct ppl_header_entry) which describe the logged write requests. 218c2ecf20Sopenharmony_ci * Partial parity for the entries comes after the header, written in the same 228c2ecf20Sopenharmony_ci * sequence as the entries: 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * Header 258c2ecf20Sopenharmony_ci * entry0 268c2ecf20Sopenharmony_ci * ... 278c2ecf20Sopenharmony_ci * entryN 288c2ecf20Sopenharmony_ci * PP data 298c2ecf20Sopenharmony_ci * PP for entry0 308c2ecf20Sopenharmony_ci * ... 318c2ecf20Sopenharmony_ci * PP for entryN 328c2ecf20Sopenharmony_ci * 338c2ecf20Sopenharmony_ci * An entry describes one or more consecutive stripe_heads, up to a full 348c2ecf20Sopenharmony_ci * stripe. The modifed raid data chunks form an m-by-n matrix, where m is the 358c2ecf20Sopenharmony_ci * number of stripe_heads in the entry and n is the number of modified data 368c2ecf20Sopenharmony_ci * disks. Every stripe_head in the entry must write to the same data disks. 378c2ecf20Sopenharmony_ci * An example of a valid case described by a single entry (writes to the first 388c2ecf20Sopenharmony_ci * stripe of a 4 disk array, 16k chunk size): 398c2ecf20Sopenharmony_ci * 408c2ecf20Sopenharmony_ci * sh->sector dd0 dd1 dd2 ppl 418c2ecf20Sopenharmony_ci * +-----+-----+-----+ 428c2ecf20Sopenharmony_ci * 0 | --- | --- | --- | +----+ 438c2ecf20Sopenharmony_ci * 8 | -W- | -W- | --- | | pp | data_sector = 8 448c2ecf20Sopenharmony_ci * 16 | -W- | -W- | --- | | pp | data_size = 3 * 2 * 4k 458c2ecf20Sopenharmony_ci * 24 | -W- | -W- | --- | | pp | pp_size = 3 * 4k 468c2ecf20Sopenharmony_ci * +-----+-----+-----+ +----+ 478c2ecf20Sopenharmony_ci * 488c2ecf20Sopenharmony_ci * data_sector is the first raid sector of the modified data, data_size is the 498c2ecf20Sopenharmony_ci * total size of modified data and pp_size is the size of partial parity for 508c2ecf20Sopenharmony_ci * this entry. Entries for full stripe writes contain no partial parity 518c2ecf20Sopenharmony_ci * (pp_size = 0), they only mark the stripes for which parity should be 528c2ecf20Sopenharmony_ci * recalculated after an unclean shutdown. Every entry holds a checksum of its 538c2ecf20Sopenharmony_ci * partial parity, the header also has a checksum of the header itself. 548c2ecf20Sopenharmony_ci * 558c2ecf20Sopenharmony_ci * A write request is always logged to the PPL instance stored on the parity 568c2ecf20Sopenharmony_ci * disk of the corresponding stripe. For each member disk there is one ppl_log 578c2ecf20Sopenharmony_ci * used to handle logging for this disk, independently from others. They are 588c2ecf20Sopenharmony_ci * grouped in child_logs array in struct ppl_conf, which is assigned to 598c2ecf20Sopenharmony_ci * r5conf->log_private. 608c2ecf20Sopenharmony_ci * 618c2ecf20Sopenharmony_ci * ppl_io_unit represents a full PPL write, header_page contains the ppl_header. 628c2ecf20Sopenharmony_ci * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head 638c2ecf20Sopenharmony_ci * can be appended to the last entry if it meets the conditions for a valid 648c2ecf20Sopenharmony_ci * entry described above, otherwise a new entry is added. Checksums of entries 658c2ecf20Sopenharmony_ci * are calculated incrementally as stripes containing partial parity are being 668c2ecf20Sopenharmony_ci * added. ppl_submit_iounit() calculates the checksum of the header and submits 678c2ecf20Sopenharmony_ci * a bio containing the header page and partial parity pages (sh->ppl_page) for 688c2ecf20Sopenharmony_ci * all stripes of the io_unit. When the PPL write completes, the stripes 698c2ecf20Sopenharmony_ci * associated with the io_unit are released and raid5d starts writing their data 708c2ecf20Sopenharmony_ci * and parity. When all stripes are written, the io_unit is freed and the next 718c2ecf20Sopenharmony_ci * can be submitted. 728c2ecf20Sopenharmony_ci * 738c2ecf20Sopenharmony_ci * An io_unit is used to gather stripes until it is submitted or becomes full 748c2ecf20Sopenharmony_ci * (if the maximum number of entries or size of PPL is reached). Another io_unit 758c2ecf20Sopenharmony_ci * can't be submitted until the previous has completed (PPL and stripe 768c2ecf20Sopenharmony_ci * data+parity is written). The log->io_list tracks all io_units of a log 778c2ecf20Sopenharmony_ci * (for a single member disk). New io_units are added to the end of the list 788c2ecf20Sopenharmony_ci * and the first io_unit is submitted, if it is not submitted already. 798c2ecf20Sopenharmony_ci * The current io_unit accepting new stripes is always at the end of the list. 808c2ecf20Sopenharmony_ci * 818c2ecf20Sopenharmony_ci * If write-back cache is enabled for any of the disks in the array, its data 828c2ecf20Sopenharmony_ci * must be flushed before next io_unit is submitted. 838c2ecf20Sopenharmony_ci */ 848c2ecf20Sopenharmony_ci 858c2ecf20Sopenharmony_ci#define PPL_SPACE_SIZE (128 * 1024) 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_cistruct ppl_conf { 888c2ecf20Sopenharmony_ci struct mddev *mddev; 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci /* array of child logs, one for each raid disk */ 918c2ecf20Sopenharmony_ci struct ppl_log *child_logs; 928c2ecf20Sopenharmony_ci int count; 938c2ecf20Sopenharmony_ci 948c2ecf20Sopenharmony_ci int block_size; /* the logical block size used for data_sector 958c2ecf20Sopenharmony_ci * in ppl_header_entry */ 968c2ecf20Sopenharmony_ci u32 signature; /* raid array identifier */ 978c2ecf20Sopenharmony_ci atomic64_t seq; /* current log write sequence number */ 988c2ecf20Sopenharmony_ci 998c2ecf20Sopenharmony_ci struct kmem_cache *io_kc; 1008c2ecf20Sopenharmony_ci mempool_t io_pool; 1018c2ecf20Sopenharmony_ci struct bio_set bs; 1028c2ecf20Sopenharmony_ci struct bio_set flush_bs; 1038c2ecf20Sopenharmony_ci 1048c2ecf20Sopenharmony_ci /* used only for recovery */ 1058c2ecf20Sopenharmony_ci int recovered_entries; 1068c2ecf20Sopenharmony_ci int mismatch_count; 1078c2ecf20Sopenharmony_ci 1088c2ecf20Sopenharmony_ci /* stripes to retry if failed to allocate io_unit */ 1098c2ecf20Sopenharmony_ci struct list_head no_mem_stripes; 1108c2ecf20Sopenharmony_ci spinlock_t no_mem_stripes_lock; 1118c2ecf20Sopenharmony_ci 1128c2ecf20Sopenharmony_ci unsigned short write_hint; 1138c2ecf20Sopenharmony_ci}; 1148c2ecf20Sopenharmony_ci 1158c2ecf20Sopenharmony_cistruct ppl_log { 1168c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf; /* shared between all log instances */ 1178c2ecf20Sopenharmony_ci 1188c2ecf20Sopenharmony_ci struct md_rdev *rdev; /* array member disk associated with 1198c2ecf20Sopenharmony_ci * this log instance */ 1208c2ecf20Sopenharmony_ci struct mutex io_mutex; 1218c2ecf20Sopenharmony_ci struct ppl_io_unit *current_io; /* current io_unit accepting new data 1228c2ecf20Sopenharmony_ci * always at the end of io_list */ 1238c2ecf20Sopenharmony_ci spinlock_t io_list_lock; 1248c2ecf20Sopenharmony_ci struct list_head io_list; /* all io_units of this log */ 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci sector_t next_io_sector; 1278c2ecf20Sopenharmony_ci unsigned int entry_space; 1288c2ecf20Sopenharmony_ci bool use_multippl; 1298c2ecf20Sopenharmony_ci bool wb_cache_on; 1308c2ecf20Sopenharmony_ci unsigned long disk_flush_bitmap; 1318c2ecf20Sopenharmony_ci}; 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci#define PPL_IO_INLINE_BVECS 32 1348c2ecf20Sopenharmony_ci 1358c2ecf20Sopenharmony_cistruct ppl_io_unit { 1368c2ecf20Sopenharmony_ci struct ppl_log *log; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci struct page *header_page; /* for ppl_header */ 1398c2ecf20Sopenharmony_ci 1408c2ecf20Sopenharmony_ci unsigned int entries_count; /* number of entries in ppl_header */ 1418c2ecf20Sopenharmony_ci unsigned int pp_size; /* total size current of partial parity */ 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_ci u64 seq; /* sequence number of this log write */ 1448c2ecf20Sopenharmony_ci struct list_head log_sibling; /* log->io_list */ 1458c2ecf20Sopenharmony_ci 1468c2ecf20Sopenharmony_ci struct list_head stripe_list; /* stripes added to the io_unit */ 1478c2ecf20Sopenharmony_ci atomic_t pending_stripes; /* how many stripes not written to raid */ 1488c2ecf20Sopenharmony_ci atomic_t pending_flushes; /* how many disk flushes are in progress */ 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci bool submitted; /* true if write to log started */ 1518c2ecf20Sopenharmony_ci 1528c2ecf20Sopenharmony_ci /* inline bio and its biovec for submitting the iounit */ 1538c2ecf20Sopenharmony_ci struct bio bio; 1548c2ecf20Sopenharmony_ci struct bio_vec biovec[PPL_IO_INLINE_BVECS]; 1558c2ecf20Sopenharmony_ci}; 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_cistruct dma_async_tx_descriptor * 1588c2ecf20Sopenharmony_ciops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, 1598c2ecf20Sopenharmony_ci struct dma_async_tx_descriptor *tx) 1608c2ecf20Sopenharmony_ci{ 1618c2ecf20Sopenharmony_ci int disks = sh->disks; 1628c2ecf20Sopenharmony_ci struct page **srcs = percpu->scribble; 1638c2ecf20Sopenharmony_ci int count = 0, pd_idx = sh->pd_idx, i; 1648c2ecf20Sopenharmony_ci struct async_submit_ctl submit; 1658c2ecf20Sopenharmony_ci 1668c2ecf20Sopenharmony_ci pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci /* 1698c2ecf20Sopenharmony_ci * Partial parity is the XOR of stripe data chunks that are not changed 1708c2ecf20Sopenharmony_ci * during the write request. Depending on available data 1718c2ecf20Sopenharmony_ci * (read-modify-write vs. reconstruct-write case) we calculate it 1728c2ecf20Sopenharmony_ci * differently. 1738c2ecf20Sopenharmony_ci */ 1748c2ecf20Sopenharmony_ci if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1758c2ecf20Sopenharmony_ci /* 1768c2ecf20Sopenharmony_ci * rmw: xor old data and parity from updated disks 1778c2ecf20Sopenharmony_ci * This is calculated earlier by ops_run_prexor5() so just copy 1788c2ecf20Sopenharmony_ci * the parity dev page. 1798c2ecf20Sopenharmony_ci */ 1808c2ecf20Sopenharmony_ci srcs[count++] = sh->dev[pd_idx].page; 1818c2ecf20Sopenharmony_ci } else if (sh->reconstruct_state == reconstruct_state_drain_run) { 1828c2ecf20Sopenharmony_ci /* rcw: xor data from all not updated disks */ 1838c2ecf20Sopenharmony_ci for (i = disks; i--;) { 1848c2ecf20Sopenharmony_ci struct r5dev *dev = &sh->dev[i]; 1858c2ecf20Sopenharmony_ci if (test_bit(R5_UPTODATE, &dev->flags)) 1868c2ecf20Sopenharmony_ci srcs[count++] = dev->page; 1878c2ecf20Sopenharmony_ci } 1888c2ecf20Sopenharmony_ci } else { 1898c2ecf20Sopenharmony_ci return tx; 1908c2ecf20Sopenharmony_ci } 1918c2ecf20Sopenharmony_ci 1928c2ecf20Sopenharmony_ci init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, 1938c2ecf20Sopenharmony_ci NULL, sh, (void *) (srcs + sh->disks + 2)); 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_ci if (count == 1) 1968c2ecf20Sopenharmony_ci tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE, 1978c2ecf20Sopenharmony_ci &submit); 1988c2ecf20Sopenharmony_ci else 1998c2ecf20Sopenharmony_ci tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE, 2008c2ecf20Sopenharmony_ci &submit); 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ci return tx; 2038c2ecf20Sopenharmony_ci} 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_cistatic void *ppl_io_pool_alloc(gfp_t gfp_mask, void *pool_data) 2068c2ecf20Sopenharmony_ci{ 2078c2ecf20Sopenharmony_ci struct kmem_cache *kc = pool_data; 2088c2ecf20Sopenharmony_ci struct ppl_io_unit *io; 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci io = kmem_cache_alloc(kc, gfp_mask); 2118c2ecf20Sopenharmony_ci if (!io) 2128c2ecf20Sopenharmony_ci return NULL; 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci io->header_page = alloc_page(gfp_mask); 2158c2ecf20Sopenharmony_ci if (!io->header_page) { 2168c2ecf20Sopenharmony_ci kmem_cache_free(kc, io); 2178c2ecf20Sopenharmony_ci return NULL; 2188c2ecf20Sopenharmony_ci } 2198c2ecf20Sopenharmony_ci 2208c2ecf20Sopenharmony_ci return io; 2218c2ecf20Sopenharmony_ci} 2228c2ecf20Sopenharmony_ci 2238c2ecf20Sopenharmony_cistatic void ppl_io_pool_free(void *element, void *pool_data) 2248c2ecf20Sopenharmony_ci{ 2258c2ecf20Sopenharmony_ci struct kmem_cache *kc = pool_data; 2268c2ecf20Sopenharmony_ci struct ppl_io_unit *io = element; 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_ci __free_page(io->header_page); 2298c2ecf20Sopenharmony_ci kmem_cache_free(kc, io); 2308c2ecf20Sopenharmony_ci} 2318c2ecf20Sopenharmony_ci 2328c2ecf20Sopenharmony_cistatic struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, 2338c2ecf20Sopenharmony_ci struct stripe_head *sh) 2348c2ecf20Sopenharmony_ci{ 2358c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 2368c2ecf20Sopenharmony_ci struct ppl_io_unit *io; 2378c2ecf20Sopenharmony_ci struct ppl_header *pplhdr; 2388c2ecf20Sopenharmony_ci struct page *header_page; 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_ci io = mempool_alloc(&ppl_conf->io_pool, GFP_NOWAIT); 2418c2ecf20Sopenharmony_ci if (!io) 2428c2ecf20Sopenharmony_ci return NULL; 2438c2ecf20Sopenharmony_ci 2448c2ecf20Sopenharmony_ci header_page = io->header_page; 2458c2ecf20Sopenharmony_ci memset(io, 0, sizeof(*io)); 2468c2ecf20Sopenharmony_ci io->header_page = header_page; 2478c2ecf20Sopenharmony_ci 2488c2ecf20Sopenharmony_ci io->log = log; 2498c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&io->log_sibling); 2508c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&io->stripe_list); 2518c2ecf20Sopenharmony_ci atomic_set(&io->pending_stripes, 0); 2528c2ecf20Sopenharmony_ci atomic_set(&io->pending_flushes, 0); 2538c2ecf20Sopenharmony_ci bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); 2548c2ecf20Sopenharmony_ci 2558c2ecf20Sopenharmony_ci pplhdr = page_address(io->header_page); 2568c2ecf20Sopenharmony_ci clear_page(pplhdr); 2578c2ecf20Sopenharmony_ci memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); 2588c2ecf20Sopenharmony_ci pplhdr->signature = cpu_to_le32(ppl_conf->signature); 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ci io->seq = atomic64_add_return(1, &ppl_conf->seq); 2618c2ecf20Sopenharmony_ci pplhdr->generation = cpu_to_le64(io->seq); 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_ci return io; 2648c2ecf20Sopenharmony_ci} 2658c2ecf20Sopenharmony_ci 2668c2ecf20Sopenharmony_cistatic int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) 2678c2ecf20Sopenharmony_ci{ 2688c2ecf20Sopenharmony_ci struct ppl_io_unit *io = log->current_io; 2698c2ecf20Sopenharmony_ci struct ppl_header_entry *e = NULL; 2708c2ecf20Sopenharmony_ci struct ppl_header *pplhdr; 2718c2ecf20Sopenharmony_ci int i; 2728c2ecf20Sopenharmony_ci sector_t data_sector = 0; 2738c2ecf20Sopenharmony_ci int data_disks = 0; 2748c2ecf20Sopenharmony_ci struct r5conf *conf = sh->raid_conf; 2758c2ecf20Sopenharmony_ci 2768c2ecf20Sopenharmony_ci pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector); 2778c2ecf20Sopenharmony_ci 2788c2ecf20Sopenharmony_ci /* check if current io_unit is full */ 2798c2ecf20Sopenharmony_ci if (io && (io->pp_size == log->entry_space || 2808c2ecf20Sopenharmony_ci io->entries_count == PPL_HDR_MAX_ENTRIES)) { 2818c2ecf20Sopenharmony_ci pr_debug("%s: add io_unit blocked by seq: %llu\n", 2828c2ecf20Sopenharmony_ci __func__, io->seq); 2838c2ecf20Sopenharmony_ci io = NULL; 2848c2ecf20Sopenharmony_ci } 2858c2ecf20Sopenharmony_ci 2868c2ecf20Sopenharmony_ci /* add a new unit if there is none or the current is full */ 2878c2ecf20Sopenharmony_ci if (!io) { 2888c2ecf20Sopenharmony_ci io = ppl_new_iounit(log, sh); 2898c2ecf20Sopenharmony_ci if (!io) 2908c2ecf20Sopenharmony_ci return -ENOMEM; 2918c2ecf20Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 2928c2ecf20Sopenharmony_ci list_add_tail(&io->log_sibling, &log->io_list); 2938c2ecf20Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 2948c2ecf20Sopenharmony_ci 2958c2ecf20Sopenharmony_ci log->current_io = io; 2968c2ecf20Sopenharmony_ci } 2978c2ecf20Sopenharmony_ci 2988c2ecf20Sopenharmony_ci for (i = 0; i < sh->disks; i++) { 2998c2ecf20Sopenharmony_ci struct r5dev *dev = &sh->dev[i]; 3008c2ecf20Sopenharmony_ci 3018c2ecf20Sopenharmony_ci if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) { 3028c2ecf20Sopenharmony_ci if (!data_disks || dev->sector < data_sector) 3038c2ecf20Sopenharmony_ci data_sector = dev->sector; 3048c2ecf20Sopenharmony_ci data_disks++; 3058c2ecf20Sopenharmony_ci } 3068c2ecf20Sopenharmony_ci } 3078c2ecf20Sopenharmony_ci BUG_ON(!data_disks); 3088c2ecf20Sopenharmony_ci 3098c2ecf20Sopenharmony_ci pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__, 3108c2ecf20Sopenharmony_ci io->seq, (unsigned long long)data_sector, data_disks); 3118c2ecf20Sopenharmony_ci 3128c2ecf20Sopenharmony_ci pplhdr = page_address(io->header_page); 3138c2ecf20Sopenharmony_ci 3148c2ecf20Sopenharmony_ci if (io->entries_count > 0) { 3158c2ecf20Sopenharmony_ci struct ppl_header_entry *last = 3168c2ecf20Sopenharmony_ci &pplhdr->entries[io->entries_count - 1]; 3178c2ecf20Sopenharmony_ci struct stripe_head *sh_last = list_last_entry( 3188c2ecf20Sopenharmony_ci &io->stripe_list, struct stripe_head, log_list); 3198c2ecf20Sopenharmony_ci u64 data_sector_last = le64_to_cpu(last->data_sector); 3208c2ecf20Sopenharmony_ci u32 data_size_last = le32_to_cpu(last->data_size); 3218c2ecf20Sopenharmony_ci 3228c2ecf20Sopenharmony_ci /* 3238c2ecf20Sopenharmony_ci * Check if we can append the stripe to the last entry. It must 3248c2ecf20Sopenharmony_ci * be just after the last logged stripe and write to the same 3258c2ecf20Sopenharmony_ci * disks. Use bit shift and logarithm to avoid 64-bit division. 3268c2ecf20Sopenharmony_ci */ 3278c2ecf20Sopenharmony_ci if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) && 3288c2ecf20Sopenharmony_ci (data_sector >> ilog2(conf->chunk_sectors) == 3298c2ecf20Sopenharmony_ci data_sector_last >> ilog2(conf->chunk_sectors)) && 3308c2ecf20Sopenharmony_ci ((data_sector - data_sector_last) * data_disks == 3318c2ecf20Sopenharmony_ci data_size_last >> 9)) 3328c2ecf20Sopenharmony_ci e = last; 3338c2ecf20Sopenharmony_ci } 3348c2ecf20Sopenharmony_ci 3358c2ecf20Sopenharmony_ci if (!e) { 3368c2ecf20Sopenharmony_ci e = &pplhdr->entries[io->entries_count++]; 3378c2ecf20Sopenharmony_ci e->data_sector = cpu_to_le64(data_sector); 3388c2ecf20Sopenharmony_ci e->parity_disk = cpu_to_le32(sh->pd_idx); 3398c2ecf20Sopenharmony_ci e->checksum = cpu_to_le32(~0); 3408c2ecf20Sopenharmony_ci } 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT); 3438c2ecf20Sopenharmony_ci 3448c2ecf20Sopenharmony_ci /* don't write any PP if full stripe write */ 3458c2ecf20Sopenharmony_ci if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) { 3468c2ecf20Sopenharmony_ci le32_add_cpu(&e->pp_size, PAGE_SIZE); 3478c2ecf20Sopenharmony_ci io->pp_size += PAGE_SIZE; 3488c2ecf20Sopenharmony_ci e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum), 3498c2ecf20Sopenharmony_ci page_address(sh->ppl_page), 3508c2ecf20Sopenharmony_ci PAGE_SIZE)); 3518c2ecf20Sopenharmony_ci } 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci list_add_tail(&sh->log_list, &io->stripe_list); 3548c2ecf20Sopenharmony_ci atomic_inc(&io->pending_stripes); 3558c2ecf20Sopenharmony_ci sh->ppl_io = io; 3568c2ecf20Sopenharmony_ci 3578c2ecf20Sopenharmony_ci return 0; 3588c2ecf20Sopenharmony_ci} 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ciint ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh) 3618c2ecf20Sopenharmony_ci{ 3628c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = conf->log_private; 3638c2ecf20Sopenharmony_ci struct ppl_io_unit *io = sh->ppl_io; 3648c2ecf20Sopenharmony_ci struct ppl_log *log; 3658c2ecf20Sopenharmony_ci 3668c2ecf20Sopenharmony_ci if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page || 3678c2ecf20Sopenharmony_ci !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || 3688c2ecf20Sopenharmony_ci !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) { 3698c2ecf20Sopenharmony_ci clear_bit(STRIPE_LOG_TRAPPED, &sh->state); 3708c2ecf20Sopenharmony_ci return -EAGAIN; 3718c2ecf20Sopenharmony_ci } 3728c2ecf20Sopenharmony_ci 3738c2ecf20Sopenharmony_ci log = &ppl_conf->child_logs[sh->pd_idx]; 3748c2ecf20Sopenharmony_ci 3758c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 3768c2ecf20Sopenharmony_ci 3778c2ecf20Sopenharmony_ci if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { 3788c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 3798c2ecf20Sopenharmony_ci return -EAGAIN; 3808c2ecf20Sopenharmony_ci } 3818c2ecf20Sopenharmony_ci 3828c2ecf20Sopenharmony_ci set_bit(STRIPE_LOG_TRAPPED, &sh->state); 3838c2ecf20Sopenharmony_ci clear_bit(STRIPE_DELAYED, &sh->state); 3848c2ecf20Sopenharmony_ci atomic_inc(&sh->count); 3858c2ecf20Sopenharmony_ci 3868c2ecf20Sopenharmony_ci if (ppl_log_stripe(log, sh)) { 3878c2ecf20Sopenharmony_ci spin_lock_irq(&ppl_conf->no_mem_stripes_lock); 3888c2ecf20Sopenharmony_ci list_add_tail(&sh->log_list, &ppl_conf->no_mem_stripes); 3898c2ecf20Sopenharmony_ci spin_unlock_irq(&ppl_conf->no_mem_stripes_lock); 3908c2ecf20Sopenharmony_ci } 3918c2ecf20Sopenharmony_ci 3928c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 3938c2ecf20Sopenharmony_ci 3948c2ecf20Sopenharmony_ci return 0; 3958c2ecf20Sopenharmony_ci} 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_cistatic void ppl_log_endio(struct bio *bio) 3988c2ecf20Sopenharmony_ci{ 3998c2ecf20Sopenharmony_ci struct ppl_io_unit *io = bio->bi_private; 4008c2ecf20Sopenharmony_ci struct ppl_log *log = io->log; 4018c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 4028c2ecf20Sopenharmony_ci struct stripe_head *sh, *next; 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci pr_debug("%s: seq: %llu\n", __func__, io->seq); 4058c2ecf20Sopenharmony_ci 4068c2ecf20Sopenharmony_ci if (bio->bi_status) 4078c2ecf20Sopenharmony_ci md_error(ppl_conf->mddev, log->rdev); 4088c2ecf20Sopenharmony_ci 4098c2ecf20Sopenharmony_ci list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { 4108c2ecf20Sopenharmony_ci list_del_init(&sh->log_list); 4118c2ecf20Sopenharmony_ci 4128c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 4138c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 4148c2ecf20Sopenharmony_ci } 4158c2ecf20Sopenharmony_ci} 4168c2ecf20Sopenharmony_ci 4178c2ecf20Sopenharmony_cistatic void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio) 4188c2ecf20Sopenharmony_ci{ 4198c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 4208c2ecf20Sopenharmony_ci 4218c2ecf20Sopenharmony_ci pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n", 4228c2ecf20Sopenharmony_ci __func__, io->seq, bio->bi_iter.bi_size, 4238c2ecf20Sopenharmony_ci (unsigned long long)bio->bi_iter.bi_sector, 4248c2ecf20Sopenharmony_ci bio_devname(bio, b)); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci submit_bio(bio); 4278c2ecf20Sopenharmony_ci} 4288c2ecf20Sopenharmony_ci 4298c2ecf20Sopenharmony_cistatic void ppl_submit_iounit(struct ppl_io_unit *io) 4308c2ecf20Sopenharmony_ci{ 4318c2ecf20Sopenharmony_ci struct ppl_log *log = io->log; 4328c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 4338c2ecf20Sopenharmony_ci struct ppl_header *pplhdr = page_address(io->header_page); 4348c2ecf20Sopenharmony_ci struct bio *bio = &io->bio; 4358c2ecf20Sopenharmony_ci struct stripe_head *sh; 4368c2ecf20Sopenharmony_ci int i; 4378c2ecf20Sopenharmony_ci 4388c2ecf20Sopenharmony_ci bio->bi_private = io; 4398c2ecf20Sopenharmony_ci 4408c2ecf20Sopenharmony_ci if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) { 4418c2ecf20Sopenharmony_ci ppl_log_endio(bio); 4428c2ecf20Sopenharmony_ci return; 4438c2ecf20Sopenharmony_ci } 4448c2ecf20Sopenharmony_ci 4458c2ecf20Sopenharmony_ci for (i = 0; i < io->entries_count; i++) { 4468c2ecf20Sopenharmony_ci struct ppl_header_entry *e = &pplhdr->entries[i]; 4478c2ecf20Sopenharmony_ci 4488c2ecf20Sopenharmony_ci pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n", 4498c2ecf20Sopenharmony_ci __func__, io->seq, i, le64_to_cpu(e->data_sector), 4508c2ecf20Sopenharmony_ci le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size)); 4518c2ecf20Sopenharmony_ci 4528c2ecf20Sopenharmony_ci e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >> 4538c2ecf20Sopenharmony_ci ilog2(ppl_conf->block_size >> 9)); 4548c2ecf20Sopenharmony_ci e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum)); 4558c2ecf20Sopenharmony_ci } 4568c2ecf20Sopenharmony_ci 4578c2ecf20Sopenharmony_ci pplhdr->entries_count = cpu_to_le32(io->entries_count); 4588c2ecf20Sopenharmony_ci pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE)); 4598c2ecf20Sopenharmony_ci 4608c2ecf20Sopenharmony_ci /* Rewind the buffer if current PPL is larger then remaining space */ 4618c2ecf20Sopenharmony_ci if (log->use_multippl && 4628c2ecf20Sopenharmony_ci log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector < 4638c2ecf20Sopenharmony_ci (PPL_HEADER_SIZE + io->pp_size) >> 9) 4648c2ecf20Sopenharmony_ci log->next_io_sector = log->rdev->ppl.sector; 4658c2ecf20Sopenharmony_ci 4668c2ecf20Sopenharmony_ci 4678c2ecf20Sopenharmony_ci bio->bi_end_io = ppl_log_endio; 4688c2ecf20Sopenharmony_ci bio->bi_opf = REQ_OP_WRITE | REQ_FUA; 4698c2ecf20Sopenharmony_ci bio_set_dev(bio, log->rdev->bdev); 4708c2ecf20Sopenharmony_ci bio->bi_iter.bi_sector = log->next_io_sector; 4718c2ecf20Sopenharmony_ci bio_add_page(bio, io->header_page, PAGE_SIZE, 0); 4728c2ecf20Sopenharmony_ci bio->bi_write_hint = ppl_conf->write_hint; 4738c2ecf20Sopenharmony_ci 4748c2ecf20Sopenharmony_ci pr_debug("%s: log->current_io_sector: %llu\n", __func__, 4758c2ecf20Sopenharmony_ci (unsigned long long)log->next_io_sector); 4768c2ecf20Sopenharmony_ci 4778c2ecf20Sopenharmony_ci if (log->use_multippl) 4788c2ecf20Sopenharmony_ci log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; 4798c2ecf20Sopenharmony_ci 4808c2ecf20Sopenharmony_ci WARN_ON(log->disk_flush_bitmap != 0); 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci list_for_each_entry(sh, &io->stripe_list, log_list) { 4838c2ecf20Sopenharmony_ci for (i = 0; i < sh->disks; i++) { 4848c2ecf20Sopenharmony_ci struct r5dev *dev = &sh->dev[i]; 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci if ((ppl_conf->child_logs[i].wb_cache_on) && 4878c2ecf20Sopenharmony_ci (test_bit(R5_Wantwrite, &dev->flags))) { 4888c2ecf20Sopenharmony_ci set_bit(i, &log->disk_flush_bitmap); 4898c2ecf20Sopenharmony_ci } 4908c2ecf20Sopenharmony_ci } 4918c2ecf20Sopenharmony_ci 4928c2ecf20Sopenharmony_ci /* entries for full stripe writes have no partial parity */ 4938c2ecf20Sopenharmony_ci if (test_bit(STRIPE_FULL_WRITE, &sh->state)) 4948c2ecf20Sopenharmony_ci continue; 4958c2ecf20Sopenharmony_ci 4968c2ecf20Sopenharmony_ci if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) { 4978c2ecf20Sopenharmony_ci struct bio *prev = bio; 4988c2ecf20Sopenharmony_ci 4998c2ecf20Sopenharmony_ci bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, 5008c2ecf20Sopenharmony_ci &ppl_conf->bs); 5018c2ecf20Sopenharmony_ci bio->bi_opf = prev->bi_opf; 5028c2ecf20Sopenharmony_ci bio->bi_write_hint = prev->bi_write_hint; 5038c2ecf20Sopenharmony_ci bio_copy_dev(bio, prev); 5048c2ecf20Sopenharmony_ci bio->bi_iter.bi_sector = bio_end_sector(prev); 5058c2ecf20Sopenharmony_ci bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); 5068c2ecf20Sopenharmony_ci 5078c2ecf20Sopenharmony_ci bio_chain(bio, prev); 5088c2ecf20Sopenharmony_ci ppl_submit_iounit_bio(io, prev); 5098c2ecf20Sopenharmony_ci } 5108c2ecf20Sopenharmony_ci } 5118c2ecf20Sopenharmony_ci 5128c2ecf20Sopenharmony_ci ppl_submit_iounit_bio(io, bio); 5138c2ecf20Sopenharmony_ci} 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_cistatic void ppl_submit_current_io(struct ppl_log *log) 5168c2ecf20Sopenharmony_ci{ 5178c2ecf20Sopenharmony_ci struct ppl_io_unit *io; 5188c2ecf20Sopenharmony_ci 5198c2ecf20Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 5208c2ecf20Sopenharmony_ci 5218c2ecf20Sopenharmony_ci io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, 5228c2ecf20Sopenharmony_ci log_sibling); 5238c2ecf20Sopenharmony_ci if (io && io->submitted) 5248c2ecf20Sopenharmony_ci io = NULL; 5258c2ecf20Sopenharmony_ci 5268c2ecf20Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 5278c2ecf20Sopenharmony_ci 5288c2ecf20Sopenharmony_ci if (io) { 5298c2ecf20Sopenharmony_ci io->submitted = true; 5308c2ecf20Sopenharmony_ci 5318c2ecf20Sopenharmony_ci if (io == log->current_io) 5328c2ecf20Sopenharmony_ci log->current_io = NULL; 5338c2ecf20Sopenharmony_ci 5348c2ecf20Sopenharmony_ci ppl_submit_iounit(io); 5358c2ecf20Sopenharmony_ci } 5368c2ecf20Sopenharmony_ci} 5378c2ecf20Sopenharmony_ci 5388c2ecf20Sopenharmony_civoid ppl_write_stripe_run(struct r5conf *conf) 5398c2ecf20Sopenharmony_ci{ 5408c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = conf->log_private; 5418c2ecf20Sopenharmony_ci struct ppl_log *log; 5428c2ecf20Sopenharmony_ci int i; 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci for (i = 0; i < ppl_conf->count; i++) { 5458c2ecf20Sopenharmony_ci log = &ppl_conf->child_logs[i]; 5468c2ecf20Sopenharmony_ci 5478c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 5488c2ecf20Sopenharmony_ci ppl_submit_current_io(log); 5498c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 5508c2ecf20Sopenharmony_ci } 5518c2ecf20Sopenharmony_ci} 5528c2ecf20Sopenharmony_ci 5538c2ecf20Sopenharmony_cistatic void ppl_io_unit_finished(struct ppl_io_unit *io) 5548c2ecf20Sopenharmony_ci{ 5558c2ecf20Sopenharmony_ci struct ppl_log *log = io->log; 5568c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 5578c2ecf20Sopenharmony_ci struct r5conf *conf = ppl_conf->mddev->private; 5588c2ecf20Sopenharmony_ci unsigned long flags; 5598c2ecf20Sopenharmony_ci 5608c2ecf20Sopenharmony_ci pr_debug("%s: seq: %llu\n", __func__, io->seq); 5618c2ecf20Sopenharmony_ci 5628c2ecf20Sopenharmony_ci local_irq_save(flags); 5638c2ecf20Sopenharmony_ci 5648c2ecf20Sopenharmony_ci spin_lock(&log->io_list_lock); 5658c2ecf20Sopenharmony_ci list_del(&io->log_sibling); 5668c2ecf20Sopenharmony_ci spin_unlock(&log->io_list_lock); 5678c2ecf20Sopenharmony_ci 5688c2ecf20Sopenharmony_ci mempool_free(io, &ppl_conf->io_pool); 5698c2ecf20Sopenharmony_ci 5708c2ecf20Sopenharmony_ci spin_lock(&ppl_conf->no_mem_stripes_lock); 5718c2ecf20Sopenharmony_ci if (!list_empty(&ppl_conf->no_mem_stripes)) { 5728c2ecf20Sopenharmony_ci struct stripe_head *sh; 5738c2ecf20Sopenharmony_ci 5748c2ecf20Sopenharmony_ci sh = list_first_entry(&ppl_conf->no_mem_stripes, 5758c2ecf20Sopenharmony_ci struct stripe_head, log_list); 5768c2ecf20Sopenharmony_ci list_del_init(&sh->log_list); 5778c2ecf20Sopenharmony_ci set_bit(STRIPE_HANDLE, &sh->state); 5788c2ecf20Sopenharmony_ci raid5_release_stripe(sh); 5798c2ecf20Sopenharmony_ci } 5808c2ecf20Sopenharmony_ci spin_unlock(&ppl_conf->no_mem_stripes_lock); 5818c2ecf20Sopenharmony_ci 5828c2ecf20Sopenharmony_ci local_irq_restore(flags); 5838c2ecf20Sopenharmony_ci 5848c2ecf20Sopenharmony_ci wake_up(&conf->wait_for_quiescent); 5858c2ecf20Sopenharmony_ci} 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_cistatic void ppl_flush_endio(struct bio *bio) 5888c2ecf20Sopenharmony_ci{ 5898c2ecf20Sopenharmony_ci struct ppl_io_unit *io = bio->bi_private; 5908c2ecf20Sopenharmony_ci struct ppl_log *log = io->log; 5918c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 5928c2ecf20Sopenharmony_ci struct r5conf *conf = ppl_conf->mddev->private; 5938c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b)); 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_ci if (bio->bi_status) { 5988c2ecf20Sopenharmony_ci struct md_rdev *rdev; 5998c2ecf20Sopenharmony_ci 6008c2ecf20Sopenharmony_ci rcu_read_lock(); 6018c2ecf20Sopenharmony_ci rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio)); 6028c2ecf20Sopenharmony_ci if (rdev) 6038c2ecf20Sopenharmony_ci md_error(rdev->mddev, rdev); 6048c2ecf20Sopenharmony_ci rcu_read_unlock(); 6058c2ecf20Sopenharmony_ci } 6068c2ecf20Sopenharmony_ci 6078c2ecf20Sopenharmony_ci bio_put(bio); 6088c2ecf20Sopenharmony_ci 6098c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&io->pending_flushes)) { 6108c2ecf20Sopenharmony_ci ppl_io_unit_finished(io); 6118c2ecf20Sopenharmony_ci md_wakeup_thread(conf->mddev->thread); 6128c2ecf20Sopenharmony_ci } 6138c2ecf20Sopenharmony_ci} 6148c2ecf20Sopenharmony_ci 6158c2ecf20Sopenharmony_cistatic void ppl_do_flush(struct ppl_io_unit *io) 6168c2ecf20Sopenharmony_ci{ 6178c2ecf20Sopenharmony_ci struct ppl_log *log = io->log; 6188c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 6198c2ecf20Sopenharmony_ci struct r5conf *conf = ppl_conf->mddev->private; 6208c2ecf20Sopenharmony_ci int raid_disks = conf->raid_disks; 6218c2ecf20Sopenharmony_ci int flushed_disks = 0; 6228c2ecf20Sopenharmony_ci int i; 6238c2ecf20Sopenharmony_ci 6248c2ecf20Sopenharmony_ci atomic_set(&io->pending_flushes, raid_disks); 6258c2ecf20Sopenharmony_ci 6268c2ecf20Sopenharmony_ci for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) { 6278c2ecf20Sopenharmony_ci struct md_rdev *rdev; 6288c2ecf20Sopenharmony_ci struct block_device *bdev = NULL; 6298c2ecf20Sopenharmony_ci 6308c2ecf20Sopenharmony_ci rcu_read_lock(); 6318c2ecf20Sopenharmony_ci rdev = rcu_dereference(conf->disks[i].rdev); 6328c2ecf20Sopenharmony_ci if (rdev && !test_bit(Faulty, &rdev->flags)) 6338c2ecf20Sopenharmony_ci bdev = rdev->bdev; 6348c2ecf20Sopenharmony_ci rcu_read_unlock(); 6358c2ecf20Sopenharmony_ci 6368c2ecf20Sopenharmony_ci if (bdev) { 6378c2ecf20Sopenharmony_ci struct bio *bio; 6388c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 6398c2ecf20Sopenharmony_ci 6408c2ecf20Sopenharmony_ci bio = bio_alloc_bioset(GFP_NOIO, 0, &ppl_conf->flush_bs); 6418c2ecf20Sopenharmony_ci bio_set_dev(bio, bdev); 6428c2ecf20Sopenharmony_ci bio->bi_private = io; 6438c2ecf20Sopenharmony_ci bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; 6448c2ecf20Sopenharmony_ci bio->bi_end_io = ppl_flush_endio; 6458c2ecf20Sopenharmony_ci 6468c2ecf20Sopenharmony_ci pr_debug("%s: dev: %s\n", __func__, 6478c2ecf20Sopenharmony_ci bio_devname(bio, b)); 6488c2ecf20Sopenharmony_ci 6498c2ecf20Sopenharmony_ci submit_bio(bio); 6508c2ecf20Sopenharmony_ci flushed_disks++; 6518c2ecf20Sopenharmony_ci } 6528c2ecf20Sopenharmony_ci } 6538c2ecf20Sopenharmony_ci 6548c2ecf20Sopenharmony_ci log->disk_flush_bitmap = 0; 6558c2ecf20Sopenharmony_ci 6568c2ecf20Sopenharmony_ci for (i = flushed_disks ; i < raid_disks; i++) { 6578c2ecf20Sopenharmony_ci if (atomic_dec_and_test(&io->pending_flushes)) 6588c2ecf20Sopenharmony_ci ppl_io_unit_finished(io); 6598c2ecf20Sopenharmony_ci } 6608c2ecf20Sopenharmony_ci} 6618c2ecf20Sopenharmony_ci 6628c2ecf20Sopenharmony_cistatic inline bool ppl_no_io_unit_submitted(struct r5conf *conf, 6638c2ecf20Sopenharmony_ci struct ppl_log *log) 6648c2ecf20Sopenharmony_ci{ 6658c2ecf20Sopenharmony_ci struct ppl_io_unit *io; 6668c2ecf20Sopenharmony_ci 6678c2ecf20Sopenharmony_ci io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit, 6688c2ecf20Sopenharmony_ci log_sibling); 6698c2ecf20Sopenharmony_ci 6708c2ecf20Sopenharmony_ci return !io || !io->submitted; 6718c2ecf20Sopenharmony_ci} 6728c2ecf20Sopenharmony_ci 6738c2ecf20Sopenharmony_civoid ppl_quiesce(struct r5conf *conf, int quiesce) 6748c2ecf20Sopenharmony_ci{ 6758c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = conf->log_private; 6768c2ecf20Sopenharmony_ci int i; 6778c2ecf20Sopenharmony_ci 6788c2ecf20Sopenharmony_ci if (quiesce) { 6798c2ecf20Sopenharmony_ci for (i = 0; i < ppl_conf->count; i++) { 6808c2ecf20Sopenharmony_ci struct ppl_log *log = &ppl_conf->child_logs[i]; 6818c2ecf20Sopenharmony_ci 6828c2ecf20Sopenharmony_ci spin_lock_irq(&log->io_list_lock); 6838c2ecf20Sopenharmony_ci wait_event_lock_irq(conf->wait_for_quiescent, 6848c2ecf20Sopenharmony_ci ppl_no_io_unit_submitted(conf, log), 6858c2ecf20Sopenharmony_ci log->io_list_lock); 6868c2ecf20Sopenharmony_ci spin_unlock_irq(&log->io_list_lock); 6878c2ecf20Sopenharmony_ci } 6888c2ecf20Sopenharmony_ci } 6898c2ecf20Sopenharmony_ci} 6908c2ecf20Sopenharmony_ci 6918c2ecf20Sopenharmony_ciint ppl_handle_flush_request(struct r5l_log *log, struct bio *bio) 6928c2ecf20Sopenharmony_ci{ 6938c2ecf20Sopenharmony_ci if (bio->bi_iter.bi_size == 0) { 6948c2ecf20Sopenharmony_ci bio_endio(bio); 6958c2ecf20Sopenharmony_ci return 0; 6968c2ecf20Sopenharmony_ci } 6978c2ecf20Sopenharmony_ci bio->bi_opf &= ~REQ_PREFLUSH; 6988c2ecf20Sopenharmony_ci return -EAGAIN; 6998c2ecf20Sopenharmony_ci} 7008c2ecf20Sopenharmony_ci 7018c2ecf20Sopenharmony_civoid ppl_stripe_write_finished(struct stripe_head *sh) 7028c2ecf20Sopenharmony_ci{ 7038c2ecf20Sopenharmony_ci struct ppl_io_unit *io; 7048c2ecf20Sopenharmony_ci 7058c2ecf20Sopenharmony_ci io = sh->ppl_io; 7068c2ecf20Sopenharmony_ci sh->ppl_io = NULL; 7078c2ecf20Sopenharmony_ci 7088c2ecf20Sopenharmony_ci if (io && atomic_dec_and_test(&io->pending_stripes)) { 7098c2ecf20Sopenharmony_ci if (io->log->disk_flush_bitmap) 7108c2ecf20Sopenharmony_ci ppl_do_flush(io); 7118c2ecf20Sopenharmony_ci else 7128c2ecf20Sopenharmony_ci ppl_io_unit_finished(io); 7138c2ecf20Sopenharmony_ci } 7148c2ecf20Sopenharmony_ci} 7158c2ecf20Sopenharmony_ci 7168c2ecf20Sopenharmony_cistatic void ppl_xor(int size, struct page *page1, struct page *page2) 7178c2ecf20Sopenharmony_ci{ 7188c2ecf20Sopenharmony_ci struct async_submit_ctl submit; 7198c2ecf20Sopenharmony_ci struct dma_async_tx_descriptor *tx; 7208c2ecf20Sopenharmony_ci struct page *xor_srcs[] = { page1, page2 }; 7218c2ecf20Sopenharmony_ci 7228c2ecf20Sopenharmony_ci init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST, 7238c2ecf20Sopenharmony_ci NULL, NULL, NULL, NULL); 7248c2ecf20Sopenharmony_ci tx = async_xor(page1, xor_srcs, 0, 2, size, &submit); 7258c2ecf20Sopenharmony_ci 7268c2ecf20Sopenharmony_ci async_tx_quiesce(&tx); 7278c2ecf20Sopenharmony_ci} 7288c2ecf20Sopenharmony_ci 7298c2ecf20Sopenharmony_ci/* 7308c2ecf20Sopenharmony_ci * PPL recovery strategy: xor partial parity and data from all modified data 7318c2ecf20Sopenharmony_ci * disks within a stripe and write the result as the new stripe parity. If all 7328c2ecf20Sopenharmony_ci * stripe data disks are modified (full stripe write), no partial parity is 7338c2ecf20Sopenharmony_ci * available, so just xor the data disks. 7348c2ecf20Sopenharmony_ci * 7358c2ecf20Sopenharmony_ci * Recovery of a PPL entry shall occur only if all modified data disks are 7368c2ecf20Sopenharmony_ci * available and read from all of them succeeds. 7378c2ecf20Sopenharmony_ci * 7388c2ecf20Sopenharmony_ci * A PPL entry applies to a stripe, partial parity size for an entry is at most 7398c2ecf20Sopenharmony_ci * the size of the chunk. Examples of possible cases for a single entry: 7408c2ecf20Sopenharmony_ci * 7418c2ecf20Sopenharmony_ci * case 0: single data disk write: 7428c2ecf20Sopenharmony_ci * data0 data1 data2 ppl parity 7438c2ecf20Sopenharmony_ci * +--------+--------+--------+ +--------------------+ 7448c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | +----+ | (no change) | 7458c2ecf20Sopenharmony_ci * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | 7468c2ecf20Sopenharmony_ci * | ------ | -data- | ------ | | pp | -> | data1 ^ pp | 7478c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | +----+ | (no change) | 7488c2ecf20Sopenharmony_ci * +--------+--------+--------+ +--------------------+ 7498c2ecf20Sopenharmony_ci * pp_size = data_size 7508c2ecf20Sopenharmony_ci * 7518c2ecf20Sopenharmony_ci * case 1: more than one data disk write: 7528c2ecf20Sopenharmony_ci * data0 data1 data2 ppl parity 7538c2ecf20Sopenharmony_ci * +--------+--------+--------+ +--------------------+ 7548c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | +----+ | (no change) | 7558c2ecf20Sopenharmony_ci * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | 7568c2ecf20Sopenharmony_ci * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp | 7578c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | +----+ | (no change) | 7588c2ecf20Sopenharmony_ci * +--------+--------+--------+ +--------------------+ 7598c2ecf20Sopenharmony_ci * pp_size = data_size / modified_data_disks 7608c2ecf20Sopenharmony_ci * 7618c2ecf20Sopenharmony_ci * case 2: write to all data disks (also full stripe write): 7628c2ecf20Sopenharmony_ci * data0 data1 data2 parity 7638c2ecf20Sopenharmony_ci * +--------+--------+--------+ +--------------------+ 7648c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | | (no change) | 7658c2ecf20Sopenharmony_ci * | -data- | -data- | -data- | --------> | xor all data | 7668c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | --------> | (no change) | 7678c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | | (no change) | 7688c2ecf20Sopenharmony_ci * +--------+--------+--------+ +--------------------+ 7698c2ecf20Sopenharmony_ci * pp_size = 0 7708c2ecf20Sopenharmony_ci * 7718c2ecf20Sopenharmony_ci * The following cases are possible only in other implementations. The recovery 7728c2ecf20Sopenharmony_ci * code can handle them, but they are not generated at runtime because they can 7738c2ecf20Sopenharmony_ci * be reduced to cases 0, 1 and 2: 7748c2ecf20Sopenharmony_ci * 7758c2ecf20Sopenharmony_ci * case 3: 7768c2ecf20Sopenharmony_ci * data0 data1 data2 ppl parity 7778c2ecf20Sopenharmony_ci * +--------+--------+--------+ +----+ +--------------------+ 7788c2ecf20Sopenharmony_ci * | ------ | -data- | -data- | | pp | | data1 ^ data2 ^ pp | 7798c2ecf20Sopenharmony_ci * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp | 7808c2ecf20Sopenharmony_ci * | -data- | -data- | -data- | | -- | -> | xor all data | 7818c2ecf20Sopenharmony_ci * | -data- | -data- | ------ | | pp | | data0 ^ data1 ^ pp | 7828c2ecf20Sopenharmony_ci * +--------+--------+--------+ +----+ +--------------------+ 7838c2ecf20Sopenharmony_ci * pp_size = chunk_size 7848c2ecf20Sopenharmony_ci * 7858c2ecf20Sopenharmony_ci * case 4: 7868c2ecf20Sopenharmony_ci * data0 data1 data2 ppl parity 7878c2ecf20Sopenharmony_ci * +--------+--------+--------+ +----+ +--------------------+ 7888c2ecf20Sopenharmony_ci * | ------ | -data- | ------ | | pp | | data1 ^ pp | 7898c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | | -- | -> | (no change) | 7908c2ecf20Sopenharmony_ci * | ------ | ------ | ------ | | -- | -> | (no change) | 7918c2ecf20Sopenharmony_ci * | -data- | ------ | ------ | | pp | | data0 ^ pp | 7928c2ecf20Sopenharmony_ci * +--------+--------+--------+ +----+ +--------------------+ 7938c2ecf20Sopenharmony_ci * pp_size = chunk_size 7948c2ecf20Sopenharmony_ci */ 7958c2ecf20Sopenharmony_cistatic int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, 7968c2ecf20Sopenharmony_ci sector_t ppl_sector) 7978c2ecf20Sopenharmony_ci{ 7988c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 7998c2ecf20Sopenharmony_ci struct mddev *mddev = ppl_conf->mddev; 8008c2ecf20Sopenharmony_ci struct r5conf *conf = mddev->private; 8018c2ecf20Sopenharmony_ci int block_size = ppl_conf->block_size; 8028c2ecf20Sopenharmony_ci struct page *page1; 8038c2ecf20Sopenharmony_ci struct page *page2; 8048c2ecf20Sopenharmony_ci sector_t r_sector_first; 8058c2ecf20Sopenharmony_ci sector_t r_sector_last; 8068c2ecf20Sopenharmony_ci int strip_sectors; 8078c2ecf20Sopenharmony_ci int data_disks; 8088c2ecf20Sopenharmony_ci int i; 8098c2ecf20Sopenharmony_ci int ret = 0; 8108c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 8118c2ecf20Sopenharmony_ci unsigned int pp_size = le32_to_cpu(e->pp_size); 8128c2ecf20Sopenharmony_ci unsigned int data_size = le32_to_cpu(e->data_size); 8138c2ecf20Sopenharmony_ci 8148c2ecf20Sopenharmony_ci page1 = alloc_page(GFP_KERNEL); 8158c2ecf20Sopenharmony_ci page2 = alloc_page(GFP_KERNEL); 8168c2ecf20Sopenharmony_ci 8178c2ecf20Sopenharmony_ci if (!page1 || !page2) { 8188c2ecf20Sopenharmony_ci ret = -ENOMEM; 8198c2ecf20Sopenharmony_ci goto out; 8208c2ecf20Sopenharmony_ci } 8218c2ecf20Sopenharmony_ci 8228c2ecf20Sopenharmony_ci r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9); 8238c2ecf20Sopenharmony_ci 8248c2ecf20Sopenharmony_ci if ((pp_size >> 9) < conf->chunk_sectors) { 8258c2ecf20Sopenharmony_ci if (pp_size > 0) { 8268c2ecf20Sopenharmony_ci data_disks = data_size / pp_size; 8278c2ecf20Sopenharmony_ci strip_sectors = pp_size >> 9; 8288c2ecf20Sopenharmony_ci } else { 8298c2ecf20Sopenharmony_ci data_disks = conf->raid_disks - conf->max_degraded; 8308c2ecf20Sopenharmony_ci strip_sectors = (data_size >> 9) / data_disks; 8318c2ecf20Sopenharmony_ci } 8328c2ecf20Sopenharmony_ci r_sector_last = r_sector_first + 8338c2ecf20Sopenharmony_ci (data_disks - 1) * conf->chunk_sectors + 8348c2ecf20Sopenharmony_ci strip_sectors; 8358c2ecf20Sopenharmony_ci } else { 8368c2ecf20Sopenharmony_ci data_disks = conf->raid_disks - conf->max_degraded; 8378c2ecf20Sopenharmony_ci strip_sectors = conf->chunk_sectors; 8388c2ecf20Sopenharmony_ci r_sector_last = r_sector_first + (data_size >> 9); 8398c2ecf20Sopenharmony_ci } 8408c2ecf20Sopenharmony_ci 8418c2ecf20Sopenharmony_ci pr_debug("%s: array sector first: %llu last: %llu\n", __func__, 8428c2ecf20Sopenharmony_ci (unsigned long long)r_sector_first, 8438c2ecf20Sopenharmony_ci (unsigned long long)r_sector_last); 8448c2ecf20Sopenharmony_ci 8458c2ecf20Sopenharmony_ci /* if start and end is 4k aligned, use a 4k block */ 8468c2ecf20Sopenharmony_ci if (block_size == 512 && 8478c2ecf20Sopenharmony_ci (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 && 8488c2ecf20Sopenharmony_ci (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0) 8498c2ecf20Sopenharmony_ci block_size = RAID5_STRIPE_SIZE(conf); 8508c2ecf20Sopenharmony_ci 8518c2ecf20Sopenharmony_ci /* iterate through blocks in strip */ 8528c2ecf20Sopenharmony_ci for (i = 0; i < strip_sectors; i += (block_size >> 9)) { 8538c2ecf20Sopenharmony_ci bool update_parity = false; 8548c2ecf20Sopenharmony_ci sector_t parity_sector; 8558c2ecf20Sopenharmony_ci struct md_rdev *parity_rdev; 8568c2ecf20Sopenharmony_ci struct stripe_head sh; 8578c2ecf20Sopenharmony_ci int disk; 8588c2ecf20Sopenharmony_ci int indent = 0; 8598c2ecf20Sopenharmony_ci 8608c2ecf20Sopenharmony_ci pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i); 8618c2ecf20Sopenharmony_ci indent += 2; 8628c2ecf20Sopenharmony_ci 8638c2ecf20Sopenharmony_ci memset(page_address(page1), 0, PAGE_SIZE); 8648c2ecf20Sopenharmony_ci 8658c2ecf20Sopenharmony_ci /* iterate through data member disks */ 8668c2ecf20Sopenharmony_ci for (disk = 0; disk < data_disks; disk++) { 8678c2ecf20Sopenharmony_ci int dd_idx; 8688c2ecf20Sopenharmony_ci struct md_rdev *rdev; 8698c2ecf20Sopenharmony_ci sector_t sector; 8708c2ecf20Sopenharmony_ci sector_t r_sector = r_sector_first + i + 8718c2ecf20Sopenharmony_ci (disk * conf->chunk_sectors); 8728c2ecf20Sopenharmony_ci 8738c2ecf20Sopenharmony_ci pr_debug("%s:%*s data member disk %d start\n", 8748c2ecf20Sopenharmony_ci __func__, indent, "", disk); 8758c2ecf20Sopenharmony_ci indent += 2; 8768c2ecf20Sopenharmony_ci 8778c2ecf20Sopenharmony_ci if (r_sector >= r_sector_last) { 8788c2ecf20Sopenharmony_ci pr_debug("%s:%*s array sector %llu doesn't need parity update\n", 8798c2ecf20Sopenharmony_ci __func__, indent, "", 8808c2ecf20Sopenharmony_ci (unsigned long long)r_sector); 8818c2ecf20Sopenharmony_ci indent -= 2; 8828c2ecf20Sopenharmony_ci continue; 8838c2ecf20Sopenharmony_ci } 8848c2ecf20Sopenharmony_ci 8858c2ecf20Sopenharmony_ci update_parity = true; 8868c2ecf20Sopenharmony_ci 8878c2ecf20Sopenharmony_ci /* map raid sector to member disk */ 8888c2ecf20Sopenharmony_ci sector = raid5_compute_sector(conf, r_sector, 0, 8898c2ecf20Sopenharmony_ci &dd_idx, NULL); 8908c2ecf20Sopenharmony_ci pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n", 8918c2ecf20Sopenharmony_ci __func__, indent, "", 8928c2ecf20Sopenharmony_ci (unsigned long long)r_sector, dd_idx, 8938c2ecf20Sopenharmony_ci (unsigned long long)sector); 8948c2ecf20Sopenharmony_ci 8958c2ecf20Sopenharmony_ci rdev = conf->disks[dd_idx].rdev; 8968c2ecf20Sopenharmony_ci if (!rdev || (!test_bit(In_sync, &rdev->flags) && 8978c2ecf20Sopenharmony_ci sector >= rdev->recovery_offset)) { 8988c2ecf20Sopenharmony_ci pr_debug("%s:%*s data member disk %d missing\n", 8998c2ecf20Sopenharmony_ci __func__, indent, "", dd_idx); 9008c2ecf20Sopenharmony_ci update_parity = false; 9018c2ecf20Sopenharmony_ci break; 9028c2ecf20Sopenharmony_ci } 9038c2ecf20Sopenharmony_ci 9048c2ecf20Sopenharmony_ci pr_debug("%s:%*s reading data member disk %s sector %llu\n", 9058c2ecf20Sopenharmony_ci __func__, indent, "", bdevname(rdev->bdev, b), 9068c2ecf20Sopenharmony_ci (unsigned long long)sector); 9078c2ecf20Sopenharmony_ci if (!sync_page_io(rdev, sector, block_size, page2, 9088c2ecf20Sopenharmony_ci REQ_OP_READ, 0, false)) { 9098c2ecf20Sopenharmony_ci md_error(mddev, rdev); 9108c2ecf20Sopenharmony_ci pr_debug("%s:%*s read failed!\n", __func__, 9118c2ecf20Sopenharmony_ci indent, ""); 9128c2ecf20Sopenharmony_ci ret = -EIO; 9138c2ecf20Sopenharmony_ci goto out; 9148c2ecf20Sopenharmony_ci } 9158c2ecf20Sopenharmony_ci 9168c2ecf20Sopenharmony_ci ppl_xor(block_size, page1, page2); 9178c2ecf20Sopenharmony_ci 9188c2ecf20Sopenharmony_ci indent -= 2; 9198c2ecf20Sopenharmony_ci } 9208c2ecf20Sopenharmony_ci 9218c2ecf20Sopenharmony_ci if (!update_parity) 9228c2ecf20Sopenharmony_ci continue; 9238c2ecf20Sopenharmony_ci 9248c2ecf20Sopenharmony_ci if (pp_size > 0) { 9258c2ecf20Sopenharmony_ci pr_debug("%s:%*s reading pp disk sector %llu\n", 9268c2ecf20Sopenharmony_ci __func__, indent, "", 9278c2ecf20Sopenharmony_ci (unsigned long long)(ppl_sector + i)); 9288c2ecf20Sopenharmony_ci if (!sync_page_io(log->rdev, 9298c2ecf20Sopenharmony_ci ppl_sector - log->rdev->data_offset + i, 9308c2ecf20Sopenharmony_ci block_size, page2, REQ_OP_READ, 0, 9318c2ecf20Sopenharmony_ci false)) { 9328c2ecf20Sopenharmony_ci pr_debug("%s:%*s read failed!\n", __func__, 9338c2ecf20Sopenharmony_ci indent, ""); 9348c2ecf20Sopenharmony_ci md_error(mddev, log->rdev); 9358c2ecf20Sopenharmony_ci ret = -EIO; 9368c2ecf20Sopenharmony_ci goto out; 9378c2ecf20Sopenharmony_ci } 9388c2ecf20Sopenharmony_ci 9398c2ecf20Sopenharmony_ci ppl_xor(block_size, page1, page2); 9408c2ecf20Sopenharmony_ci } 9418c2ecf20Sopenharmony_ci 9428c2ecf20Sopenharmony_ci /* map raid sector to parity disk */ 9438c2ecf20Sopenharmony_ci parity_sector = raid5_compute_sector(conf, r_sector_first + i, 9448c2ecf20Sopenharmony_ci 0, &disk, &sh); 9458c2ecf20Sopenharmony_ci BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk)); 9468c2ecf20Sopenharmony_ci parity_rdev = conf->disks[sh.pd_idx].rdev; 9478c2ecf20Sopenharmony_ci 9488c2ecf20Sopenharmony_ci BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev); 9498c2ecf20Sopenharmony_ci pr_debug("%s:%*s write parity at sector %llu, disk %s\n", 9508c2ecf20Sopenharmony_ci __func__, indent, "", 9518c2ecf20Sopenharmony_ci (unsigned long long)parity_sector, 9528c2ecf20Sopenharmony_ci bdevname(parity_rdev->bdev, b)); 9538c2ecf20Sopenharmony_ci if (!sync_page_io(parity_rdev, parity_sector, block_size, 9548c2ecf20Sopenharmony_ci page1, REQ_OP_WRITE, 0, false)) { 9558c2ecf20Sopenharmony_ci pr_debug("%s:%*s parity write error!\n", __func__, 9568c2ecf20Sopenharmony_ci indent, ""); 9578c2ecf20Sopenharmony_ci md_error(mddev, parity_rdev); 9588c2ecf20Sopenharmony_ci ret = -EIO; 9598c2ecf20Sopenharmony_ci goto out; 9608c2ecf20Sopenharmony_ci } 9618c2ecf20Sopenharmony_ci } 9628c2ecf20Sopenharmony_ciout: 9638c2ecf20Sopenharmony_ci if (page1) 9648c2ecf20Sopenharmony_ci __free_page(page1); 9658c2ecf20Sopenharmony_ci if (page2) 9668c2ecf20Sopenharmony_ci __free_page(page2); 9678c2ecf20Sopenharmony_ci return ret; 9688c2ecf20Sopenharmony_ci} 9698c2ecf20Sopenharmony_ci 9708c2ecf20Sopenharmony_cistatic int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr, 9718c2ecf20Sopenharmony_ci sector_t offset) 9728c2ecf20Sopenharmony_ci{ 9738c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 9748c2ecf20Sopenharmony_ci struct md_rdev *rdev = log->rdev; 9758c2ecf20Sopenharmony_ci struct mddev *mddev = rdev->mddev; 9768c2ecf20Sopenharmony_ci sector_t ppl_sector = rdev->ppl.sector + offset + 9778c2ecf20Sopenharmony_ci (PPL_HEADER_SIZE >> 9); 9788c2ecf20Sopenharmony_ci struct page *page; 9798c2ecf20Sopenharmony_ci int i; 9808c2ecf20Sopenharmony_ci int ret = 0; 9818c2ecf20Sopenharmony_ci 9828c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 9838c2ecf20Sopenharmony_ci if (!page) 9848c2ecf20Sopenharmony_ci return -ENOMEM; 9858c2ecf20Sopenharmony_ci 9868c2ecf20Sopenharmony_ci /* iterate through all PPL entries saved */ 9878c2ecf20Sopenharmony_ci for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) { 9888c2ecf20Sopenharmony_ci struct ppl_header_entry *e = &pplhdr->entries[i]; 9898c2ecf20Sopenharmony_ci u32 pp_size = le32_to_cpu(e->pp_size); 9908c2ecf20Sopenharmony_ci sector_t sector = ppl_sector; 9918c2ecf20Sopenharmony_ci int ppl_entry_sectors = pp_size >> 9; 9928c2ecf20Sopenharmony_ci u32 crc, crc_stored; 9938c2ecf20Sopenharmony_ci 9948c2ecf20Sopenharmony_ci pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n", 9958c2ecf20Sopenharmony_ci __func__, rdev->raid_disk, i, 9968c2ecf20Sopenharmony_ci (unsigned long long)ppl_sector, pp_size); 9978c2ecf20Sopenharmony_ci 9988c2ecf20Sopenharmony_ci crc = ~0; 9998c2ecf20Sopenharmony_ci crc_stored = le32_to_cpu(e->checksum); 10008c2ecf20Sopenharmony_ci 10018c2ecf20Sopenharmony_ci /* read parial parity for this entry and calculate its checksum */ 10028c2ecf20Sopenharmony_ci while (pp_size) { 10038c2ecf20Sopenharmony_ci int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size; 10048c2ecf20Sopenharmony_ci 10058c2ecf20Sopenharmony_ci if (!sync_page_io(rdev, sector - rdev->data_offset, 10068c2ecf20Sopenharmony_ci s, page, REQ_OP_READ, 0, false)) { 10078c2ecf20Sopenharmony_ci md_error(mddev, rdev); 10088c2ecf20Sopenharmony_ci ret = -EIO; 10098c2ecf20Sopenharmony_ci goto out; 10108c2ecf20Sopenharmony_ci } 10118c2ecf20Sopenharmony_ci 10128c2ecf20Sopenharmony_ci crc = crc32c_le(crc, page_address(page), s); 10138c2ecf20Sopenharmony_ci 10148c2ecf20Sopenharmony_ci pp_size -= s; 10158c2ecf20Sopenharmony_ci sector += s >> 9; 10168c2ecf20Sopenharmony_ci } 10178c2ecf20Sopenharmony_ci 10188c2ecf20Sopenharmony_ci crc = ~crc; 10198c2ecf20Sopenharmony_ci 10208c2ecf20Sopenharmony_ci if (crc != crc_stored) { 10218c2ecf20Sopenharmony_ci /* 10228c2ecf20Sopenharmony_ci * Don't recover this entry if the checksum does not 10238c2ecf20Sopenharmony_ci * match, but keep going and try to recover other 10248c2ecf20Sopenharmony_ci * entries. 10258c2ecf20Sopenharmony_ci */ 10268c2ecf20Sopenharmony_ci pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n", 10278c2ecf20Sopenharmony_ci __func__, crc_stored, crc); 10288c2ecf20Sopenharmony_ci ppl_conf->mismatch_count++; 10298c2ecf20Sopenharmony_ci } else { 10308c2ecf20Sopenharmony_ci ret = ppl_recover_entry(log, e, ppl_sector); 10318c2ecf20Sopenharmony_ci if (ret) 10328c2ecf20Sopenharmony_ci goto out; 10338c2ecf20Sopenharmony_ci ppl_conf->recovered_entries++; 10348c2ecf20Sopenharmony_ci } 10358c2ecf20Sopenharmony_ci 10368c2ecf20Sopenharmony_ci ppl_sector += ppl_entry_sectors; 10378c2ecf20Sopenharmony_ci } 10388c2ecf20Sopenharmony_ci 10398c2ecf20Sopenharmony_ci /* flush the disk cache after recovery if necessary */ 10408c2ecf20Sopenharmony_ci ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL); 10418c2ecf20Sopenharmony_ciout: 10428c2ecf20Sopenharmony_ci __free_page(page); 10438c2ecf20Sopenharmony_ci return ret; 10448c2ecf20Sopenharmony_ci} 10458c2ecf20Sopenharmony_ci 10468c2ecf20Sopenharmony_cistatic int ppl_write_empty_header(struct ppl_log *log) 10478c2ecf20Sopenharmony_ci{ 10488c2ecf20Sopenharmony_ci struct page *page; 10498c2ecf20Sopenharmony_ci struct ppl_header *pplhdr; 10508c2ecf20Sopenharmony_ci struct md_rdev *rdev = log->rdev; 10518c2ecf20Sopenharmony_ci int ret = 0; 10528c2ecf20Sopenharmony_ci 10538c2ecf20Sopenharmony_ci pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__, 10548c2ecf20Sopenharmony_ci rdev->raid_disk, (unsigned long long)rdev->ppl.sector); 10558c2ecf20Sopenharmony_ci 10568c2ecf20Sopenharmony_ci page = alloc_page(GFP_NOIO | __GFP_ZERO); 10578c2ecf20Sopenharmony_ci if (!page) 10588c2ecf20Sopenharmony_ci return -ENOMEM; 10598c2ecf20Sopenharmony_ci 10608c2ecf20Sopenharmony_ci pplhdr = page_address(page); 10618c2ecf20Sopenharmony_ci /* zero out PPL space to avoid collision with old PPLs */ 10628c2ecf20Sopenharmony_ci blkdev_issue_zeroout(rdev->bdev, rdev->ppl.sector, 10638c2ecf20Sopenharmony_ci log->rdev->ppl.size, GFP_NOIO, 0); 10648c2ecf20Sopenharmony_ci memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED); 10658c2ecf20Sopenharmony_ci pplhdr->signature = cpu_to_le32(log->ppl_conf->signature); 10668c2ecf20Sopenharmony_ci pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE)); 10678c2ecf20Sopenharmony_ci 10688c2ecf20Sopenharmony_ci if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset, 10698c2ecf20Sopenharmony_ci PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_SYNC | 10708c2ecf20Sopenharmony_ci REQ_FUA, 0, false)) { 10718c2ecf20Sopenharmony_ci md_error(rdev->mddev, rdev); 10728c2ecf20Sopenharmony_ci ret = -EIO; 10738c2ecf20Sopenharmony_ci } 10748c2ecf20Sopenharmony_ci 10758c2ecf20Sopenharmony_ci __free_page(page); 10768c2ecf20Sopenharmony_ci return ret; 10778c2ecf20Sopenharmony_ci} 10788c2ecf20Sopenharmony_ci 10798c2ecf20Sopenharmony_cistatic int ppl_load_distributed(struct ppl_log *log) 10808c2ecf20Sopenharmony_ci{ 10818c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = log->ppl_conf; 10828c2ecf20Sopenharmony_ci struct md_rdev *rdev = log->rdev; 10838c2ecf20Sopenharmony_ci struct mddev *mddev = rdev->mddev; 10848c2ecf20Sopenharmony_ci struct page *page, *page2, *tmp; 10858c2ecf20Sopenharmony_ci struct ppl_header *pplhdr = NULL, *prev_pplhdr = NULL; 10868c2ecf20Sopenharmony_ci u32 crc, crc_stored; 10878c2ecf20Sopenharmony_ci u32 signature; 10888c2ecf20Sopenharmony_ci int ret = 0, i; 10898c2ecf20Sopenharmony_ci sector_t pplhdr_offset = 0, prev_pplhdr_offset = 0; 10908c2ecf20Sopenharmony_ci 10918c2ecf20Sopenharmony_ci pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk); 10928c2ecf20Sopenharmony_ci /* read PPL headers, find the recent one */ 10938c2ecf20Sopenharmony_ci page = alloc_page(GFP_KERNEL); 10948c2ecf20Sopenharmony_ci if (!page) 10958c2ecf20Sopenharmony_ci return -ENOMEM; 10968c2ecf20Sopenharmony_ci 10978c2ecf20Sopenharmony_ci page2 = alloc_page(GFP_KERNEL); 10988c2ecf20Sopenharmony_ci if (!page2) { 10998c2ecf20Sopenharmony_ci __free_page(page); 11008c2ecf20Sopenharmony_ci return -ENOMEM; 11018c2ecf20Sopenharmony_ci } 11028c2ecf20Sopenharmony_ci 11038c2ecf20Sopenharmony_ci /* searching ppl area for latest ppl */ 11048c2ecf20Sopenharmony_ci while (pplhdr_offset < rdev->ppl.size - (PPL_HEADER_SIZE >> 9)) { 11058c2ecf20Sopenharmony_ci if (!sync_page_io(rdev, 11068c2ecf20Sopenharmony_ci rdev->ppl.sector - rdev->data_offset + 11078c2ecf20Sopenharmony_ci pplhdr_offset, PAGE_SIZE, page, REQ_OP_READ, 11088c2ecf20Sopenharmony_ci 0, false)) { 11098c2ecf20Sopenharmony_ci md_error(mddev, rdev); 11108c2ecf20Sopenharmony_ci ret = -EIO; 11118c2ecf20Sopenharmony_ci /* if not able to read - don't recover any PPL */ 11128c2ecf20Sopenharmony_ci pplhdr = NULL; 11138c2ecf20Sopenharmony_ci break; 11148c2ecf20Sopenharmony_ci } 11158c2ecf20Sopenharmony_ci pplhdr = page_address(page); 11168c2ecf20Sopenharmony_ci 11178c2ecf20Sopenharmony_ci /* check header validity */ 11188c2ecf20Sopenharmony_ci crc_stored = le32_to_cpu(pplhdr->checksum); 11198c2ecf20Sopenharmony_ci pplhdr->checksum = 0; 11208c2ecf20Sopenharmony_ci crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE); 11218c2ecf20Sopenharmony_ci 11228c2ecf20Sopenharmony_ci if (crc_stored != crc) { 11238c2ecf20Sopenharmony_ci pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x (offset: %llu)\n", 11248c2ecf20Sopenharmony_ci __func__, crc_stored, crc, 11258c2ecf20Sopenharmony_ci (unsigned long long)pplhdr_offset); 11268c2ecf20Sopenharmony_ci pplhdr = prev_pplhdr; 11278c2ecf20Sopenharmony_ci pplhdr_offset = prev_pplhdr_offset; 11288c2ecf20Sopenharmony_ci break; 11298c2ecf20Sopenharmony_ci } 11308c2ecf20Sopenharmony_ci 11318c2ecf20Sopenharmony_ci signature = le32_to_cpu(pplhdr->signature); 11328c2ecf20Sopenharmony_ci 11338c2ecf20Sopenharmony_ci if (mddev->external) { 11348c2ecf20Sopenharmony_ci /* 11358c2ecf20Sopenharmony_ci * For external metadata the header signature is set and 11368c2ecf20Sopenharmony_ci * validated in userspace. 11378c2ecf20Sopenharmony_ci */ 11388c2ecf20Sopenharmony_ci ppl_conf->signature = signature; 11398c2ecf20Sopenharmony_ci } else if (ppl_conf->signature != signature) { 11408c2ecf20Sopenharmony_ci pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x (offset: %llu)\n", 11418c2ecf20Sopenharmony_ci __func__, signature, ppl_conf->signature, 11428c2ecf20Sopenharmony_ci (unsigned long long)pplhdr_offset); 11438c2ecf20Sopenharmony_ci pplhdr = prev_pplhdr; 11448c2ecf20Sopenharmony_ci pplhdr_offset = prev_pplhdr_offset; 11458c2ecf20Sopenharmony_ci break; 11468c2ecf20Sopenharmony_ci } 11478c2ecf20Sopenharmony_ci 11488c2ecf20Sopenharmony_ci if (prev_pplhdr && le64_to_cpu(prev_pplhdr->generation) > 11498c2ecf20Sopenharmony_ci le64_to_cpu(pplhdr->generation)) { 11508c2ecf20Sopenharmony_ci /* previous was newest */ 11518c2ecf20Sopenharmony_ci pplhdr = prev_pplhdr; 11528c2ecf20Sopenharmony_ci pplhdr_offset = prev_pplhdr_offset; 11538c2ecf20Sopenharmony_ci break; 11548c2ecf20Sopenharmony_ci } 11558c2ecf20Sopenharmony_ci 11568c2ecf20Sopenharmony_ci prev_pplhdr_offset = pplhdr_offset; 11578c2ecf20Sopenharmony_ci prev_pplhdr = pplhdr; 11588c2ecf20Sopenharmony_ci 11598c2ecf20Sopenharmony_ci tmp = page; 11608c2ecf20Sopenharmony_ci page = page2; 11618c2ecf20Sopenharmony_ci page2 = tmp; 11628c2ecf20Sopenharmony_ci 11638c2ecf20Sopenharmony_ci /* calculate next potential ppl offset */ 11648c2ecf20Sopenharmony_ci for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) 11658c2ecf20Sopenharmony_ci pplhdr_offset += 11668c2ecf20Sopenharmony_ci le32_to_cpu(pplhdr->entries[i].pp_size) >> 9; 11678c2ecf20Sopenharmony_ci pplhdr_offset += PPL_HEADER_SIZE >> 9; 11688c2ecf20Sopenharmony_ci } 11698c2ecf20Sopenharmony_ci 11708c2ecf20Sopenharmony_ci /* no valid ppl found */ 11718c2ecf20Sopenharmony_ci if (!pplhdr) 11728c2ecf20Sopenharmony_ci ppl_conf->mismatch_count++; 11738c2ecf20Sopenharmony_ci else 11748c2ecf20Sopenharmony_ci pr_debug("%s: latest PPL found at offset: %llu, with generation: %llu\n", 11758c2ecf20Sopenharmony_ci __func__, (unsigned long long)pplhdr_offset, 11768c2ecf20Sopenharmony_ci le64_to_cpu(pplhdr->generation)); 11778c2ecf20Sopenharmony_ci 11788c2ecf20Sopenharmony_ci /* attempt to recover from log if we are starting a dirty array */ 11798c2ecf20Sopenharmony_ci if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) 11808c2ecf20Sopenharmony_ci ret = ppl_recover(log, pplhdr, pplhdr_offset); 11818c2ecf20Sopenharmony_ci 11828c2ecf20Sopenharmony_ci /* write empty header if we are starting the array */ 11838c2ecf20Sopenharmony_ci if (!ret && !mddev->pers) 11848c2ecf20Sopenharmony_ci ret = ppl_write_empty_header(log); 11858c2ecf20Sopenharmony_ci 11868c2ecf20Sopenharmony_ci __free_page(page); 11878c2ecf20Sopenharmony_ci __free_page(page2); 11888c2ecf20Sopenharmony_ci 11898c2ecf20Sopenharmony_ci pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", 11908c2ecf20Sopenharmony_ci __func__, ret, ppl_conf->mismatch_count, 11918c2ecf20Sopenharmony_ci ppl_conf->recovered_entries); 11928c2ecf20Sopenharmony_ci return ret; 11938c2ecf20Sopenharmony_ci} 11948c2ecf20Sopenharmony_ci 11958c2ecf20Sopenharmony_cistatic int ppl_load(struct ppl_conf *ppl_conf) 11968c2ecf20Sopenharmony_ci{ 11978c2ecf20Sopenharmony_ci int ret = 0; 11988c2ecf20Sopenharmony_ci u32 signature = 0; 11998c2ecf20Sopenharmony_ci bool signature_set = false; 12008c2ecf20Sopenharmony_ci int i; 12018c2ecf20Sopenharmony_ci 12028c2ecf20Sopenharmony_ci for (i = 0; i < ppl_conf->count; i++) { 12038c2ecf20Sopenharmony_ci struct ppl_log *log = &ppl_conf->child_logs[i]; 12048c2ecf20Sopenharmony_ci 12058c2ecf20Sopenharmony_ci /* skip missing drive */ 12068c2ecf20Sopenharmony_ci if (!log->rdev) 12078c2ecf20Sopenharmony_ci continue; 12088c2ecf20Sopenharmony_ci 12098c2ecf20Sopenharmony_ci ret = ppl_load_distributed(log); 12108c2ecf20Sopenharmony_ci if (ret) 12118c2ecf20Sopenharmony_ci break; 12128c2ecf20Sopenharmony_ci 12138c2ecf20Sopenharmony_ci /* 12148c2ecf20Sopenharmony_ci * For external metadata we can't check if the signature is 12158c2ecf20Sopenharmony_ci * correct on a single drive, but we can check if it is the same 12168c2ecf20Sopenharmony_ci * on all drives. 12178c2ecf20Sopenharmony_ci */ 12188c2ecf20Sopenharmony_ci if (ppl_conf->mddev->external) { 12198c2ecf20Sopenharmony_ci if (!signature_set) { 12208c2ecf20Sopenharmony_ci signature = ppl_conf->signature; 12218c2ecf20Sopenharmony_ci signature_set = true; 12228c2ecf20Sopenharmony_ci } else if (signature != ppl_conf->signature) { 12238c2ecf20Sopenharmony_ci pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n", 12248c2ecf20Sopenharmony_ci mdname(ppl_conf->mddev)); 12258c2ecf20Sopenharmony_ci ret = -EINVAL; 12268c2ecf20Sopenharmony_ci break; 12278c2ecf20Sopenharmony_ci } 12288c2ecf20Sopenharmony_ci } 12298c2ecf20Sopenharmony_ci } 12308c2ecf20Sopenharmony_ci 12318c2ecf20Sopenharmony_ci pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n", 12328c2ecf20Sopenharmony_ci __func__, ret, ppl_conf->mismatch_count, 12338c2ecf20Sopenharmony_ci ppl_conf->recovered_entries); 12348c2ecf20Sopenharmony_ci return ret; 12358c2ecf20Sopenharmony_ci} 12368c2ecf20Sopenharmony_ci 12378c2ecf20Sopenharmony_cistatic void __ppl_exit_log(struct ppl_conf *ppl_conf) 12388c2ecf20Sopenharmony_ci{ 12398c2ecf20Sopenharmony_ci clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); 12408c2ecf20Sopenharmony_ci clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags); 12418c2ecf20Sopenharmony_ci 12428c2ecf20Sopenharmony_ci kfree(ppl_conf->child_logs); 12438c2ecf20Sopenharmony_ci 12448c2ecf20Sopenharmony_ci bioset_exit(&ppl_conf->bs); 12458c2ecf20Sopenharmony_ci bioset_exit(&ppl_conf->flush_bs); 12468c2ecf20Sopenharmony_ci mempool_exit(&ppl_conf->io_pool); 12478c2ecf20Sopenharmony_ci kmem_cache_destroy(ppl_conf->io_kc); 12488c2ecf20Sopenharmony_ci 12498c2ecf20Sopenharmony_ci kfree(ppl_conf); 12508c2ecf20Sopenharmony_ci} 12518c2ecf20Sopenharmony_ci 12528c2ecf20Sopenharmony_civoid ppl_exit_log(struct r5conf *conf) 12538c2ecf20Sopenharmony_ci{ 12548c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = conf->log_private; 12558c2ecf20Sopenharmony_ci 12568c2ecf20Sopenharmony_ci if (ppl_conf) { 12578c2ecf20Sopenharmony_ci __ppl_exit_log(ppl_conf); 12588c2ecf20Sopenharmony_ci conf->log_private = NULL; 12598c2ecf20Sopenharmony_ci } 12608c2ecf20Sopenharmony_ci} 12618c2ecf20Sopenharmony_ci 12628c2ecf20Sopenharmony_cistatic int ppl_validate_rdev(struct md_rdev *rdev) 12638c2ecf20Sopenharmony_ci{ 12648c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 12658c2ecf20Sopenharmony_ci int ppl_data_sectors; 12668c2ecf20Sopenharmony_ci int ppl_size_new; 12678c2ecf20Sopenharmony_ci 12688c2ecf20Sopenharmony_ci /* 12698c2ecf20Sopenharmony_ci * The configured PPL size must be enough to store 12708c2ecf20Sopenharmony_ci * the header and (at the very least) partial parity 12718c2ecf20Sopenharmony_ci * for one stripe. Round it down to ensure the data 12728c2ecf20Sopenharmony_ci * space is cleanly divisible by stripe size. 12738c2ecf20Sopenharmony_ci */ 12748c2ecf20Sopenharmony_ci ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); 12758c2ecf20Sopenharmony_ci 12768c2ecf20Sopenharmony_ci if (ppl_data_sectors > 0) 12778c2ecf20Sopenharmony_ci ppl_data_sectors = rounddown(ppl_data_sectors, 12788c2ecf20Sopenharmony_ci RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private)); 12798c2ecf20Sopenharmony_ci 12808c2ecf20Sopenharmony_ci if (ppl_data_sectors <= 0) { 12818c2ecf20Sopenharmony_ci pr_warn("md/raid:%s: PPL space too small on %s\n", 12828c2ecf20Sopenharmony_ci mdname(rdev->mddev), bdevname(rdev->bdev, b)); 12838c2ecf20Sopenharmony_ci return -ENOSPC; 12848c2ecf20Sopenharmony_ci } 12858c2ecf20Sopenharmony_ci 12868c2ecf20Sopenharmony_ci ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9); 12878c2ecf20Sopenharmony_ci 12888c2ecf20Sopenharmony_ci if ((rdev->ppl.sector < rdev->data_offset && 12898c2ecf20Sopenharmony_ci rdev->ppl.sector + ppl_size_new > rdev->data_offset) || 12908c2ecf20Sopenharmony_ci (rdev->ppl.sector >= rdev->data_offset && 12918c2ecf20Sopenharmony_ci rdev->data_offset + rdev->sectors > rdev->ppl.sector)) { 12928c2ecf20Sopenharmony_ci pr_warn("md/raid:%s: PPL space overlaps with data on %s\n", 12938c2ecf20Sopenharmony_ci mdname(rdev->mddev), bdevname(rdev->bdev, b)); 12948c2ecf20Sopenharmony_ci return -EINVAL; 12958c2ecf20Sopenharmony_ci } 12968c2ecf20Sopenharmony_ci 12978c2ecf20Sopenharmony_ci if (!rdev->mddev->external && 12988c2ecf20Sopenharmony_ci ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) || 12998c2ecf20Sopenharmony_ci (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) { 13008c2ecf20Sopenharmony_ci pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n", 13018c2ecf20Sopenharmony_ci mdname(rdev->mddev), bdevname(rdev->bdev, b)); 13028c2ecf20Sopenharmony_ci return -EINVAL; 13038c2ecf20Sopenharmony_ci } 13048c2ecf20Sopenharmony_ci 13058c2ecf20Sopenharmony_ci rdev->ppl.size = ppl_size_new; 13068c2ecf20Sopenharmony_ci 13078c2ecf20Sopenharmony_ci return 0; 13088c2ecf20Sopenharmony_ci} 13098c2ecf20Sopenharmony_ci 13108c2ecf20Sopenharmony_cistatic void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) 13118c2ecf20Sopenharmony_ci{ 13128c2ecf20Sopenharmony_ci struct request_queue *q; 13138c2ecf20Sopenharmony_ci 13148c2ecf20Sopenharmony_ci if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + 13158c2ecf20Sopenharmony_ci PPL_HEADER_SIZE) * 2) { 13168c2ecf20Sopenharmony_ci log->use_multippl = true; 13178c2ecf20Sopenharmony_ci set_bit(MD_HAS_MULTIPLE_PPLS, 13188c2ecf20Sopenharmony_ci &log->ppl_conf->mddev->flags); 13198c2ecf20Sopenharmony_ci log->entry_space = PPL_SPACE_SIZE; 13208c2ecf20Sopenharmony_ci } else { 13218c2ecf20Sopenharmony_ci log->use_multippl = false; 13228c2ecf20Sopenharmony_ci log->entry_space = (log->rdev->ppl.size << 9) - 13238c2ecf20Sopenharmony_ci PPL_HEADER_SIZE; 13248c2ecf20Sopenharmony_ci } 13258c2ecf20Sopenharmony_ci log->next_io_sector = rdev->ppl.sector; 13268c2ecf20Sopenharmony_ci 13278c2ecf20Sopenharmony_ci q = bdev_get_queue(rdev->bdev); 13288c2ecf20Sopenharmony_ci if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) 13298c2ecf20Sopenharmony_ci log->wb_cache_on = true; 13308c2ecf20Sopenharmony_ci} 13318c2ecf20Sopenharmony_ci 13328c2ecf20Sopenharmony_ciint ppl_init_log(struct r5conf *conf) 13338c2ecf20Sopenharmony_ci{ 13348c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf; 13358c2ecf20Sopenharmony_ci struct mddev *mddev = conf->mddev; 13368c2ecf20Sopenharmony_ci int ret = 0; 13378c2ecf20Sopenharmony_ci int max_disks; 13388c2ecf20Sopenharmony_ci int i; 13398c2ecf20Sopenharmony_ci 13408c2ecf20Sopenharmony_ci pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", 13418c2ecf20Sopenharmony_ci mdname(conf->mddev)); 13428c2ecf20Sopenharmony_ci 13438c2ecf20Sopenharmony_ci if (PAGE_SIZE != 4096) 13448c2ecf20Sopenharmony_ci return -EINVAL; 13458c2ecf20Sopenharmony_ci 13468c2ecf20Sopenharmony_ci if (mddev->level != 5) { 13478c2ecf20Sopenharmony_ci pr_warn("md/raid:%s PPL is not compatible with raid level %d\n", 13488c2ecf20Sopenharmony_ci mdname(mddev), mddev->level); 13498c2ecf20Sopenharmony_ci return -EINVAL; 13508c2ecf20Sopenharmony_ci } 13518c2ecf20Sopenharmony_ci 13528c2ecf20Sopenharmony_ci if (mddev->bitmap_info.file || mddev->bitmap_info.offset) { 13538c2ecf20Sopenharmony_ci pr_warn("md/raid:%s PPL is not compatible with bitmap\n", 13548c2ecf20Sopenharmony_ci mdname(mddev)); 13558c2ecf20Sopenharmony_ci return -EINVAL; 13568c2ecf20Sopenharmony_ci } 13578c2ecf20Sopenharmony_ci 13588c2ecf20Sopenharmony_ci if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { 13598c2ecf20Sopenharmony_ci pr_warn("md/raid:%s PPL is not compatible with journal\n", 13608c2ecf20Sopenharmony_ci mdname(mddev)); 13618c2ecf20Sopenharmony_ci return -EINVAL; 13628c2ecf20Sopenharmony_ci } 13638c2ecf20Sopenharmony_ci 13648c2ecf20Sopenharmony_ci max_disks = sizeof_field(struct ppl_log, disk_flush_bitmap) * 13658c2ecf20Sopenharmony_ci BITS_PER_BYTE; 13668c2ecf20Sopenharmony_ci if (conf->raid_disks > max_disks) { 13678c2ecf20Sopenharmony_ci pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n", 13688c2ecf20Sopenharmony_ci mdname(mddev), max_disks); 13698c2ecf20Sopenharmony_ci return -EINVAL; 13708c2ecf20Sopenharmony_ci } 13718c2ecf20Sopenharmony_ci 13728c2ecf20Sopenharmony_ci ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); 13738c2ecf20Sopenharmony_ci if (!ppl_conf) 13748c2ecf20Sopenharmony_ci return -ENOMEM; 13758c2ecf20Sopenharmony_ci 13768c2ecf20Sopenharmony_ci ppl_conf->mddev = mddev; 13778c2ecf20Sopenharmony_ci 13788c2ecf20Sopenharmony_ci ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0); 13798c2ecf20Sopenharmony_ci if (!ppl_conf->io_kc) { 13808c2ecf20Sopenharmony_ci ret = -ENOMEM; 13818c2ecf20Sopenharmony_ci goto err; 13828c2ecf20Sopenharmony_ci } 13838c2ecf20Sopenharmony_ci 13848c2ecf20Sopenharmony_ci ret = mempool_init(&ppl_conf->io_pool, conf->raid_disks, ppl_io_pool_alloc, 13858c2ecf20Sopenharmony_ci ppl_io_pool_free, ppl_conf->io_kc); 13868c2ecf20Sopenharmony_ci if (ret) 13878c2ecf20Sopenharmony_ci goto err; 13888c2ecf20Sopenharmony_ci 13898c2ecf20Sopenharmony_ci ret = bioset_init(&ppl_conf->bs, conf->raid_disks, 0, BIOSET_NEED_BVECS); 13908c2ecf20Sopenharmony_ci if (ret) 13918c2ecf20Sopenharmony_ci goto err; 13928c2ecf20Sopenharmony_ci 13938c2ecf20Sopenharmony_ci ret = bioset_init(&ppl_conf->flush_bs, conf->raid_disks, 0, 0); 13948c2ecf20Sopenharmony_ci if (ret) 13958c2ecf20Sopenharmony_ci goto err; 13968c2ecf20Sopenharmony_ci 13978c2ecf20Sopenharmony_ci ppl_conf->count = conf->raid_disks; 13988c2ecf20Sopenharmony_ci ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), 13998c2ecf20Sopenharmony_ci GFP_KERNEL); 14008c2ecf20Sopenharmony_ci if (!ppl_conf->child_logs) { 14018c2ecf20Sopenharmony_ci ret = -ENOMEM; 14028c2ecf20Sopenharmony_ci goto err; 14038c2ecf20Sopenharmony_ci } 14048c2ecf20Sopenharmony_ci 14058c2ecf20Sopenharmony_ci atomic64_set(&ppl_conf->seq, 0); 14068c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&ppl_conf->no_mem_stripes); 14078c2ecf20Sopenharmony_ci spin_lock_init(&ppl_conf->no_mem_stripes_lock); 14088c2ecf20Sopenharmony_ci ppl_conf->write_hint = RWH_WRITE_LIFE_NOT_SET; 14098c2ecf20Sopenharmony_ci 14108c2ecf20Sopenharmony_ci if (!mddev->external) { 14118c2ecf20Sopenharmony_ci ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); 14128c2ecf20Sopenharmony_ci ppl_conf->block_size = 512; 14138c2ecf20Sopenharmony_ci } else { 14148c2ecf20Sopenharmony_ci ppl_conf->block_size = queue_logical_block_size(mddev->queue); 14158c2ecf20Sopenharmony_ci } 14168c2ecf20Sopenharmony_ci 14178c2ecf20Sopenharmony_ci for (i = 0; i < ppl_conf->count; i++) { 14188c2ecf20Sopenharmony_ci struct ppl_log *log = &ppl_conf->child_logs[i]; 14198c2ecf20Sopenharmony_ci struct md_rdev *rdev = conf->disks[i].rdev; 14208c2ecf20Sopenharmony_ci 14218c2ecf20Sopenharmony_ci mutex_init(&log->io_mutex); 14228c2ecf20Sopenharmony_ci spin_lock_init(&log->io_list_lock); 14238c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&log->io_list); 14248c2ecf20Sopenharmony_ci 14258c2ecf20Sopenharmony_ci log->ppl_conf = ppl_conf; 14268c2ecf20Sopenharmony_ci log->rdev = rdev; 14278c2ecf20Sopenharmony_ci 14288c2ecf20Sopenharmony_ci if (rdev) { 14298c2ecf20Sopenharmony_ci ret = ppl_validate_rdev(rdev); 14308c2ecf20Sopenharmony_ci if (ret) 14318c2ecf20Sopenharmony_ci goto err; 14328c2ecf20Sopenharmony_ci 14338c2ecf20Sopenharmony_ci ppl_init_child_log(log, rdev); 14348c2ecf20Sopenharmony_ci } 14358c2ecf20Sopenharmony_ci } 14368c2ecf20Sopenharmony_ci 14378c2ecf20Sopenharmony_ci /* load and possibly recover the logs from the member disks */ 14388c2ecf20Sopenharmony_ci ret = ppl_load(ppl_conf); 14398c2ecf20Sopenharmony_ci 14408c2ecf20Sopenharmony_ci if (ret) { 14418c2ecf20Sopenharmony_ci goto err; 14428c2ecf20Sopenharmony_ci } else if (!mddev->pers && mddev->recovery_cp == 0 && 14438c2ecf20Sopenharmony_ci ppl_conf->recovered_entries > 0 && 14448c2ecf20Sopenharmony_ci ppl_conf->mismatch_count == 0) { 14458c2ecf20Sopenharmony_ci /* 14468c2ecf20Sopenharmony_ci * If we are starting a dirty array and the recovery succeeds 14478c2ecf20Sopenharmony_ci * without any issues, set the array as clean. 14488c2ecf20Sopenharmony_ci */ 14498c2ecf20Sopenharmony_ci mddev->recovery_cp = MaxSector; 14508c2ecf20Sopenharmony_ci set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 14518c2ecf20Sopenharmony_ci } else if (mddev->pers && ppl_conf->mismatch_count > 0) { 14528c2ecf20Sopenharmony_ci /* no mismatch allowed when enabling PPL for a running array */ 14538c2ecf20Sopenharmony_ci ret = -EINVAL; 14548c2ecf20Sopenharmony_ci goto err; 14558c2ecf20Sopenharmony_ci } 14568c2ecf20Sopenharmony_ci 14578c2ecf20Sopenharmony_ci conf->log_private = ppl_conf; 14588c2ecf20Sopenharmony_ci set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags); 14598c2ecf20Sopenharmony_ci 14608c2ecf20Sopenharmony_ci return 0; 14618c2ecf20Sopenharmony_cierr: 14628c2ecf20Sopenharmony_ci __ppl_exit_log(ppl_conf); 14638c2ecf20Sopenharmony_ci return ret; 14648c2ecf20Sopenharmony_ci} 14658c2ecf20Sopenharmony_ci 14668c2ecf20Sopenharmony_ciint ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add) 14678c2ecf20Sopenharmony_ci{ 14688c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = conf->log_private; 14698c2ecf20Sopenharmony_ci struct ppl_log *log; 14708c2ecf20Sopenharmony_ci int ret = 0; 14718c2ecf20Sopenharmony_ci char b[BDEVNAME_SIZE]; 14728c2ecf20Sopenharmony_ci 14738c2ecf20Sopenharmony_ci if (!rdev) 14748c2ecf20Sopenharmony_ci return -EINVAL; 14758c2ecf20Sopenharmony_ci 14768c2ecf20Sopenharmony_ci pr_debug("%s: disk: %d operation: %s dev: %s\n", 14778c2ecf20Sopenharmony_ci __func__, rdev->raid_disk, add ? "add" : "remove", 14788c2ecf20Sopenharmony_ci bdevname(rdev->bdev, b)); 14798c2ecf20Sopenharmony_ci 14808c2ecf20Sopenharmony_ci if (rdev->raid_disk < 0) 14818c2ecf20Sopenharmony_ci return 0; 14828c2ecf20Sopenharmony_ci 14838c2ecf20Sopenharmony_ci if (rdev->raid_disk >= ppl_conf->count) 14848c2ecf20Sopenharmony_ci return -ENODEV; 14858c2ecf20Sopenharmony_ci 14868c2ecf20Sopenharmony_ci log = &ppl_conf->child_logs[rdev->raid_disk]; 14878c2ecf20Sopenharmony_ci 14888c2ecf20Sopenharmony_ci mutex_lock(&log->io_mutex); 14898c2ecf20Sopenharmony_ci if (add) { 14908c2ecf20Sopenharmony_ci ret = ppl_validate_rdev(rdev); 14918c2ecf20Sopenharmony_ci if (!ret) { 14928c2ecf20Sopenharmony_ci log->rdev = rdev; 14938c2ecf20Sopenharmony_ci ret = ppl_write_empty_header(log); 14948c2ecf20Sopenharmony_ci ppl_init_child_log(log, rdev); 14958c2ecf20Sopenharmony_ci } 14968c2ecf20Sopenharmony_ci } else { 14978c2ecf20Sopenharmony_ci log->rdev = NULL; 14988c2ecf20Sopenharmony_ci } 14998c2ecf20Sopenharmony_ci mutex_unlock(&log->io_mutex); 15008c2ecf20Sopenharmony_ci 15018c2ecf20Sopenharmony_ci return ret; 15028c2ecf20Sopenharmony_ci} 15038c2ecf20Sopenharmony_ci 15048c2ecf20Sopenharmony_cistatic ssize_t 15058c2ecf20Sopenharmony_cippl_write_hint_show(struct mddev *mddev, char *buf) 15068c2ecf20Sopenharmony_ci{ 15078c2ecf20Sopenharmony_ci size_t ret = 0; 15088c2ecf20Sopenharmony_ci struct r5conf *conf; 15098c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf = NULL; 15108c2ecf20Sopenharmony_ci 15118c2ecf20Sopenharmony_ci spin_lock(&mddev->lock); 15128c2ecf20Sopenharmony_ci conf = mddev->private; 15138c2ecf20Sopenharmony_ci if (conf && raid5_has_ppl(conf)) 15148c2ecf20Sopenharmony_ci ppl_conf = conf->log_private; 15158c2ecf20Sopenharmony_ci ret = sprintf(buf, "%d\n", ppl_conf ? ppl_conf->write_hint : 0); 15168c2ecf20Sopenharmony_ci spin_unlock(&mddev->lock); 15178c2ecf20Sopenharmony_ci 15188c2ecf20Sopenharmony_ci return ret; 15198c2ecf20Sopenharmony_ci} 15208c2ecf20Sopenharmony_ci 15218c2ecf20Sopenharmony_cistatic ssize_t 15228c2ecf20Sopenharmony_cippl_write_hint_store(struct mddev *mddev, const char *page, size_t len) 15238c2ecf20Sopenharmony_ci{ 15248c2ecf20Sopenharmony_ci struct r5conf *conf; 15258c2ecf20Sopenharmony_ci struct ppl_conf *ppl_conf; 15268c2ecf20Sopenharmony_ci int err = 0; 15278c2ecf20Sopenharmony_ci unsigned short new; 15288c2ecf20Sopenharmony_ci 15298c2ecf20Sopenharmony_ci if (len >= PAGE_SIZE) 15308c2ecf20Sopenharmony_ci return -EINVAL; 15318c2ecf20Sopenharmony_ci if (kstrtou16(page, 10, &new)) 15328c2ecf20Sopenharmony_ci return -EINVAL; 15338c2ecf20Sopenharmony_ci 15348c2ecf20Sopenharmony_ci err = mddev_lock(mddev); 15358c2ecf20Sopenharmony_ci if (err) 15368c2ecf20Sopenharmony_ci return err; 15378c2ecf20Sopenharmony_ci 15388c2ecf20Sopenharmony_ci conf = mddev->private; 15398c2ecf20Sopenharmony_ci if (!conf) { 15408c2ecf20Sopenharmony_ci err = -ENODEV; 15418c2ecf20Sopenharmony_ci } else if (raid5_has_ppl(conf)) { 15428c2ecf20Sopenharmony_ci ppl_conf = conf->log_private; 15438c2ecf20Sopenharmony_ci if (!ppl_conf) 15448c2ecf20Sopenharmony_ci err = -EINVAL; 15458c2ecf20Sopenharmony_ci else 15468c2ecf20Sopenharmony_ci ppl_conf->write_hint = new; 15478c2ecf20Sopenharmony_ci } else { 15488c2ecf20Sopenharmony_ci err = -EINVAL; 15498c2ecf20Sopenharmony_ci } 15508c2ecf20Sopenharmony_ci 15518c2ecf20Sopenharmony_ci mddev_unlock(mddev); 15528c2ecf20Sopenharmony_ci 15538c2ecf20Sopenharmony_ci return err ?: len; 15548c2ecf20Sopenharmony_ci} 15558c2ecf20Sopenharmony_ci 15568c2ecf20Sopenharmony_cistruct md_sysfs_entry 15578c2ecf20Sopenharmony_cippl_write_hint = __ATTR(ppl_write_hint, S_IRUGO | S_IWUSR, 15588c2ecf20Sopenharmony_ci ppl_write_hint_show, 15598c2ecf20Sopenharmony_ci ppl_write_hint_store); 1560