18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Copyright (C) 2015 Shaohua Li <shli@fb.com>
48c2ecf20Sopenharmony_ci * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
58c2ecf20Sopenharmony_ci */
68c2ecf20Sopenharmony_ci#include <linux/kernel.h>
78c2ecf20Sopenharmony_ci#include <linux/wait.h>
88c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
98c2ecf20Sopenharmony_ci#include <linux/slab.h>
108c2ecf20Sopenharmony_ci#include <linux/raid/md_p.h>
118c2ecf20Sopenharmony_ci#include <linux/crc32c.h>
128c2ecf20Sopenharmony_ci#include <linux/random.h>
138c2ecf20Sopenharmony_ci#include <linux/kthread.h>
148c2ecf20Sopenharmony_ci#include <linux/types.h>
158c2ecf20Sopenharmony_ci#include "md.h"
168c2ecf20Sopenharmony_ci#include "raid5.h"
178c2ecf20Sopenharmony_ci#include "md-bitmap.h"
188c2ecf20Sopenharmony_ci#include "raid5-log.h"
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_ci/*
 * metadata/data are stored on disk in 4k units (blocks) regardless of the
 * underlying hardware sector size. Only works with PAGE_SIZE == 4096.
238c2ecf20Sopenharmony_ci */
248c2ecf20Sopenharmony_ci#define BLOCK_SECTORS (8)
258c2ecf20Sopenharmony_ci#define BLOCK_SECTOR_SHIFT (3)
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_ci/*
288c2ecf20Sopenharmony_ci * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
298c2ecf20Sopenharmony_ci *
 * In write-through mode, the reclaim runs every log->max_free_space.
 * This prevents the recovery scan from taking too long.
328c2ecf20Sopenharmony_ci */
338c2ecf20Sopenharmony_ci#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
348c2ecf20Sopenharmony_ci#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
358c2ecf20Sopenharmony_ci
368c2ecf20Sopenharmony_ci/* wake up reclaim thread periodically */
378c2ecf20Sopenharmony_ci#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
388c2ecf20Sopenharmony_ci/* start flush with these full stripes */
398c2ecf20Sopenharmony_ci#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) (conf->max_nr_stripes / 4)
408c2ecf20Sopenharmony_ci/* reclaim stripes in groups */
418c2ecf20Sopenharmony_ci#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
428c2ecf20Sopenharmony_ci
438c2ecf20Sopenharmony_ci/*
448c2ecf20Sopenharmony_ci * We only need 2 bios per I/O unit to make progress, but ensure we
458c2ecf20Sopenharmony_ci * have a few more available to not get too tight.
468c2ecf20Sopenharmony_ci */
478c2ecf20Sopenharmony_ci#define R5L_POOL_SIZE	4
488c2ecf20Sopenharmony_ci
/* human-readable journal mode names ("write-through", "write-back");
 * presumably indexed by enum r5c_journal_mode — confirm against users */
static char *r5c_journal_mode_str[] = {"write-through",
				       "write-back"};
518c2ecf20Sopenharmony_ci/*
528c2ecf20Sopenharmony_ci * raid5 cache state machine
538c2ecf20Sopenharmony_ci *
548c2ecf20Sopenharmony_ci * With the RAID cache, each stripe works in two phases:
558c2ecf20Sopenharmony_ci *	- caching phase
568c2ecf20Sopenharmony_ci *	- writing-out phase
578c2ecf20Sopenharmony_ci *
588c2ecf20Sopenharmony_ci * These two phases are controlled by bit STRIPE_R5C_CACHING:
598c2ecf20Sopenharmony_ci *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
608c2ecf20Sopenharmony_ci *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
618c2ecf20Sopenharmony_ci *
628c2ecf20Sopenharmony_ci * When there is no journal, or the journal is in write-through mode,
638c2ecf20Sopenharmony_ci * the stripe is always in writing-out phase.
648c2ecf20Sopenharmony_ci *
658c2ecf20Sopenharmony_ci * For write-back journal, the stripe is sent to caching phase on write
668c2ecf20Sopenharmony_ci * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
678c2ecf20Sopenharmony_ci * the write-out phase by clearing STRIPE_R5C_CACHING.
688c2ecf20Sopenharmony_ci *
698c2ecf20Sopenharmony_ci * Stripes in caching phase do not write the raid disks. Instead, all
708c2ecf20Sopenharmony_ci * writes are committed from the log device. Therefore, a stripe in
718c2ecf20Sopenharmony_ci * caching phase handles writes as:
728c2ecf20Sopenharmony_ci *	- write to log device
738c2ecf20Sopenharmony_ci *	- return IO
748c2ecf20Sopenharmony_ci *
758c2ecf20Sopenharmony_ci * Stripes in writing-out phase handle writes as:
768c2ecf20Sopenharmony_ci *	- calculate parity
778c2ecf20Sopenharmony_ci *	- write pending data and parity to journal
788c2ecf20Sopenharmony_ci *	- write data and parity to raid disks
798c2ecf20Sopenharmony_ci *	- return IO for pending writes
808c2ecf20Sopenharmony_ci */
818c2ecf20Sopenharmony_ci
/*
 * In-memory state of the raid5/6 write-ahead log (journal) device.
 * Positions are in sectors; the log space is used as a ring buffer
 * between last_checkpoint (tail) and log_start (head) — see
 * r5l_ring_add()/r5l_ring_distance().
 */
struct r5l_log {
	struct md_rdev *rdev;		/* the journal device */

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, round to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim run if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data appends */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	spinlock_t io_list_lock;	/* protects the four io_unit lists below */
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which settle down in log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t io_pool;
	struct bio_set bs;
	mempool_t meta_pool;

	struct md_thread *reclaim_thread;
	unsigned long reclaim_target;	/* number of space that need to be
					 * reclaimed.  if it's 0, reclaim spaces
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (eg, reclaim
					 * doesn't wait for specific io_unit
					 * switching to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write back in degraded mode */
	struct work_struct disable_writeback_work;

	/* for chunk_aligned_read in writeback mode, details below */
	spinlock_t tree_lock;
	struct radix_tree_root big_stripe_tree;
};
1568c2ecf20Sopenharmony_ci
1578c2ecf20Sopenharmony_ci/*
1588c2ecf20Sopenharmony_ci * Enable chunk_aligned_read() with write back cache.
1598c2ecf20Sopenharmony_ci *
1608c2ecf20Sopenharmony_ci * Each chunk may contain more than one stripe (for example, a 256kB
1618c2ecf20Sopenharmony_ci * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For
1628c2ecf20Sopenharmony_ci * chunk_aligned_read, these stripes are grouped into one "big_stripe".
1638c2ecf20Sopenharmony_ci * For each big_stripe, we count how many stripes of this big_stripe
1648c2ecf20Sopenharmony_ci * are in the write back cache. These data are tracked in a radix tree
1658c2ecf20Sopenharmony_ci * (big_stripe_tree). We use radix_tree item pointer as the counter.
1668c2ecf20Sopenharmony_ci * r5c_tree_index() is used to calculate keys for the radix tree.
1678c2ecf20Sopenharmony_ci *
1688c2ecf20Sopenharmony_ci * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
1698c2ecf20Sopenharmony_ci * big_stripe of each chunk in the tree. If this big_stripe is in the
1708c2ecf20Sopenharmony_ci * tree, chunk_aligned_read() aborts. This look up is protected by
1718c2ecf20Sopenharmony_ci * rcu_read_lock().
1728c2ecf20Sopenharmony_ci *
1738c2ecf20Sopenharmony_ci * It is necessary to remember whether a stripe is counted in
1748c2ecf20Sopenharmony_ci * big_stripe_tree. Instead of adding new flag, we reuses existing flags:
1758c2ecf20Sopenharmony_ci * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
1768c2ecf20Sopenharmony_ci * two flags are set, the stripe is counted in big_stripe_tree. This
1778c2ecf20Sopenharmony_ci * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
1788c2ecf20Sopenharmony_ci * r5c_try_caching_write(); and moving clear_bit of
1798c2ecf20Sopenharmony_ci * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
1808c2ecf20Sopenharmony_ci * r5c_finish_stripe_write_out().
1818c2ecf20Sopenharmony_ci */
1828c2ecf20Sopenharmony_ci
1838c2ecf20Sopenharmony_ci/*
1848c2ecf20Sopenharmony_ci * radix tree requests lowest 2 bits of data pointer to be 2b'00.
1858c2ecf20Sopenharmony_ci * So it is necessary to left shift the counter by 2 bits before using it
1868c2ecf20Sopenharmony_ci * as data pointer of the tree.
1878c2ecf20Sopenharmony_ci */
1888c2ecf20Sopenharmony_ci#define R5C_RADIX_COUNT_SHIFT 2
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci/*
1918c2ecf20Sopenharmony_ci * calculate key for big_stripe_tree
1928c2ecf20Sopenharmony_ci *
1938c2ecf20Sopenharmony_ci * sect: align_bi->bi_iter.bi_sector or sh->sector
1948c2ecf20Sopenharmony_ci */
static inline sector_t r5c_tree_index(struct r5conf *conf,
				      sector_t sect)
{
	/* key = chunk number = sect / chunk_sectors (remainder discarded) */
	sector_div(sect, conf->chunk_sectors);
	return sect;
}
2018c2ecf20Sopenharmony_ci
2028c2ecf20Sopenharmony_ci/*
 * an IO range starts from a meta data block and ends at the next meta data
 * block. The io unit's meta data block tracks the data/parity that follows it. io
2058c2ecf20Sopenharmony_ci * unit is written to log disk with normal write, as we always flush log disk
2068c2ecf20Sopenharmony_ci * first and then start move data to raid disks, there is no requirement to
2078c2ecf20Sopenharmony_ci * write io unit with FLUSH/FUA
2088c2ecf20Sopenharmony_ci */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;		/* enum r5l_io_unit_state */
	bool need_split_bio;
	struct bio *split_bio;

	unsigned int has_flush:1;		/* include flush request */
	unsigned int has_fua:1;			/* include fua request */
	unsigned int has_null_flush:1;		/* include null flush request */
	unsigned int has_flush_payload:1;	/* include flush payload  */
	/*
	 * io isn't sent yet, flush/fua request can only be submitted till it's
	 * the first IO in running_ios list
	 */
	unsigned int io_deferred:1;

	struct bio_list flush_barriers;   /* size == 0 flush bios */
};
2408c2ecf20Sopenharmony_ci
/* r5l_io_unit lifecycle states; transitions are strictly increasing
 * (enforced by __r5l_set_io_unit_state()) */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio start writing to log,
				 * no new bios are accepted */
	IO_UNIT_IO_END = 2,	/* io_unit bio finish writing to log */
	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
};
2498c2ecf20Sopenharmony_ci
2508c2ecf20Sopenharmony_cibool r5c_is_writeback(struct r5l_log *log)
2518c2ecf20Sopenharmony_ci{
2528c2ecf20Sopenharmony_ci	return (log != NULL &&
2538c2ecf20Sopenharmony_ci		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
2548c2ecf20Sopenharmony_ci}
2558c2ecf20Sopenharmony_ci
2568c2ecf20Sopenharmony_cistatic sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
2578c2ecf20Sopenharmony_ci{
2588c2ecf20Sopenharmony_ci	start += inc;
2598c2ecf20Sopenharmony_ci	if (start >= log->device_size)
2608c2ecf20Sopenharmony_ci		start = start - log->device_size;
2618c2ecf20Sopenharmony_ci	return start;
2628c2ecf20Sopenharmony_ci}
2638c2ecf20Sopenharmony_ci
2648c2ecf20Sopenharmony_cistatic sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
2658c2ecf20Sopenharmony_ci				  sector_t end)
2668c2ecf20Sopenharmony_ci{
2678c2ecf20Sopenharmony_ci	if (end >= start)
2688c2ecf20Sopenharmony_ci		return end - start;
2698c2ecf20Sopenharmony_ci	else
2708c2ecf20Sopenharmony_ci		return end + log->device_size - start;
2718c2ecf20Sopenharmony_ci}
2728c2ecf20Sopenharmony_ci
2738c2ecf20Sopenharmony_cistatic bool r5l_has_free_space(struct r5l_log *log, sector_t size)
2748c2ecf20Sopenharmony_ci{
2758c2ecf20Sopenharmony_ci	sector_t used_size;
2768c2ecf20Sopenharmony_ci
2778c2ecf20Sopenharmony_ci	used_size = r5l_ring_distance(log, log->last_checkpoint,
2788c2ecf20Sopenharmony_ci					log->log_start);
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_ci	return log->device_size > used_size + size;
2818c2ecf20Sopenharmony_ci}
2828c2ecf20Sopenharmony_ci
2838c2ecf20Sopenharmony_cistatic void __r5l_set_io_unit_state(struct r5l_io_unit *io,
2848c2ecf20Sopenharmony_ci				    enum r5l_io_unit_state state)
2858c2ecf20Sopenharmony_ci{
2868c2ecf20Sopenharmony_ci	if (WARN_ON(io->state >= state))
2878c2ecf20Sopenharmony_ci		return;
2888c2ecf20Sopenharmony_ci	io->state = state;
2898c2ecf20Sopenharmony_ci}
2908c2ecf20Sopenharmony_ci
/*
 * End (bio_endio) all write bios recorded in dev->written and clear
 * dev->written.  Called once the cached data has safely reached the
 * journal, so the writes can be returned to their issuers.
 */
static void
r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
{
	struct bio *wbi, *wbi2;

	wbi = dev->written;
	dev->written = NULL;
	/* walk the bio chain covering this dev's stripe sectors */
	while (wbi && wbi->bi_iter.bi_sector <
	       dev->sector + RAID5_STRIPE_SECTORS(conf)) {
		/* fetch the next bio before bio_endio() frees this one */
		wbi2 = r5_next_bio(conf, wbi, dev->sector);
		md_write_end(conf->mddev);
		bio_endio(wbi);
		wbi = wbi2;
	}
}
3068c2ecf20Sopenharmony_ci
3078c2ecf20Sopenharmony_civoid r5c_handle_cached_data_endio(struct r5conf *conf,
3088c2ecf20Sopenharmony_ci				  struct stripe_head *sh, int disks)
3098c2ecf20Sopenharmony_ci{
3108c2ecf20Sopenharmony_ci	int i;
3118c2ecf20Sopenharmony_ci
3128c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; ) {
3138c2ecf20Sopenharmony_ci		if (sh->dev[i].written) {
3148c2ecf20Sopenharmony_ci			set_bit(R5_UPTODATE, &sh->dev[i].flags);
3158c2ecf20Sopenharmony_ci			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
3168c2ecf20Sopenharmony_ci			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3178c2ecf20Sopenharmony_ci					   RAID5_STRIPE_SECTORS(conf),
3188c2ecf20Sopenharmony_ci					   !test_bit(STRIPE_DEGRADED, &sh->state),
3198c2ecf20Sopenharmony_ci					   0);
3208c2ecf20Sopenharmony_ci		}
3218c2ecf20Sopenharmony_ci	}
3228c2ecf20Sopenharmony_ci}
3238c2ecf20Sopenharmony_ci
3248c2ecf20Sopenharmony_civoid r5l_wake_reclaim(struct r5l_log *log, sector_t space);
3258c2ecf20Sopenharmony_ci
3268c2ecf20Sopenharmony_ci/* Check whether we should flush some stripes to free up stripe cache */
3278c2ecf20Sopenharmony_civoid r5c_check_stripe_cache_usage(struct r5conf *conf)
3288c2ecf20Sopenharmony_ci{
3298c2ecf20Sopenharmony_ci	int total_cached;
3308c2ecf20Sopenharmony_ci
3318c2ecf20Sopenharmony_ci	if (!r5c_is_writeback(conf->log))
3328c2ecf20Sopenharmony_ci		return;
3338c2ecf20Sopenharmony_ci
3348c2ecf20Sopenharmony_ci	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
3358c2ecf20Sopenharmony_ci		atomic_read(&conf->r5c_cached_full_stripes);
3368c2ecf20Sopenharmony_ci
3378c2ecf20Sopenharmony_ci	/*
3388c2ecf20Sopenharmony_ci	 * The following condition is true for either of the following:
3398c2ecf20Sopenharmony_ci	 *   - stripe cache pressure high:
3408c2ecf20Sopenharmony_ci	 *          total_cached > 3/4 min_nr_stripes ||
3418c2ecf20Sopenharmony_ci	 *          empty_inactive_list_nr > 0
3428c2ecf20Sopenharmony_ci	 *   - stripe cache pressure moderate:
3438c2ecf20Sopenharmony_ci	 *          total_cached > 1/2 min_nr_stripes
3448c2ecf20Sopenharmony_ci	 */
3458c2ecf20Sopenharmony_ci	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
3468c2ecf20Sopenharmony_ci	    atomic_read(&conf->empty_inactive_list_nr) > 0)
3478c2ecf20Sopenharmony_ci		r5l_wake_reclaim(conf->log, 0);
3488c2ecf20Sopenharmony_ci}
3498c2ecf20Sopenharmony_ci
3508c2ecf20Sopenharmony_ci/*
3518c2ecf20Sopenharmony_ci * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
3528c2ecf20Sopenharmony_ci * stripes in the cache
3538c2ecf20Sopenharmony_ci */
3548c2ecf20Sopenharmony_civoid r5c_check_cached_full_stripe(struct r5conf *conf)
3558c2ecf20Sopenharmony_ci{
3568c2ecf20Sopenharmony_ci	if (!r5c_is_writeback(conf->log))
3578c2ecf20Sopenharmony_ci		return;
3588c2ecf20Sopenharmony_ci
3598c2ecf20Sopenharmony_ci	/*
3608c2ecf20Sopenharmony_ci	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
3618c2ecf20Sopenharmony_ci	 * or a full stripe (chunk size / 4k stripes).
3628c2ecf20Sopenharmony_ci	 */
3638c2ecf20Sopenharmony_ci	if (atomic_read(&conf->r5c_cached_full_stripes) >=
3648c2ecf20Sopenharmony_ci	    min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
3658c2ecf20Sopenharmony_ci		conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
3668c2ecf20Sopenharmony_ci		r5l_wake_reclaim(conf->log, 0);
3678c2ecf20Sopenharmony_ci}
3688c2ecf20Sopenharmony_ci
3698c2ecf20Sopenharmony_ci/*
3708c2ecf20Sopenharmony_ci * Total log space (in sectors) needed to flush all data in cache
3718c2ecf20Sopenharmony_ci *
3728c2ecf20Sopenharmony_ci * To avoid deadlock due to log space, it is necessary to reserve log
3738c2ecf20Sopenharmony_ci * space to flush critical stripes (stripes that occupying log space near
3748c2ecf20Sopenharmony_ci * last_checkpoint). This function helps check how much log space is
3758c2ecf20Sopenharmony_ci * required to flush all cached stripes.
3768c2ecf20Sopenharmony_ci *
3778c2ecf20Sopenharmony_ci * To reduce log space requirements, two mechanisms are used to give cache
3788c2ecf20Sopenharmony_ci * flush higher priorities:
3798c2ecf20Sopenharmony_ci *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
3808c2ecf20Sopenharmony_ci *       stripes ALREADY in journal can be flushed w/o pending writes;
3818c2ecf20Sopenharmony_ci *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
3828c2ecf20Sopenharmony_ci *       can be delayed (r5l_add_no_space_stripe).
3838c2ecf20Sopenharmony_ci *
3848c2ecf20Sopenharmony_ci * In cache flush, the stripe goes through 1 and then 2. For a stripe that
3858c2ecf20Sopenharmony_ci * already passed 1, flushing it requires at most (conf->max_degraded + 1)
3868c2ecf20Sopenharmony_ci * pages of journal space. For stripes that has not passed 1, flushing it
3878c2ecf20Sopenharmony_ci * requires (conf->raid_disks + 1) pages of journal space. There are at
3888c2ecf20Sopenharmony_ci * most (conf->group_cnt + 1) stripe that passed 1. So total journal space
3898c2ecf20Sopenharmony_ci * required to flush all cached stripes (in pages) is:
3908c2ecf20Sopenharmony_ci *
3918c2ecf20Sopenharmony_ci *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
3928c2ecf20Sopenharmony_ci *     (group_cnt + 1) * (raid_disks + 1)
3938c2ecf20Sopenharmony_ci * or
3948c2ecf20Sopenharmony_ci *     (stripe_in_journal_count) * (max_degraded + 1) +
3958c2ecf20Sopenharmony_ci *     (group_cnt + 1) * (raid_disks - max_degraded)
3968c2ecf20Sopenharmony_ci */
3978c2ecf20Sopenharmony_cistatic sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
3988c2ecf20Sopenharmony_ci{
3998c2ecf20Sopenharmony_ci	struct r5l_log *log = conf->log;
4008c2ecf20Sopenharmony_ci
4018c2ecf20Sopenharmony_ci	if (!r5c_is_writeback(log))
4028c2ecf20Sopenharmony_ci		return 0;
4038c2ecf20Sopenharmony_ci
4048c2ecf20Sopenharmony_ci	return BLOCK_SECTORS *
4058c2ecf20Sopenharmony_ci		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
4068c2ecf20Sopenharmony_ci		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
4078c2ecf20Sopenharmony_ci}
4088c2ecf20Sopenharmony_ci
4098c2ecf20Sopenharmony_ci/*
4108c2ecf20Sopenharmony_ci * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
4118c2ecf20Sopenharmony_ci *
4128c2ecf20Sopenharmony_ci * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
4138c2ecf20Sopenharmony_ci * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
4148c2ecf20Sopenharmony_ci * device is less than 2x of reclaim_required_space.
4158c2ecf20Sopenharmony_ci */
4168c2ecf20Sopenharmony_cistatic inline void r5c_update_log_state(struct r5l_log *log)
4178c2ecf20Sopenharmony_ci{
4188c2ecf20Sopenharmony_ci	struct r5conf *conf = log->rdev->mddev->private;
4198c2ecf20Sopenharmony_ci	sector_t free_space;
4208c2ecf20Sopenharmony_ci	sector_t reclaim_space;
4218c2ecf20Sopenharmony_ci	bool wake_reclaim = false;
4228c2ecf20Sopenharmony_ci
4238c2ecf20Sopenharmony_ci	if (!r5c_is_writeback(log))
4248c2ecf20Sopenharmony_ci		return;
4258c2ecf20Sopenharmony_ci
4268c2ecf20Sopenharmony_ci	free_space = r5l_ring_distance(log, log->log_start,
4278c2ecf20Sopenharmony_ci				       log->last_checkpoint);
4288c2ecf20Sopenharmony_ci	reclaim_space = r5c_log_required_to_flush_cache(conf);
4298c2ecf20Sopenharmony_ci	if (free_space < 2 * reclaim_space)
4308c2ecf20Sopenharmony_ci		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
4318c2ecf20Sopenharmony_ci	else {
4328c2ecf20Sopenharmony_ci		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
4338c2ecf20Sopenharmony_ci			wake_reclaim = true;
4348c2ecf20Sopenharmony_ci		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
4358c2ecf20Sopenharmony_ci	}
4368c2ecf20Sopenharmony_ci	if (free_space < 3 * reclaim_space)
4378c2ecf20Sopenharmony_ci		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
4388c2ecf20Sopenharmony_ci	else
4398c2ecf20Sopenharmony_ci		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
4408c2ecf20Sopenharmony_ci
4418c2ecf20Sopenharmony_ci	if (wake_reclaim)
4428c2ecf20Sopenharmony_ci		r5l_wake_reclaim(log, 0);
4438c2ecf20Sopenharmony_ci}
4448c2ecf20Sopenharmony_ci
4458c2ecf20Sopenharmony_ci/*
4468c2ecf20Sopenharmony_ci * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
4478c2ecf20Sopenharmony_ci * This function should only be called in write-back mode.
4488c2ecf20Sopenharmony_ci */
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	struct r5l_log *log = conf->log;

	/* only meaningful (and legal) in write-back mode */
	BUG_ON(!r5c_is_writeback(log));

	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(STRIPE_R5C_CACHING, &sh->state);

	/* account the stripe as preread-active exactly once */
	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
		atomic_inc(&conf->preread_active_stripes);
}
4628c2ecf20Sopenharmony_ci
4638c2ecf20Sopenharmony_cistatic void r5c_handle_data_cached(struct stripe_head *sh)
4648c2ecf20Sopenharmony_ci{
4658c2ecf20Sopenharmony_ci	int i;
4668c2ecf20Sopenharmony_ci
4678c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; )
4688c2ecf20Sopenharmony_ci		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
4698c2ecf20Sopenharmony_ci			set_bit(R5_InJournal, &sh->dev[i].flags);
4708c2ecf20Sopenharmony_ci			clear_bit(R5_LOCKED, &sh->dev[i].flags);
4718c2ecf20Sopenharmony_ci		}
4728c2ecf20Sopenharmony_ci	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
4738c2ecf20Sopenharmony_ci}
4748c2ecf20Sopenharmony_ci
4758c2ecf20Sopenharmony_ci/*
4768c2ecf20Sopenharmony_ci * this journal write must contain full parity,
4778c2ecf20Sopenharmony_ci * it may also contain some data pages
4788c2ecf20Sopenharmony_ci */
4798c2ecf20Sopenharmony_cistatic void r5c_handle_parity_cached(struct stripe_head *sh)
4808c2ecf20Sopenharmony_ci{
4818c2ecf20Sopenharmony_ci	int i;
4828c2ecf20Sopenharmony_ci
4838c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; )
4848c2ecf20Sopenharmony_ci		if (test_bit(R5_InJournal, &sh->dev[i].flags))
4858c2ecf20Sopenharmony_ci			set_bit(R5_Wantwrite, &sh->dev[i].flags);
4868c2ecf20Sopenharmony_ci}
4878c2ecf20Sopenharmony_ci
4888c2ecf20Sopenharmony_ci/*
4898c2ecf20Sopenharmony_ci * Setting proper flags after writing (or flushing) data and/or parity to the
4908c2ecf20Sopenharmony_ci * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
4918c2ecf20Sopenharmony_ci */
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
	struct r5l_log *log = sh->raid_conf->log;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
		/*
		 * Set R5_InJournal for parity dev[pd_idx]. This means
		 * all data AND parity in the journal. For RAID 6, it is
		 * NOT necessary to set the flag for dev[qd_idx], as the
		 * two parities are written out together.
		 */
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/* write-back, caching phase: only data hit the log */
		r5c_handle_data_cached(sh);
	} else {
		/* write-back, write-out phase: parity (and data) logged */
		r5c_handle_parity_cached(sh);
		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
	}
}
5128c2ecf20Sopenharmony_ci
/*
 * Finish the log phase for every stripe attached to @io: update the
 * cache/journal flags and hand the stripe back to the raid5 state
 * machine (STRIPE_HANDLE), dropping our reference.
 */
static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
	struct stripe_head *sh, *next;

	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
		list_del_init(&sh->log_list);

		r5c_finish_cache_stripe(sh);

		set_bit(STRIPE_HANDLE, &sh->state);
		raid5_release_stripe(sh);
	}
}
5268c2ecf20Sopenharmony_ci
/*
 * Move io_units whose log writes have completed (>= IO_UNIT_IO_END)
 * from running_ios to finished_ios and run their stripes.  Stops at
 * the first io_unit still in flight so list order is preserved.
 */
static void r5l_log_run_stripes(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_IO_END)
			break;

		list_move_tail(&io->log_sibling, &log->finished_ios);
		r5l_io_run_stripes(io);
	}
}
5428c2ecf20Sopenharmony_ci
5438c2ecf20Sopenharmony_cistatic void r5l_move_to_end_ios(struct r5l_log *log)
5448c2ecf20Sopenharmony_ci{
5458c2ecf20Sopenharmony_ci	struct r5l_io_unit *io, *next;
5468c2ecf20Sopenharmony_ci
5478c2ecf20Sopenharmony_ci	lockdep_assert_held(&log->io_list_lock);
5488c2ecf20Sopenharmony_ci
5498c2ecf20Sopenharmony_ci	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
5508c2ecf20Sopenharmony_ci		/* don't change list order */
5518c2ecf20Sopenharmony_ci		if (io->state < IO_UNIT_IO_END)
5528c2ecf20Sopenharmony_ci			break;
5538c2ecf20Sopenharmony_ci		list_move_tail(&io->log_sibling, &log->io_end_ios);
5548c2ecf20Sopenharmony_ci	}
5558c2ecf20Sopenharmony_ci}
5568c2ecf20Sopenharmony_ci
5578c2ecf20Sopenharmony_cistatic void __r5l_stripe_write_finished(struct r5l_io_unit *io);
5588c2ecf20Sopenharmony_cistatic void r5l_log_endio(struct bio *bio)
5598c2ecf20Sopenharmony_ci{
5608c2ecf20Sopenharmony_ci	struct r5l_io_unit *io = bio->bi_private;
5618c2ecf20Sopenharmony_ci	struct r5l_io_unit *io_deferred;
5628c2ecf20Sopenharmony_ci	struct r5l_log *log = io->log;
5638c2ecf20Sopenharmony_ci	unsigned long flags;
5648c2ecf20Sopenharmony_ci	bool has_null_flush;
5658c2ecf20Sopenharmony_ci	bool has_flush_payload;
5668c2ecf20Sopenharmony_ci
5678c2ecf20Sopenharmony_ci	if (bio->bi_status)
5688c2ecf20Sopenharmony_ci		md_error(log->rdev->mddev, log->rdev);
5698c2ecf20Sopenharmony_ci
5708c2ecf20Sopenharmony_ci	bio_put(bio);
5718c2ecf20Sopenharmony_ci	mempool_free(io->meta_page, &log->meta_pool);
5728c2ecf20Sopenharmony_ci
5738c2ecf20Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
5748c2ecf20Sopenharmony_ci	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
5758c2ecf20Sopenharmony_ci
5768c2ecf20Sopenharmony_ci	/*
5778c2ecf20Sopenharmony_ci	 * if the io doesn't not have null_flush or flush payload,
5788c2ecf20Sopenharmony_ci	 * it is not safe to access it after releasing io_list_lock.
5798c2ecf20Sopenharmony_ci	 * Therefore, it is necessary to check the condition with
5808c2ecf20Sopenharmony_ci	 * the lock held.
5818c2ecf20Sopenharmony_ci	 */
5828c2ecf20Sopenharmony_ci	has_null_flush = io->has_null_flush;
5838c2ecf20Sopenharmony_ci	has_flush_payload = io->has_flush_payload;
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci	if (log->need_cache_flush && !list_empty(&io->stripe_list))
5868c2ecf20Sopenharmony_ci		r5l_move_to_end_ios(log);
5878c2ecf20Sopenharmony_ci	else
5888c2ecf20Sopenharmony_ci		r5l_log_run_stripes(log);
5898c2ecf20Sopenharmony_ci	if (!list_empty(&log->running_ios)) {
5908c2ecf20Sopenharmony_ci		/*
5918c2ecf20Sopenharmony_ci		 * FLUSH/FUA io_unit is deferred because of ordering, now we
5928c2ecf20Sopenharmony_ci		 * can dispatch it
5938c2ecf20Sopenharmony_ci		 */
5948c2ecf20Sopenharmony_ci		io_deferred = list_first_entry(&log->running_ios,
5958c2ecf20Sopenharmony_ci					       struct r5l_io_unit, log_sibling);
5968c2ecf20Sopenharmony_ci		if (io_deferred->io_deferred)
5978c2ecf20Sopenharmony_ci			schedule_work(&log->deferred_io_work);
5988c2ecf20Sopenharmony_ci	}
5998c2ecf20Sopenharmony_ci
6008c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
6018c2ecf20Sopenharmony_ci
6028c2ecf20Sopenharmony_ci	if (log->need_cache_flush)
6038c2ecf20Sopenharmony_ci		md_wakeup_thread(log->rdev->mddev->thread);
6048c2ecf20Sopenharmony_ci
6058c2ecf20Sopenharmony_ci	/* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
6068c2ecf20Sopenharmony_ci	if (has_null_flush) {
6078c2ecf20Sopenharmony_ci		struct bio *bi;
6088c2ecf20Sopenharmony_ci
6098c2ecf20Sopenharmony_ci		WARN_ON(bio_list_empty(&io->flush_barriers));
6108c2ecf20Sopenharmony_ci		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
6118c2ecf20Sopenharmony_ci			bio_endio(bi);
6128c2ecf20Sopenharmony_ci			if (atomic_dec_and_test(&io->pending_stripe)) {
6138c2ecf20Sopenharmony_ci				__r5l_stripe_write_finished(io);
6148c2ecf20Sopenharmony_ci				return;
6158c2ecf20Sopenharmony_ci			}
6168c2ecf20Sopenharmony_ci		}
6178c2ecf20Sopenharmony_ci	}
6188c2ecf20Sopenharmony_ci	/* decrease pending_stripe for flush payload */
6198c2ecf20Sopenharmony_ci	if (has_flush_payload)
6208c2ecf20Sopenharmony_ci		if (atomic_dec_and_test(&io->pending_stripe))
6218c2ecf20Sopenharmony_ci			__r5l_stripe_write_finished(io);
6228c2ecf20Sopenharmony_ci}
6238c2ecf20Sopenharmony_ci
6248c2ecf20Sopenharmony_cistatic void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
6258c2ecf20Sopenharmony_ci{
6268c2ecf20Sopenharmony_ci	unsigned long flags;
6278c2ecf20Sopenharmony_ci
6288c2ecf20Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
6298c2ecf20Sopenharmony_ci	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
6308c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
6318c2ecf20Sopenharmony_ci
6328c2ecf20Sopenharmony_ci	/*
6338c2ecf20Sopenharmony_ci	 * In case of journal device failures, submit_bio will get error
6348c2ecf20Sopenharmony_ci	 * and calls endio, then active stripes will continue write
6358c2ecf20Sopenharmony_ci	 * process. Therefore, it is not necessary to check Faulty bit
6368c2ecf20Sopenharmony_ci	 * of journal device here.
6378c2ecf20Sopenharmony_ci	 *
6388c2ecf20Sopenharmony_ci	 * We can't check split_bio after current_bio is submitted. If
6398c2ecf20Sopenharmony_ci	 * io->split_bio is null, after current_bio is submitted, current_bio
6408c2ecf20Sopenharmony_ci	 * might already be completed and the io_unit is freed. We submit
6418c2ecf20Sopenharmony_ci	 * split_bio first to avoid the issue.
6428c2ecf20Sopenharmony_ci	 */
6438c2ecf20Sopenharmony_ci	if (io->split_bio) {
6448c2ecf20Sopenharmony_ci		if (io->has_flush)
6458c2ecf20Sopenharmony_ci			io->split_bio->bi_opf |= REQ_PREFLUSH;
6468c2ecf20Sopenharmony_ci		if (io->has_fua)
6478c2ecf20Sopenharmony_ci			io->split_bio->bi_opf |= REQ_FUA;
6488c2ecf20Sopenharmony_ci		submit_bio(io->split_bio);
6498c2ecf20Sopenharmony_ci	}
6508c2ecf20Sopenharmony_ci
6518c2ecf20Sopenharmony_ci	if (io->has_flush)
6528c2ecf20Sopenharmony_ci		io->current_bio->bi_opf |= REQ_PREFLUSH;
6538c2ecf20Sopenharmony_ci	if (io->has_fua)
6548c2ecf20Sopenharmony_ci		io->current_bio->bi_opf |= REQ_FUA;
6558c2ecf20Sopenharmony_ci	submit_bio(io->current_bio);
6568c2ecf20Sopenharmony_ci}
6578c2ecf20Sopenharmony_ci
6588c2ecf20Sopenharmony_ci/* deferred io_unit will be dispatched here */
6598c2ecf20Sopenharmony_cistatic void r5l_submit_io_async(struct work_struct *work)
6608c2ecf20Sopenharmony_ci{
6618c2ecf20Sopenharmony_ci	struct r5l_log *log = container_of(work, struct r5l_log,
6628c2ecf20Sopenharmony_ci					   deferred_io_work);
6638c2ecf20Sopenharmony_ci	struct r5l_io_unit *io = NULL;
6648c2ecf20Sopenharmony_ci	unsigned long flags;
6658c2ecf20Sopenharmony_ci
6668c2ecf20Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
6678c2ecf20Sopenharmony_ci	if (!list_empty(&log->running_ios)) {
6688c2ecf20Sopenharmony_ci		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
6698c2ecf20Sopenharmony_ci				      log_sibling);
6708c2ecf20Sopenharmony_ci		if (!io->io_deferred)
6718c2ecf20Sopenharmony_ci			io = NULL;
6728c2ecf20Sopenharmony_ci		else
6738c2ecf20Sopenharmony_ci			io->io_deferred = 0;
6748c2ecf20Sopenharmony_ci	}
6758c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
6768c2ecf20Sopenharmony_ci	if (io)
6778c2ecf20Sopenharmony_ci		r5l_do_submit_io(log, io);
6788c2ecf20Sopenharmony_ci}
6798c2ecf20Sopenharmony_ci
6808c2ecf20Sopenharmony_cistatic void r5c_disable_writeback_async(struct work_struct *work)
6818c2ecf20Sopenharmony_ci{
6828c2ecf20Sopenharmony_ci	struct r5l_log *log = container_of(work, struct r5l_log,
6838c2ecf20Sopenharmony_ci					   disable_writeback_work);
6848c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
6858c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
6868c2ecf20Sopenharmony_ci	int locked = 0;
6878c2ecf20Sopenharmony_ci
6888c2ecf20Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
6898c2ecf20Sopenharmony_ci		return;
6908c2ecf20Sopenharmony_ci	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
6918c2ecf20Sopenharmony_ci		mdname(mddev));
6928c2ecf20Sopenharmony_ci
6938c2ecf20Sopenharmony_ci	/* wait superblock change before suspend */
6948c2ecf20Sopenharmony_ci	wait_event(mddev->sb_wait,
6958c2ecf20Sopenharmony_ci		   conf->log == NULL ||
6968c2ecf20Sopenharmony_ci		   (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
6978c2ecf20Sopenharmony_ci		    (locked = mddev_trylock(mddev))));
6988c2ecf20Sopenharmony_ci	if (locked) {
6998c2ecf20Sopenharmony_ci		mddev_suspend(mddev);
7008c2ecf20Sopenharmony_ci		log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
7018c2ecf20Sopenharmony_ci		mddev_resume(mddev);
7028c2ecf20Sopenharmony_ci		mddev_unlock(mddev);
7038c2ecf20Sopenharmony_ci	}
7048c2ecf20Sopenharmony_ci}
7058c2ecf20Sopenharmony_ci
7068c2ecf20Sopenharmony_cistatic void r5l_submit_current_io(struct r5l_log *log)
7078c2ecf20Sopenharmony_ci{
7088c2ecf20Sopenharmony_ci	struct r5l_io_unit *io = log->current_io;
7098c2ecf20Sopenharmony_ci	struct r5l_meta_block *block;
7108c2ecf20Sopenharmony_ci	unsigned long flags;
7118c2ecf20Sopenharmony_ci	u32 crc;
7128c2ecf20Sopenharmony_ci	bool do_submit = true;
7138c2ecf20Sopenharmony_ci
7148c2ecf20Sopenharmony_ci	if (!io)
7158c2ecf20Sopenharmony_ci		return;
7168c2ecf20Sopenharmony_ci
7178c2ecf20Sopenharmony_ci	block = page_address(io->meta_page);
7188c2ecf20Sopenharmony_ci	block->meta_size = cpu_to_le32(io->meta_offset);
7198c2ecf20Sopenharmony_ci	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
7208c2ecf20Sopenharmony_ci	block->checksum = cpu_to_le32(crc);
7218c2ecf20Sopenharmony_ci
7228c2ecf20Sopenharmony_ci	log->current_io = NULL;
7238c2ecf20Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
7248c2ecf20Sopenharmony_ci	if (io->has_flush || io->has_fua) {
7258c2ecf20Sopenharmony_ci		if (io != list_first_entry(&log->running_ios,
7268c2ecf20Sopenharmony_ci					   struct r5l_io_unit, log_sibling)) {
7278c2ecf20Sopenharmony_ci			io->io_deferred = 1;
7288c2ecf20Sopenharmony_ci			do_submit = false;
7298c2ecf20Sopenharmony_ci		}
7308c2ecf20Sopenharmony_ci	}
7318c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
7328c2ecf20Sopenharmony_ci	if (do_submit)
7338c2ecf20Sopenharmony_ci		r5l_do_submit_io(log, io);
7348c2ecf20Sopenharmony_ci}
7358c2ecf20Sopenharmony_ci
7368c2ecf20Sopenharmony_cistatic struct bio *r5l_bio_alloc(struct r5l_log *log)
7378c2ecf20Sopenharmony_ci{
7388c2ecf20Sopenharmony_ci	struct bio *bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, &log->bs);
7398c2ecf20Sopenharmony_ci
7408c2ecf20Sopenharmony_ci	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
7418c2ecf20Sopenharmony_ci	bio_set_dev(bio, log->rdev->bdev);
7428c2ecf20Sopenharmony_ci	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
7438c2ecf20Sopenharmony_ci
7448c2ecf20Sopenharmony_ci	return bio;
7458c2ecf20Sopenharmony_ci}
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_cistatic void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
7488c2ecf20Sopenharmony_ci{
7498c2ecf20Sopenharmony_ci	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
7508c2ecf20Sopenharmony_ci
7518c2ecf20Sopenharmony_ci	r5c_update_log_state(log);
7528c2ecf20Sopenharmony_ci	/*
7538c2ecf20Sopenharmony_ci	 * If we filled up the log device start from the beginning again,
7548c2ecf20Sopenharmony_ci	 * which will require a new bio.
7558c2ecf20Sopenharmony_ci	 *
7568c2ecf20Sopenharmony_ci	 * Note: for this to work properly the log size needs to me a multiple
7578c2ecf20Sopenharmony_ci	 * of BLOCK_SECTORS.
7588c2ecf20Sopenharmony_ci	 */
7598c2ecf20Sopenharmony_ci	if (log->log_start == 0)
7608c2ecf20Sopenharmony_ci		io->need_split_bio = true;
7618c2ecf20Sopenharmony_ci
7628c2ecf20Sopenharmony_ci	io->log_end = log->log_start;
7638c2ecf20Sopenharmony_ci}
7648c2ecf20Sopenharmony_ci
7658c2ecf20Sopenharmony_cistatic struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
7668c2ecf20Sopenharmony_ci{
7678c2ecf20Sopenharmony_ci	struct r5l_io_unit *io;
7688c2ecf20Sopenharmony_ci	struct r5l_meta_block *block;
7698c2ecf20Sopenharmony_ci
7708c2ecf20Sopenharmony_ci	io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
7718c2ecf20Sopenharmony_ci	if (!io)
7728c2ecf20Sopenharmony_ci		return NULL;
7738c2ecf20Sopenharmony_ci	memset(io, 0, sizeof(*io));
7748c2ecf20Sopenharmony_ci
7758c2ecf20Sopenharmony_ci	io->log = log;
7768c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&io->log_sibling);
7778c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&io->stripe_list);
7788c2ecf20Sopenharmony_ci	bio_list_init(&io->flush_barriers);
7798c2ecf20Sopenharmony_ci	io->state = IO_UNIT_RUNNING;
7808c2ecf20Sopenharmony_ci
7818c2ecf20Sopenharmony_ci	io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
7828c2ecf20Sopenharmony_ci	block = page_address(io->meta_page);
7838c2ecf20Sopenharmony_ci	clear_page(block);
7848c2ecf20Sopenharmony_ci	block->magic = cpu_to_le32(R5LOG_MAGIC);
7858c2ecf20Sopenharmony_ci	block->version = R5LOG_VERSION;
7868c2ecf20Sopenharmony_ci	block->seq = cpu_to_le64(log->seq);
7878c2ecf20Sopenharmony_ci	block->position = cpu_to_le64(log->log_start);
7888c2ecf20Sopenharmony_ci
7898c2ecf20Sopenharmony_ci	io->log_start = log->log_start;
7908c2ecf20Sopenharmony_ci	io->meta_offset = sizeof(struct r5l_meta_block);
7918c2ecf20Sopenharmony_ci	io->seq = log->seq++;
7928c2ecf20Sopenharmony_ci
7938c2ecf20Sopenharmony_ci	io->current_bio = r5l_bio_alloc(log);
7948c2ecf20Sopenharmony_ci	io->current_bio->bi_end_io = r5l_log_endio;
7958c2ecf20Sopenharmony_ci	io->current_bio->bi_private = io;
7968c2ecf20Sopenharmony_ci	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
7978c2ecf20Sopenharmony_ci
7988c2ecf20Sopenharmony_ci	r5_reserve_log_entry(log, io);
7998c2ecf20Sopenharmony_ci
8008c2ecf20Sopenharmony_ci	spin_lock_irq(&log->io_list_lock);
8018c2ecf20Sopenharmony_ci	list_add_tail(&io->log_sibling, &log->running_ios);
8028c2ecf20Sopenharmony_ci	spin_unlock_irq(&log->io_list_lock);
8038c2ecf20Sopenharmony_ci
8048c2ecf20Sopenharmony_ci	return io;
8058c2ecf20Sopenharmony_ci}
8068c2ecf20Sopenharmony_ci
8078c2ecf20Sopenharmony_cistatic int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
8088c2ecf20Sopenharmony_ci{
8098c2ecf20Sopenharmony_ci	if (log->current_io &&
8108c2ecf20Sopenharmony_ci	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
8118c2ecf20Sopenharmony_ci		r5l_submit_current_io(log);
8128c2ecf20Sopenharmony_ci
8138c2ecf20Sopenharmony_ci	if (!log->current_io) {
8148c2ecf20Sopenharmony_ci		log->current_io = r5l_new_meta(log);
8158c2ecf20Sopenharmony_ci		if (!log->current_io)
8168c2ecf20Sopenharmony_ci			return -ENOMEM;
8178c2ecf20Sopenharmony_ci	}
8188c2ecf20Sopenharmony_ci
8198c2ecf20Sopenharmony_ci	return 0;
8208c2ecf20Sopenharmony_ci}
8218c2ecf20Sopenharmony_ci
8228c2ecf20Sopenharmony_cistatic void r5l_append_payload_meta(struct r5l_log *log, u16 type,
8238c2ecf20Sopenharmony_ci				    sector_t location,
8248c2ecf20Sopenharmony_ci				    u32 checksum1, u32 checksum2,
8258c2ecf20Sopenharmony_ci				    bool checksum2_valid)
8268c2ecf20Sopenharmony_ci{
8278c2ecf20Sopenharmony_ci	struct r5l_io_unit *io = log->current_io;
8288c2ecf20Sopenharmony_ci	struct r5l_payload_data_parity *payload;
8298c2ecf20Sopenharmony_ci
8308c2ecf20Sopenharmony_ci	payload = page_address(io->meta_page) + io->meta_offset;
8318c2ecf20Sopenharmony_ci	payload->header.type = cpu_to_le16(type);
8328c2ecf20Sopenharmony_ci	payload->header.flags = cpu_to_le16(0);
8338c2ecf20Sopenharmony_ci	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
8348c2ecf20Sopenharmony_ci				    (PAGE_SHIFT - 9));
8358c2ecf20Sopenharmony_ci	payload->location = cpu_to_le64(location);
8368c2ecf20Sopenharmony_ci	payload->checksum[0] = cpu_to_le32(checksum1);
8378c2ecf20Sopenharmony_ci	if (checksum2_valid)
8388c2ecf20Sopenharmony_ci		payload->checksum[1] = cpu_to_le32(checksum2);
8398c2ecf20Sopenharmony_ci
8408c2ecf20Sopenharmony_ci	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
8418c2ecf20Sopenharmony_ci		sizeof(__le32) * (1 + !!checksum2_valid);
8428c2ecf20Sopenharmony_ci}
8438c2ecf20Sopenharmony_ci
8448c2ecf20Sopenharmony_cistatic void r5l_append_payload_page(struct r5l_log *log, struct page *page)
8458c2ecf20Sopenharmony_ci{
8468c2ecf20Sopenharmony_ci	struct r5l_io_unit *io = log->current_io;
8478c2ecf20Sopenharmony_ci
8488c2ecf20Sopenharmony_ci	if (io->need_split_bio) {
8498c2ecf20Sopenharmony_ci		BUG_ON(io->split_bio);
8508c2ecf20Sopenharmony_ci		io->split_bio = io->current_bio;
8518c2ecf20Sopenharmony_ci		io->current_bio = r5l_bio_alloc(log);
8528c2ecf20Sopenharmony_ci		bio_chain(io->current_bio, io->split_bio);
8538c2ecf20Sopenharmony_ci		io->need_split_bio = false;
8548c2ecf20Sopenharmony_ci	}
8558c2ecf20Sopenharmony_ci
8568c2ecf20Sopenharmony_ci	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
8578c2ecf20Sopenharmony_ci		BUG();
8588c2ecf20Sopenharmony_ci
8598c2ecf20Sopenharmony_ci	r5_reserve_log_entry(log, io);
8608c2ecf20Sopenharmony_ci}
8618c2ecf20Sopenharmony_ci
8628c2ecf20Sopenharmony_cistatic void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
8638c2ecf20Sopenharmony_ci{
8648c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
8658c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
8668c2ecf20Sopenharmony_ci	struct r5l_io_unit *io;
8678c2ecf20Sopenharmony_ci	struct r5l_payload_flush *payload;
8688c2ecf20Sopenharmony_ci	int meta_size;
8698c2ecf20Sopenharmony_ci
8708c2ecf20Sopenharmony_ci	/*
8718c2ecf20Sopenharmony_ci	 * payload_flush requires extra writes to the journal.
8728c2ecf20Sopenharmony_ci	 * To avoid handling the extra IO in quiesce, just skip
8738c2ecf20Sopenharmony_ci	 * flush_payload
8748c2ecf20Sopenharmony_ci	 */
8758c2ecf20Sopenharmony_ci	if (conf->quiesce)
8768c2ecf20Sopenharmony_ci		return;
8778c2ecf20Sopenharmony_ci
8788c2ecf20Sopenharmony_ci	mutex_lock(&log->io_mutex);
8798c2ecf20Sopenharmony_ci	meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
8808c2ecf20Sopenharmony_ci
8818c2ecf20Sopenharmony_ci	if (r5l_get_meta(log, meta_size)) {
8828c2ecf20Sopenharmony_ci		mutex_unlock(&log->io_mutex);
8838c2ecf20Sopenharmony_ci		return;
8848c2ecf20Sopenharmony_ci	}
8858c2ecf20Sopenharmony_ci
8868c2ecf20Sopenharmony_ci	/* current implementation is one stripe per flush payload */
8878c2ecf20Sopenharmony_ci	io = log->current_io;
8888c2ecf20Sopenharmony_ci	payload = page_address(io->meta_page) + io->meta_offset;
8898c2ecf20Sopenharmony_ci	payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
8908c2ecf20Sopenharmony_ci	payload->header.flags = cpu_to_le16(0);
8918c2ecf20Sopenharmony_ci	payload->size = cpu_to_le32(sizeof(__le64));
8928c2ecf20Sopenharmony_ci	payload->flush_stripes[0] = cpu_to_le64(sect);
8938c2ecf20Sopenharmony_ci	io->meta_offset += meta_size;
8948c2ecf20Sopenharmony_ci	/* multiple flush payloads count as one pending_stripe */
8958c2ecf20Sopenharmony_ci	if (!io->has_flush_payload) {
8968c2ecf20Sopenharmony_ci		io->has_flush_payload = 1;
8978c2ecf20Sopenharmony_ci		atomic_inc(&io->pending_stripe);
8988c2ecf20Sopenharmony_ci	}
8998c2ecf20Sopenharmony_ci	mutex_unlock(&log->io_mutex);
9008c2ecf20Sopenharmony_ci}
9018c2ecf20Sopenharmony_ci
9028c2ecf20Sopenharmony_cistatic int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
9038c2ecf20Sopenharmony_ci			   int data_pages, int parity_pages)
9048c2ecf20Sopenharmony_ci{
9058c2ecf20Sopenharmony_ci	int i;
9068c2ecf20Sopenharmony_ci	int meta_size;
9078c2ecf20Sopenharmony_ci	int ret;
9088c2ecf20Sopenharmony_ci	struct r5l_io_unit *io;
9098c2ecf20Sopenharmony_ci
9108c2ecf20Sopenharmony_ci	meta_size =
9118c2ecf20Sopenharmony_ci		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
9128c2ecf20Sopenharmony_ci		 * data_pages) +
9138c2ecf20Sopenharmony_ci		sizeof(struct r5l_payload_data_parity) +
9148c2ecf20Sopenharmony_ci		sizeof(__le32) * parity_pages;
9158c2ecf20Sopenharmony_ci
9168c2ecf20Sopenharmony_ci	ret = r5l_get_meta(log, meta_size);
9178c2ecf20Sopenharmony_ci	if (ret)
9188c2ecf20Sopenharmony_ci		return ret;
9198c2ecf20Sopenharmony_ci
9208c2ecf20Sopenharmony_ci	io = log->current_io;
9218c2ecf20Sopenharmony_ci
9228c2ecf20Sopenharmony_ci	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
9238c2ecf20Sopenharmony_ci		io->has_flush = 1;
9248c2ecf20Sopenharmony_ci
9258c2ecf20Sopenharmony_ci	for (i = 0; i < sh->disks; i++) {
9268c2ecf20Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
9278c2ecf20Sopenharmony_ci		    test_bit(R5_InJournal, &sh->dev[i].flags))
9288c2ecf20Sopenharmony_ci			continue;
9298c2ecf20Sopenharmony_ci		if (i == sh->pd_idx || i == sh->qd_idx)
9308c2ecf20Sopenharmony_ci			continue;
9318c2ecf20Sopenharmony_ci		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
9328c2ecf20Sopenharmony_ci		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
9338c2ecf20Sopenharmony_ci			io->has_fua = 1;
9348c2ecf20Sopenharmony_ci			/*
9358c2ecf20Sopenharmony_ci			 * we need to flush journal to make sure recovery can
9368c2ecf20Sopenharmony_ci			 * reach the data with fua flag
9378c2ecf20Sopenharmony_ci			 */
9388c2ecf20Sopenharmony_ci			io->has_flush = 1;
9398c2ecf20Sopenharmony_ci		}
9408c2ecf20Sopenharmony_ci		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
9418c2ecf20Sopenharmony_ci					raid5_compute_blocknr(sh, i, 0),
9428c2ecf20Sopenharmony_ci					sh->dev[i].log_checksum, 0, false);
9438c2ecf20Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[i].page);
9448c2ecf20Sopenharmony_ci	}
9458c2ecf20Sopenharmony_ci
9468c2ecf20Sopenharmony_ci	if (parity_pages == 2) {
9478c2ecf20Sopenharmony_ci		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
9488c2ecf20Sopenharmony_ci					sh->sector, sh->dev[sh->pd_idx].log_checksum,
9498c2ecf20Sopenharmony_ci					sh->dev[sh->qd_idx].log_checksum, true);
9508c2ecf20Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
9518c2ecf20Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
9528c2ecf20Sopenharmony_ci	} else if (parity_pages == 1) {
9538c2ecf20Sopenharmony_ci		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
9548c2ecf20Sopenharmony_ci					sh->sector, sh->dev[sh->pd_idx].log_checksum,
9558c2ecf20Sopenharmony_ci					0, false);
9568c2ecf20Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
9578c2ecf20Sopenharmony_ci	} else  /* Just writing data, not parity, in caching phase */
9588c2ecf20Sopenharmony_ci		BUG_ON(parity_pages != 0);
9598c2ecf20Sopenharmony_ci
9608c2ecf20Sopenharmony_ci	list_add_tail(&sh->log_list, &io->stripe_list);
9618c2ecf20Sopenharmony_ci	atomic_inc(&io->pending_stripe);
9628c2ecf20Sopenharmony_ci	sh->log_io = io;
9638c2ecf20Sopenharmony_ci
9648c2ecf20Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
9658c2ecf20Sopenharmony_ci		return 0;
9668c2ecf20Sopenharmony_ci
9678c2ecf20Sopenharmony_ci	if (sh->log_start == MaxSector) {
9688c2ecf20Sopenharmony_ci		BUG_ON(!list_empty(&sh->r5c));
9698c2ecf20Sopenharmony_ci		sh->log_start = io->log_start;
9708c2ecf20Sopenharmony_ci		spin_lock_irq(&log->stripe_in_journal_lock);
9718c2ecf20Sopenharmony_ci		list_add_tail(&sh->r5c,
9728c2ecf20Sopenharmony_ci			      &log->stripe_in_journal_list);
9738c2ecf20Sopenharmony_ci		spin_unlock_irq(&log->stripe_in_journal_lock);
9748c2ecf20Sopenharmony_ci		atomic_inc(&log->stripe_in_journal_count);
9758c2ecf20Sopenharmony_ci	}
9768c2ecf20Sopenharmony_ci	return 0;
9778c2ecf20Sopenharmony_ci}
9788c2ecf20Sopenharmony_ci
9798c2ecf20Sopenharmony_ci/* add stripe to no_space_stripes, and then wake up reclaim */
9808c2ecf20Sopenharmony_cistatic inline void r5l_add_no_space_stripe(struct r5l_log *log,
9818c2ecf20Sopenharmony_ci					   struct stripe_head *sh)
9828c2ecf20Sopenharmony_ci{
9838c2ecf20Sopenharmony_ci	spin_lock(&log->no_space_stripes_lock);
9848c2ecf20Sopenharmony_ci	list_add_tail(&sh->log_list, &log->no_space_stripes);
9858c2ecf20Sopenharmony_ci	spin_unlock(&log->no_space_stripes_lock);
9868c2ecf20Sopenharmony_ci}
9878c2ecf20Sopenharmony_ci
9888c2ecf20Sopenharmony_ci/*
9898c2ecf20Sopenharmony_ci * running in raid5d, where reclaim could wait for raid5d too (when it flushes
9908c2ecf20Sopenharmony_ci * data from log to raid disks), so we shouldn't wait for reclaim here
9918c2ecf20Sopenharmony_ci */
9928c2ecf20Sopenharmony_ciint r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
9938c2ecf20Sopenharmony_ci{
9948c2ecf20Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
9958c2ecf20Sopenharmony_ci	int write_disks = 0;
9968c2ecf20Sopenharmony_ci	int data_pages, parity_pages;
9978c2ecf20Sopenharmony_ci	int reserve;
9988c2ecf20Sopenharmony_ci	int i;
9998c2ecf20Sopenharmony_ci	int ret = 0;
10008c2ecf20Sopenharmony_ci	bool wake_reclaim = false;
10018c2ecf20Sopenharmony_ci
10028c2ecf20Sopenharmony_ci	if (!log)
10038c2ecf20Sopenharmony_ci		return -EAGAIN;
10048c2ecf20Sopenharmony_ci	/* Don't support stripe batch */
10058c2ecf20Sopenharmony_ci	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
10068c2ecf20Sopenharmony_ci	    test_bit(STRIPE_SYNCING, &sh->state)) {
10078c2ecf20Sopenharmony_ci		/* the stripe is written to log, we start writing it to raid */
10088c2ecf20Sopenharmony_ci		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
10098c2ecf20Sopenharmony_ci		return -EAGAIN;
10108c2ecf20Sopenharmony_ci	}
10118c2ecf20Sopenharmony_ci
10128c2ecf20Sopenharmony_ci	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
10138c2ecf20Sopenharmony_ci
10148c2ecf20Sopenharmony_ci	for (i = 0; i < sh->disks; i++) {
10158c2ecf20Sopenharmony_ci		void *addr;
10168c2ecf20Sopenharmony_ci
10178c2ecf20Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
10188c2ecf20Sopenharmony_ci		    test_bit(R5_InJournal, &sh->dev[i].flags))
10198c2ecf20Sopenharmony_ci			continue;
10208c2ecf20Sopenharmony_ci
10218c2ecf20Sopenharmony_ci		write_disks++;
10228c2ecf20Sopenharmony_ci		/* checksum is already calculated in last run */
10238c2ecf20Sopenharmony_ci		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
10248c2ecf20Sopenharmony_ci			continue;
10258c2ecf20Sopenharmony_ci		addr = kmap_atomic(sh->dev[i].page);
10268c2ecf20Sopenharmony_ci		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
10278c2ecf20Sopenharmony_ci						    addr, PAGE_SIZE);
10288c2ecf20Sopenharmony_ci		kunmap_atomic(addr);
10298c2ecf20Sopenharmony_ci	}
10308c2ecf20Sopenharmony_ci	parity_pages = 1 + !!(sh->qd_idx >= 0);
10318c2ecf20Sopenharmony_ci	data_pages = write_disks - parity_pages;
10328c2ecf20Sopenharmony_ci
10338c2ecf20Sopenharmony_ci	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
10348c2ecf20Sopenharmony_ci	/*
10358c2ecf20Sopenharmony_ci	 * The stripe must enter state machine again to finish the write, so
10368c2ecf20Sopenharmony_ci	 * don't delay.
10378c2ecf20Sopenharmony_ci	 */
10388c2ecf20Sopenharmony_ci	clear_bit(STRIPE_DELAYED, &sh->state);
10398c2ecf20Sopenharmony_ci	atomic_inc(&sh->count);
10408c2ecf20Sopenharmony_ci
10418c2ecf20Sopenharmony_ci	mutex_lock(&log->io_mutex);
10428c2ecf20Sopenharmony_ci	/* meta + data */
10438c2ecf20Sopenharmony_ci	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
10448c2ecf20Sopenharmony_ci
10458c2ecf20Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
10468c2ecf20Sopenharmony_ci		if (!r5l_has_free_space(log, reserve)) {
10478c2ecf20Sopenharmony_ci			r5l_add_no_space_stripe(log, sh);
10488c2ecf20Sopenharmony_ci			wake_reclaim = true;
10498c2ecf20Sopenharmony_ci		} else {
10508c2ecf20Sopenharmony_ci			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
10518c2ecf20Sopenharmony_ci			if (ret) {
10528c2ecf20Sopenharmony_ci				spin_lock_irq(&log->io_list_lock);
10538c2ecf20Sopenharmony_ci				list_add_tail(&sh->log_list,
10548c2ecf20Sopenharmony_ci					      &log->no_mem_stripes);
10558c2ecf20Sopenharmony_ci				spin_unlock_irq(&log->io_list_lock);
10568c2ecf20Sopenharmony_ci			}
10578c2ecf20Sopenharmony_ci		}
10588c2ecf20Sopenharmony_ci	} else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
10598c2ecf20Sopenharmony_ci		/*
10608c2ecf20Sopenharmony_ci		 * log space critical, do not process stripes that are
10618c2ecf20Sopenharmony_ci		 * not in cache yet (sh->log_start == MaxSector).
10628c2ecf20Sopenharmony_ci		 */
10638c2ecf20Sopenharmony_ci		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
10648c2ecf20Sopenharmony_ci		    sh->log_start == MaxSector) {
10658c2ecf20Sopenharmony_ci			r5l_add_no_space_stripe(log, sh);
10668c2ecf20Sopenharmony_ci			wake_reclaim = true;
10678c2ecf20Sopenharmony_ci			reserve = 0;
10688c2ecf20Sopenharmony_ci		} else if (!r5l_has_free_space(log, reserve)) {
10698c2ecf20Sopenharmony_ci			if (sh->log_start == log->last_checkpoint)
10708c2ecf20Sopenharmony_ci				BUG();
10718c2ecf20Sopenharmony_ci			else
10728c2ecf20Sopenharmony_ci				r5l_add_no_space_stripe(log, sh);
10738c2ecf20Sopenharmony_ci		} else {
10748c2ecf20Sopenharmony_ci			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
10758c2ecf20Sopenharmony_ci			if (ret) {
10768c2ecf20Sopenharmony_ci				spin_lock_irq(&log->io_list_lock);
10778c2ecf20Sopenharmony_ci				list_add_tail(&sh->log_list,
10788c2ecf20Sopenharmony_ci					      &log->no_mem_stripes);
10798c2ecf20Sopenharmony_ci				spin_unlock_irq(&log->io_list_lock);
10808c2ecf20Sopenharmony_ci			}
10818c2ecf20Sopenharmony_ci		}
10828c2ecf20Sopenharmony_ci	}
10838c2ecf20Sopenharmony_ci
10848c2ecf20Sopenharmony_ci	mutex_unlock(&log->io_mutex);
10858c2ecf20Sopenharmony_ci	if (wake_reclaim)
10868c2ecf20Sopenharmony_ci		r5l_wake_reclaim(log, reserve);
10878c2ecf20Sopenharmony_ci	return 0;
10888c2ecf20Sopenharmony_ci}
10898c2ecf20Sopenharmony_ci
10908c2ecf20Sopenharmony_civoid r5l_write_stripe_run(struct r5l_log *log)
10918c2ecf20Sopenharmony_ci{
10928c2ecf20Sopenharmony_ci	if (!log)
10938c2ecf20Sopenharmony_ci		return;
10948c2ecf20Sopenharmony_ci	mutex_lock(&log->io_mutex);
10958c2ecf20Sopenharmony_ci	r5l_submit_current_io(log);
10968c2ecf20Sopenharmony_ci	mutex_unlock(&log->io_mutex);
10978c2ecf20Sopenharmony_ci}
10988c2ecf20Sopenharmony_ci
10998c2ecf20Sopenharmony_ciint r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
11008c2ecf20Sopenharmony_ci{
11018c2ecf20Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
11028c2ecf20Sopenharmony_ci		/*
11038c2ecf20Sopenharmony_ci		 * in write through (journal only)
11048c2ecf20Sopenharmony_ci		 * we flush log disk cache first, then write stripe data to
11058c2ecf20Sopenharmony_ci		 * raid disks. So if bio is finished, the log disk cache is
11068c2ecf20Sopenharmony_ci		 * flushed already. The recovery guarantees we can recovery
11078c2ecf20Sopenharmony_ci		 * the bio from log disk, so we don't need to flush again
11088c2ecf20Sopenharmony_ci		 */
11098c2ecf20Sopenharmony_ci		if (bio->bi_iter.bi_size == 0) {
11108c2ecf20Sopenharmony_ci			bio_endio(bio);
11118c2ecf20Sopenharmony_ci			return 0;
11128c2ecf20Sopenharmony_ci		}
11138c2ecf20Sopenharmony_ci		bio->bi_opf &= ~REQ_PREFLUSH;
11148c2ecf20Sopenharmony_ci	} else {
11158c2ecf20Sopenharmony_ci		/* write back (with cache) */
11168c2ecf20Sopenharmony_ci		if (bio->bi_iter.bi_size == 0) {
11178c2ecf20Sopenharmony_ci			mutex_lock(&log->io_mutex);
11188c2ecf20Sopenharmony_ci			r5l_get_meta(log, 0);
11198c2ecf20Sopenharmony_ci			bio_list_add(&log->current_io->flush_barriers, bio);
11208c2ecf20Sopenharmony_ci			log->current_io->has_flush = 1;
11218c2ecf20Sopenharmony_ci			log->current_io->has_null_flush = 1;
11228c2ecf20Sopenharmony_ci			atomic_inc(&log->current_io->pending_stripe);
11238c2ecf20Sopenharmony_ci			r5l_submit_current_io(log);
11248c2ecf20Sopenharmony_ci			mutex_unlock(&log->io_mutex);
11258c2ecf20Sopenharmony_ci			return 0;
11268c2ecf20Sopenharmony_ci		}
11278c2ecf20Sopenharmony_ci	}
11288c2ecf20Sopenharmony_ci	return -EAGAIN;
11298c2ecf20Sopenharmony_ci}
11308c2ecf20Sopenharmony_ci
11318c2ecf20Sopenharmony_ci/* This will run after log space is reclaimed */
11328c2ecf20Sopenharmony_cistatic void r5l_run_no_space_stripes(struct r5l_log *log)
11338c2ecf20Sopenharmony_ci{
11348c2ecf20Sopenharmony_ci	struct stripe_head *sh;
11358c2ecf20Sopenharmony_ci
11368c2ecf20Sopenharmony_ci	spin_lock(&log->no_space_stripes_lock);
11378c2ecf20Sopenharmony_ci	while (!list_empty(&log->no_space_stripes)) {
11388c2ecf20Sopenharmony_ci		sh = list_first_entry(&log->no_space_stripes,
11398c2ecf20Sopenharmony_ci				      struct stripe_head, log_list);
11408c2ecf20Sopenharmony_ci		list_del_init(&sh->log_list);
11418c2ecf20Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
11428c2ecf20Sopenharmony_ci		raid5_release_stripe(sh);
11438c2ecf20Sopenharmony_ci	}
11448c2ecf20Sopenharmony_ci	spin_unlock(&log->no_space_stripes_lock);
11458c2ecf20Sopenharmony_ci}
11468c2ecf20Sopenharmony_ci
11478c2ecf20Sopenharmony_ci/*
11488c2ecf20Sopenharmony_ci * calculate new last_checkpoint
11498c2ecf20Sopenharmony_ci * for write through mode, returns log->next_checkpoint
11508c2ecf20Sopenharmony_ci * for write back, returns log_start of first sh in stripe_in_journal_list
11518c2ecf20Sopenharmony_ci */
11528c2ecf20Sopenharmony_cistatic sector_t r5c_calculate_new_cp(struct r5conf *conf)
11538c2ecf20Sopenharmony_ci{
11548c2ecf20Sopenharmony_ci	struct stripe_head *sh;
11558c2ecf20Sopenharmony_ci	struct r5l_log *log = conf->log;
11568c2ecf20Sopenharmony_ci	sector_t new_cp;
11578c2ecf20Sopenharmony_ci	unsigned long flags;
11588c2ecf20Sopenharmony_ci
11598c2ecf20Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
11608c2ecf20Sopenharmony_ci		return log->next_checkpoint;
11618c2ecf20Sopenharmony_ci
11628c2ecf20Sopenharmony_ci	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
11638c2ecf20Sopenharmony_ci	if (list_empty(&conf->log->stripe_in_journal_list)) {
11648c2ecf20Sopenharmony_ci		/* all stripes flushed */
11658c2ecf20Sopenharmony_ci		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
11668c2ecf20Sopenharmony_ci		return log->next_checkpoint;
11678c2ecf20Sopenharmony_ci	}
11688c2ecf20Sopenharmony_ci	sh = list_first_entry(&conf->log->stripe_in_journal_list,
11698c2ecf20Sopenharmony_ci			      struct stripe_head, r5c);
11708c2ecf20Sopenharmony_ci	new_cp = sh->log_start;
11718c2ecf20Sopenharmony_ci	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
11728c2ecf20Sopenharmony_ci	return new_cp;
11738c2ecf20Sopenharmony_ci}
11748c2ecf20Sopenharmony_ci
11758c2ecf20Sopenharmony_cistatic sector_t r5l_reclaimable_space(struct r5l_log *log)
11768c2ecf20Sopenharmony_ci{
11778c2ecf20Sopenharmony_ci	struct r5conf *conf = log->rdev->mddev->private;
11788c2ecf20Sopenharmony_ci
11798c2ecf20Sopenharmony_ci	return r5l_ring_distance(log, log->last_checkpoint,
11808c2ecf20Sopenharmony_ci				 r5c_calculate_new_cp(conf));
11818c2ecf20Sopenharmony_ci}
11828c2ecf20Sopenharmony_ci
11838c2ecf20Sopenharmony_cistatic void r5l_run_no_mem_stripe(struct r5l_log *log)
11848c2ecf20Sopenharmony_ci{
11858c2ecf20Sopenharmony_ci	struct stripe_head *sh;
11868c2ecf20Sopenharmony_ci
11878c2ecf20Sopenharmony_ci	lockdep_assert_held(&log->io_list_lock);
11888c2ecf20Sopenharmony_ci
11898c2ecf20Sopenharmony_ci	if (!list_empty(&log->no_mem_stripes)) {
11908c2ecf20Sopenharmony_ci		sh = list_first_entry(&log->no_mem_stripes,
11918c2ecf20Sopenharmony_ci				      struct stripe_head, log_list);
11928c2ecf20Sopenharmony_ci		list_del_init(&sh->log_list);
11938c2ecf20Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
11948c2ecf20Sopenharmony_ci		raid5_release_stripe(sh);
11958c2ecf20Sopenharmony_ci	}
11968c2ecf20Sopenharmony_ci}
11978c2ecf20Sopenharmony_ci
/*
 * Retire io_units at the head of ->finished_ios whose stripes have all
 * reached the raid disks (state >= IO_UNIT_STRIPE_END), advancing
 * log->next_checkpoint past each retired unit.
 * Returns true if at least one io_unit was retired.
 * Caller must hold log->io_list_lock (asserted below).
 */
static bool r5l_complete_finished_ios(struct r5l_log *log)
{
	struct r5l_io_unit *io, *next;
	bool found = false;

	lockdep_assert_held(&log->io_list_lock);

	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
		/* don't change list order */
		if (io->state < IO_UNIT_STRIPE_END)
			break;

		/* log space up to this io_unit's start is now reclaimable */
		log->next_checkpoint = io->log_start;

		list_del(&io->log_sibling);
		mempool_free(io, &log->io_pool);
		/* a freed io_unit may unblock a stripe waiting on the pool */
		r5l_run_no_mem_stripe(log);

		found = true;
	}

	return found;
}
12218c2ecf20Sopenharmony_ci
/*
 * Called once the last pending stripe of @io has been written out to the
 * raid disks.  Marks the io_unit IO_UNIT_STRIPE_END, retires finished
 * io_units, and wakes the reclaim thread when enough space has become
 * reclaimable or the log is tight on space.
 */
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
	struct r5l_log *log = io->log;
	struct r5conf *conf = log->rdev->mddev->private;
	unsigned long flags;

	spin_lock_irqsave(&log->io_list_lock, flags);
	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);

	/* nothing retired, so no new reclaimable space: done */
	if (!r5l_complete_finished_ios(log)) {
		spin_unlock_irqrestore(&log->io_list_lock, flags);
		return;
	}

	if (r5l_reclaimable_space(log) > log->max_free_space ||
	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
		r5l_wake_reclaim(log, 0);

	spin_unlock_irqrestore(&log->io_list_lock, flags);
	/* wake r5l_do_reclaim() waiting for reclaimable space to grow */
	wake_up(&log->iounit_wait);
}
12438c2ecf20Sopenharmony_ci
12448c2ecf20Sopenharmony_civoid r5l_stripe_write_finished(struct stripe_head *sh)
12458c2ecf20Sopenharmony_ci{
12468c2ecf20Sopenharmony_ci	struct r5l_io_unit *io;
12478c2ecf20Sopenharmony_ci
12488c2ecf20Sopenharmony_ci	io = sh->log_io;
12498c2ecf20Sopenharmony_ci	sh->log_io = NULL;
12508c2ecf20Sopenharmony_ci
12518c2ecf20Sopenharmony_ci	if (io && atomic_dec_and_test(&io->pending_stripe))
12528c2ecf20Sopenharmony_ci		__r5l_stripe_write_finished(io);
12538c2ecf20Sopenharmony_ci}
12548c2ecf20Sopenharmony_ci
/*
 * Completion handler for the log-device cache flush issued by
 * r5l_flush_stripe_to_raid().  After the flush, data/parity of the
 * flushing io_units is durable, so their stripes can be dispatched to
 * the raid disks.
 */
static void r5l_log_flush_endio(struct bio *bio)
{
	struct r5l_log *log = container_of(bio, struct r5l_log,
		flush_bio);
	unsigned long flags;
	struct r5l_io_unit *io;

	if (bio->bi_status)
		md_error(log->rdev->mddev, log->rdev);

	spin_lock_irqsave(&log->io_list_lock, flags);
	list_for_each_entry(io, &log->flushing_ios, log_sibling)
		r5l_io_run_stripes(io);
	/* move the whole flushed batch to finished_ios, preserving order */
	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
	spin_unlock_irqrestore(&log->io_list_lock, flags);
}
12718c2ecf20Sopenharmony_ci
/*
 * Starting dispatch IO to raid.
 * io_unit(meta) consists of a log. There is one situation we want to avoid. A
 * broken meta in the middle of a log causes recovery can't find meta at the
 * head of log. If operations require meta at the head persistent in log, we
 * must make sure meta before it persistent in log too. A case is:
 *
 * stripe data/parity is in log, we start write stripe to raid disks. stripe
 * data/parity must be persistent in log before we do the write to raid disks.
 *
 * The solution is we restrictly maintain io_unit list order. In this case, we
 * only write stripes of an io_unit to raid disks till the io_unit is the first
 * one whose data/parity is in log.
 */
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
	bool do_flush;

	/* without a volatile write cache there is nothing to flush */
	if (!log || !log->need_cache_flush)
		return;

	spin_lock_irq(&log->io_list_lock);
	/* flush bio is running */
	if (!list_empty(&log->flushing_ios)) {
		spin_unlock_irq(&log->io_list_lock);
		return;
	}
	/* take the whole io_end_ios batch for this flush */
	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
	do_flush = !list_empty(&log->flushing_ios);
	spin_unlock_irq(&log->io_list_lock);

	if (!do_flush)
		return;
	/* reuse the embedded flush_bio; completion runs r5l_log_flush_endio */
	bio_reset(&log->flush_bio);
	bio_set_dev(&log->flush_bio, log->rdev->bdev);
	log->flush_bio.bi_end_io = r5l_log_flush_endio;
	log->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	submit_bio(&log->flush_bio);
}
13118c2ecf20Sopenharmony_ci
13128c2ecf20Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp);
/*
 * Persist the new log tail in the superblock, then discard the reclaimed
 * log range [last_checkpoint, end).  The range may wrap around the end of
 * the ring, in which case two discards are issued.
 */
static void r5l_write_super_and_discard_space(struct r5l_log *log,
	sector_t end)
{
	struct block_device *bdev = log->rdev->bdev;
	struct mddev *mddev;

	r5l_write_super(log, end);

	/* device without discard support: superblock update is enough */
	if (!blk_queue_discard(bdev_get_queue(bdev)))
		return;

	mddev = log->rdev->mddev;
	/*
	 * Discard could zero data, so before discard we must make sure
	 * superblock is updated to new log tail. Updating superblock (either
	 * directly call md_update_sb() or depend on md thread) must hold
	 * reconfig mutex. On the other hand, raid5_quiesce is called with
	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
	 * for all IO finish, hence waitting for reclaim thread, while reclaim
	 * thread is calling this function and waitting for reconfig mutex. So
	 * there is a deadlock. We workaround this issue with a trylock.
	 * FIXME: we could miss discard if we can't take reconfig mutex
	 */
	set_mask_bits(&mddev->sb_flags, 0,
		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
	if (!mddev_trylock(mddev))
		return;
	md_update_sb(mddev, 1);
	mddev_unlock(mddev);

	/* discard IO error really doesn't matter, ignore it */
	if (log->last_checkpoint < end) {
		/* contiguous range within the ring */
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				end - log->last_checkpoint, GFP_NOIO, 0);
	} else {
		/* range wraps: discard tail of device, then head up to end */
		blkdev_issue_discard(bdev,
				log->last_checkpoint + log->rdev->data_offset,
				log->device_size - log->last_checkpoint,
				GFP_NOIO, 0);
		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
				GFP_NOIO, 0);
	}
}
13578c2ecf20Sopenharmony_ci
/*
 * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
 * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
 *
 * must hold conf->device_lock
 */
static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
{
	/* caller contract: stripe is on a cached list, in caching phase,
	 * and not already queued for handling */
	BUG_ON(list_empty(&sh->lru));
	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));

	/*
	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
	 * raid5_release_stripe() while holding conf->device_lock
	 */
	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
	lockdep_assert_held(&conf->device_lock);

	list_del_init(&sh->lru);
	atomic_inc(&sh->count);

	set_bit(STRIPE_HANDLE, &sh->state);
	atomic_inc(&conf->active_stripes);
	r5c_make_stripe_write_out(sh);

	/* account the stripe as in-flight flushing (partial vs full) */
	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
		atomic_inc(&conf->r5c_flushing_partial_stripes);
	else
		atomic_inc(&conf->r5c_flushing_full_stripes);
	raid5_release_stripe(sh);
}
13908c2ecf20Sopenharmony_ci
/*
 * if num == 0, flush all full stripes
 * if num > 0, flush all full stripes. If less than num full stripes are
 *             flushed, flush some partial stripes until totally num stripes are
 *             flushed or there is no more cached stripes.
 */
void r5c_flush_cache(struct r5conf *conf, int num)
{
	int count;
	struct stripe_head *sh, *next;

	lockdep_assert_held(&conf->device_lock);
	if (!conf->log)
		return;

	count = 0;
	/* full stripes are always flushed first, regardless of num */
	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		count++;
	}

	if (count >= num)
		return;
	/* top up with partial stripes until num total are flushed */
	list_for_each_entry_safe(sh, next,
				 &conf->r5c_partial_stripe_list, lru) {
		r5c_flush_stripe(conf, sh);
		if (++count >= num)
			break;
	}
}
14218c2ecf20Sopenharmony_ci
/*
 * Write-back reclaim: decide how many cached stripes to flush based on
 * stripe-cache pressure, and additionally force out stripes from the head
 * of stripe_in_journal_list when log space is tight.
 */
static void r5c_do_reclaim(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;
	struct stripe_head *sh;
	int count = 0;
	unsigned long flags;
	int total_cached;
	int stripes_to_flush;
	int flushing_partial, flushing_full;

	/* nothing to do in write-through mode */
	if (!r5c_is_writeback(log))
		return;

	/* cached counts still include in-flight flushes; subtract them */
	flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
	flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
		atomic_read(&conf->r5c_cached_full_stripes) -
		flushing_full - flushing_partial;

	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
	    atomic_read(&conf->empty_inactive_list_nr) > 0)
		/*
		 * if stripe cache pressure high, flush all full stripes and
		 * some partial stripes
		 */
		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
		 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
		 R5C_FULL_STRIPE_FLUSH_BATCH(conf))
		/*
		 * if stripe cache pressure moderate, or if there is many full
		 * stripes,flush all full stripes
		 */
		stripes_to_flush = 0;
	else
		/* no need to flush */
		stripes_to_flush = -1;

	if (stripes_to_flush >= 0) {
		spin_lock_irqsave(&conf->device_lock, flags);
		r5c_flush_cache(conf, stripes_to_flush);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}

	/* if log space is tight, flush stripes on stripe_in_journal_list */
	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
		/* nested locks: stripe_in_journal_lock, then device_lock */
		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
		spin_lock(&conf->device_lock);
		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
			/*
			 * stripes on stripe_in_journal_list could be in any
			 * state of the stripe_cache state machine. In this
			 * case, we only want to flush stripe on
			 * r5c_cached_full/partial_stripes. The following
			 * condition makes sure the stripe is on one of the
			 * two lists.
			 */
			if (!list_empty(&sh->lru) &&
			    !test_bit(STRIPE_HANDLE, &sh->state) &&
			    atomic_read(&sh->count) == 0) {
				r5c_flush_stripe(conf, sh);
				if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
					break;
			}
		}
		spin_unlock(&conf->device_lock);
		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
	}

	if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
		r5l_run_no_space_stripes(log);

	md_wakeup_thread(conf->mddev->thread);
}
14968c2ecf20Sopenharmony_ci
/*
 * Reclaim log space up to log->reclaim_target sectors (or until no io_unit
 * remains in flight), then write the superblock, discard the freed range,
 * and advance last_checkpoint.
 */
static void r5l_do_reclaim(struct r5l_log *log)
{
	struct r5conf *conf = log->rdev->mddev->private;
	/* atomically consume the pending reclaim request */
	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
	sector_t reclaimable;
	sector_t next_checkpoint;
	bool write_super;

	spin_lock_irq(&log->io_list_lock);
	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
	/*
	 * move proper io_unit to reclaim list. We should not change the order.
	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
	 * shouldn't reuse space of an unreclaimable io_unit
	 */
	while (1) {
		reclaimable = r5l_reclaimable_space(log);
		if (reclaimable >= reclaim_target ||
		    (list_empty(&log->running_ios) &&
		     list_empty(&log->io_end_ios) &&
		     list_empty(&log->flushing_ios) &&
		     list_empty(&log->finished_ios)))
			break;

		/* let the md thread push stripes out, then wait for progress */
		md_wakeup_thread(log->rdev->mddev->thread);
		wait_event_lock_irq(log->iounit_wait,
				    r5l_reclaimable_space(log) > reclaimable,
				    log->io_list_lock);
	}

	next_checkpoint = r5c_calculate_new_cp(conf);
	spin_unlock_irq(&log->io_list_lock);

	if (reclaimable == 0 || !write_super)
		return;

	/*
	 * write_super will flush cache of each raid disk. We must write super
	 * here, because the log area might be reused soon and we don't want to
	 * confuse recovery
	 */
	r5l_write_super_and_discard_space(log, next_checkpoint);

	mutex_lock(&log->io_mutex);
	log->last_checkpoint = next_checkpoint;
	r5c_update_log_state(log);
	mutex_unlock(&log->io_mutex);

	/* freed space may unblock stripes stalled on log space */
	r5l_run_no_space_stripes(log);
}
15488c2ecf20Sopenharmony_ci
15498c2ecf20Sopenharmony_cistatic void r5l_reclaim_thread(struct md_thread *thread)
15508c2ecf20Sopenharmony_ci{
15518c2ecf20Sopenharmony_ci	struct mddev *mddev = thread->mddev;
15528c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
15538c2ecf20Sopenharmony_ci	struct r5l_log *log = conf->log;
15548c2ecf20Sopenharmony_ci
15558c2ecf20Sopenharmony_ci	if (!log)
15568c2ecf20Sopenharmony_ci		return;
15578c2ecf20Sopenharmony_ci	r5c_do_reclaim(conf);
15588c2ecf20Sopenharmony_ci	r5l_do_reclaim(log);
15598c2ecf20Sopenharmony_ci}
15608c2ecf20Sopenharmony_ci
/*
 * Ask the reclaim thread to free at least @space sectors of log space.
 * The pending target is only ever raised; a smaller request never
 * overwrites a larger one still outstanding.
 */
void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
	unsigned long target;
	unsigned long new = (unsigned long)space; /* overflow in theory */

	if (!log)
		return;
	/* lock-free raise of reclaim_target: retry on concurrent update */
	do {
		target = log->reclaim_target;
		if (new < target)
			return;
	} while (cmpxchg(&log->reclaim_target, target, new) != target);
	md_wakeup_thread(log->reclaim_thread);
}
15758c2ecf20Sopenharmony_ci
15768c2ecf20Sopenharmony_civoid r5l_quiesce(struct r5l_log *log, int quiesce)
15778c2ecf20Sopenharmony_ci{
15788c2ecf20Sopenharmony_ci	struct mddev *mddev;
15798c2ecf20Sopenharmony_ci
15808c2ecf20Sopenharmony_ci	if (quiesce) {
15818c2ecf20Sopenharmony_ci		/* make sure r5l_write_super_and_discard_space exits */
15828c2ecf20Sopenharmony_ci		mddev = log->rdev->mddev;
15838c2ecf20Sopenharmony_ci		wake_up(&mddev->sb_wait);
15848c2ecf20Sopenharmony_ci		kthread_park(log->reclaim_thread->tsk);
15858c2ecf20Sopenharmony_ci		r5l_wake_reclaim(log, MaxSector);
15868c2ecf20Sopenharmony_ci		r5l_do_reclaim(log);
15878c2ecf20Sopenharmony_ci	} else
15888c2ecf20Sopenharmony_ci		kthread_unpark(log->reclaim_thread->tsk);
15898c2ecf20Sopenharmony_ci}
15908c2ecf20Sopenharmony_ci
15918c2ecf20Sopenharmony_cibool r5l_log_disk_error(struct r5conf *conf)
15928c2ecf20Sopenharmony_ci{
15938c2ecf20Sopenharmony_ci	struct r5l_log *log;
15948c2ecf20Sopenharmony_ci	bool ret;
15958c2ecf20Sopenharmony_ci	/* don't allow write if journal disk is missing */
15968c2ecf20Sopenharmony_ci	rcu_read_lock();
15978c2ecf20Sopenharmony_ci	log = rcu_dereference(conf->log);
15988c2ecf20Sopenharmony_ci
15998c2ecf20Sopenharmony_ci	if (!log)
16008c2ecf20Sopenharmony_ci		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
16018c2ecf20Sopenharmony_ci	else
16028c2ecf20Sopenharmony_ci		ret = test_bit(Faulty, &log->rdev->flags);
16038c2ecf20Sopenharmony_ci	rcu_read_unlock();
16048c2ecf20Sopenharmony_ci	return ret;
16058c2ecf20Sopenharmony_ci}
16068c2ecf20Sopenharmony_ci
16078c2ecf20Sopenharmony_ci#define R5L_RECOVERY_PAGE_POOL_SIZE 256
16088c2ecf20Sopenharmony_ci
/* per-run state for journal recovery (see r5l_recovery_log callers) */
struct r5l_recovery_ctx {
	struct page *meta_page;		/* current meta */
	sector_t meta_total_blocks;	/* total size of current meta and data */
	sector_t pos;			/* recovery position */
	u64 seq;			/* recovery position seq */
	int data_parity_stripes;	/* number of data_parity stripes */
	int data_only_stripes;		/* number of data_only stripes */
	struct list_head cached_list;	/* stripes rebuilt from the log */

	/*
	 * read ahead page pool (ra_pool)
	 * in recovery, log is read sequentially. It is not efficient to
	 * read every page with sync_page_io(). The read ahead page pool
	 * reads multiple pages with one IO, so further log read can
	 * just copy data from the pool.
	 */
	struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
	sector_t pool_offset;	/* offset of first page in the pool */
	int total_pages;	/* total allocated pages */
	int valid_pages;	/* pages with valid data */
	struct bio *ra_bio;	/* bio to do the read ahead */
};
16318c2ecf20Sopenharmony_ci
16328c2ecf20Sopenharmony_cistatic int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
16338c2ecf20Sopenharmony_ci					    struct r5l_recovery_ctx *ctx)
16348c2ecf20Sopenharmony_ci{
16358c2ecf20Sopenharmony_ci	struct page *page;
16368c2ecf20Sopenharmony_ci
16378c2ecf20Sopenharmony_ci	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, &log->bs);
16388c2ecf20Sopenharmony_ci	if (!ctx->ra_bio)
16398c2ecf20Sopenharmony_ci		return -ENOMEM;
16408c2ecf20Sopenharmony_ci
16418c2ecf20Sopenharmony_ci	ctx->valid_pages = 0;
16428c2ecf20Sopenharmony_ci	ctx->total_pages = 0;
16438c2ecf20Sopenharmony_ci	while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
16448c2ecf20Sopenharmony_ci		page = alloc_page(GFP_KERNEL);
16458c2ecf20Sopenharmony_ci
16468c2ecf20Sopenharmony_ci		if (!page)
16478c2ecf20Sopenharmony_ci			break;
16488c2ecf20Sopenharmony_ci		ctx->ra_pool[ctx->total_pages] = page;
16498c2ecf20Sopenharmony_ci		ctx->total_pages += 1;
16508c2ecf20Sopenharmony_ci	}
16518c2ecf20Sopenharmony_ci
16528c2ecf20Sopenharmony_ci	if (ctx->total_pages == 0) {
16538c2ecf20Sopenharmony_ci		bio_put(ctx->ra_bio);
16548c2ecf20Sopenharmony_ci		return -ENOMEM;
16558c2ecf20Sopenharmony_ci	}
16568c2ecf20Sopenharmony_ci
16578c2ecf20Sopenharmony_ci	ctx->pool_offset = 0;
16588c2ecf20Sopenharmony_ci	return 0;
16598c2ecf20Sopenharmony_ci}
16608c2ecf20Sopenharmony_ci
16618c2ecf20Sopenharmony_cistatic void r5l_recovery_free_ra_pool(struct r5l_log *log,
16628c2ecf20Sopenharmony_ci					struct r5l_recovery_ctx *ctx)
16638c2ecf20Sopenharmony_ci{
16648c2ecf20Sopenharmony_ci	int i;
16658c2ecf20Sopenharmony_ci
16668c2ecf20Sopenharmony_ci	for (i = 0; i < ctx->total_pages; ++i)
16678c2ecf20Sopenharmony_ci		put_page(ctx->ra_pool[i]);
16688c2ecf20Sopenharmony_ci	bio_put(ctx->ra_bio);
16698c2ecf20Sopenharmony_ci}
16708c2ecf20Sopenharmony_ci
/*
 * fetch ctx->valid_pages pages from offset
 * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
 * However, if the offset is close to the end of the journal device,
 * ctx->valid_pages could be smaller than ctx->total_pages
 */
static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
				      struct r5l_recovery_ctx *ctx,
				      sector_t offset)
{
	bio_reset(ctx->ra_bio);
	bio_set_dev(ctx->ra_bio, log->rdev->bdev);
	bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
	ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;

	ctx->valid_pages = 0;
	ctx->pool_offset = offset;

	while (ctx->valid_pages < ctx->total_pages) {
		bio_add_page(ctx->ra_bio,
			     ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
		ctx->valid_pages += 1;

		offset = r5l_ring_add(log, offset, BLOCK_SECTORS);

		/* don't read across the ring wrap in one contiguous bio */
		if (offset == 0)  /* reached end of the device */
			break;
	}

	/* synchronous read; returns 0 on success or a negative errno */
	return submit_bio_wait(ctx->ra_bio);
}
17028c2ecf20Sopenharmony_ci
/*
 * try read a page from the read ahead page pool, if the page is not in the
 * pool, call r5l_recovery_fetch_ra_pool
 */
static int r5l_recovery_read_page(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx,
				  struct page *page,
				  sector_t offset)
{
	int ret;

	/* pool miss: refill the pool starting at the requested offset */
	if (offset < ctx->pool_offset ||
	    offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
		ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
		if (ret)
			return ret;
	}

	/* after a refill the requested offset must be inside the pool */
	BUG_ON(offset < ctx->pool_offset ||
	       offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);

	memcpy(page_address(page),
	       page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
					 BLOCK_SECTOR_SHIFT]),
	       PAGE_SIZE);
	return 0;
}
17308c2ecf20Sopenharmony_ci
/*
 * Read and validate the meta block at ctx->pos into ctx->meta_page.
 * Returns 0 on success, -EINVAL on any validation failure (wrong magic,
 * sequence, version, position, checksum, or oversized meta_size).
 */
static int r5l_recovery_read_meta_block(struct r5l_log *log,
					struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;
	int ret;

	ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
	if (ret != 0)
		return ret;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	/* checksum was computed with this field zeroed; zero it to re-check */
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	/* a meta block always occupies one 4k block in the log */
	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}
17648c2ecf20Sopenharmony_ci
17658c2ecf20Sopenharmony_cistatic void
17668c2ecf20Sopenharmony_cir5l_recovery_create_empty_meta_block(struct r5l_log *log,
17678c2ecf20Sopenharmony_ci				     struct page *page,
17688c2ecf20Sopenharmony_ci				     sector_t pos, u64 seq)
17698c2ecf20Sopenharmony_ci{
17708c2ecf20Sopenharmony_ci	struct r5l_meta_block *mb;
17718c2ecf20Sopenharmony_ci
17728c2ecf20Sopenharmony_ci	mb = page_address(page);
17738c2ecf20Sopenharmony_ci	clear_page(mb);
17748c2ecf20Sopenharmony_ci	mb->magic = cpu_to_le32(R5LOG_MAGIC);
17758c2ecf20Sopenharmony_ci	mb->version = R5LOG_VERSION;
17768c2ecf20Sopenharmony_ci	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
17778c2ecf20Sopenharmony_ci	mb->seq = cpu_to_le64(seq);
17788c2ecf20Sopenharmony_ci	mb->position = cpu_to_le64(pos);
17798c2ecf20Sopenharmony_ci}
17808c2ecf20Sopenharmony_ci
17818c2ecf20Sopenharmony_cistatic int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
17828c2ecf20Sopenharmony_ci					  u64 seq)
17838c2ecf20Sopenharmony_ci{
17848c2ecf20Sopenharmony_ci	struct page *page;
17858c2ecf20Sopenharmony_ci	struct r5l_meta_block *mb;
17868c2ecf20Sopenharmony_ci
17878c2ecf20Sopenharmony_ci	page = alloc_page(GFP_KERNEL);
17888c2ecf20Sopenharmony_ci	if (!page)
17898c2ecf20Sopenharmony_ci		return -ENOMEM;
17908c2ecf20Sopenharmony_ci	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
17918c2ecf20Sopenharmony_ci	mb = page_address(page);
17928c2ecf20Sopenharmony_ci	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
17938c2ecf20Sopenharmony_ci					     mb, PAGE_SIZE));
17948c2ecf20Sopenharmony_ci	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
17958c2ecf20Sopenharmony_ci			  REQ_SYNC | REQ_FUA, false)) {
17968c2ecf20Sopenharmony_ci		__free_page(page);
17978c2ecf20Sopenharmony_ci		return -EIO;
17988c2ecf20Sopenharmony_ci	}
17998c2ecf20Sopenharmony_ci	__free_page(page);
18008c2ecf20Sopenharmony_ci	return 0;
18018c2ecf20Sopenharmony_ci}
18028c2ecf20Sopenharmony_ci
18038c2ecf20Sopenharmony_ci/*
18048c2ecf20Sopenharmony_ci * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
18058c2ecf20Sopenharmony_ci * to mark valid (potentially not flushed) data in the journal.
18068c2ecf20Sopenharmony_ci *
18078c2ecf20Sopenharmony_ci * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
18088c2ecf20Sopenharmony_ci * so there should not be any mismatch here.
18098c2ecf20Sopenharmony_ci */
18108c2ecf20Sopenharmony_cistatic void r5l_recovery_load_data(struct r5l_log *log,
18118c2ecf20Sopenharmony_ci				   struct stripe_head *sh,
18128c2ecf20Sopenharmony_ci				   struct r5l_recovery_ctx *ctx,
18138c2ecf20Sopenharmony_ci				   struct r5l_payload_data_parity *payload,
18148c2ecf20Sopenharmony_ci				   sector_t log_offset)
18158c2ecf20Sopenharmony_ci{
18168c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
18178c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
18188c2ecf20Sopenharmony_ci	int dd_idx;
18198c2ecf20Sopenharmony_ci
18208c2ecf20Sopenharmony_ci	raid5_compute_sector(conf,
18218c2ecf20Sopenharmony_ci			     le64_to_cpu(payload->location), 0,
18228c2ecf20Sopenharmony_ci			     &dd_idx, sh);
18238c2ecf20Sopenharmony_ci	r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
18248c2ecf20Sopenharmony_ci	sh->dev[dd_idx].log_checksum =
18258c2ecf20Sopenharmony_ci		le32_to_cpu(payload->checksum[0]);
18268c2ecf20Sopenharmony_ci	ctx->meta_total_blocks += BLOCK_SECTORS;
18278c2ecf20Sopenharmony_ci
18288c2ecf20Sopenharmony_ci	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
18298c2ecf20Sopenharmony_ci	set_bit(STRIPE_R5C_CACHING, &sh->state);
18308c2ecf20Sopenharmony_ci}
18318c2ecf20Sopenharmony_ci
18328c2ecf20Sopenharmony_cistatic void r5l_recovery_load_parity(struct r5l_log *log,
18338c2ecf20Sopenharmony_ci				     struct stripe_head *sh,
18348c2ecf20Sopenharmony_ci				     struct r5l_recovery_ctx *ctx,
18358c2ecf20Sopenharmony_ci				     struct r5l_payload_data_parity *payload,
18368c2ecf20Sopenharmony_ci				     sector_t log_offset)
18378c2ecf20Sopenharmony_ci{
18388c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
18398c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
18408c2ecf20Sopenharmony_ci
18418c2ecf20Sopenharmony_ci	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
18428c2ecf20Sopenharmony_ci	r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
18438c2ecf20Sopenharmony_ci	sh->dev[sh->pd_idx].log_checksum =
18448c2ecf20Sopenharmony_ci		le32_to_cpu(payload->checksum[0]);
18458c2ecf20Sopenharmony_ci	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
18468c2ecf20Sopenharmony_ci
18478c2ecf20Sopenharmony_ci	if (sh->qd_idx >= 0) {
18488c2ecf20Sopenharmony_ci		r5l_recovery_read_page(
18498c2ecf20Sopenharmony_ci			log, ctx, sh->dev[sh->qd_idx].page,
18508c2ecf20Sopenharmony_ci			r5l_ring_add(log, log_offset, BLOCK_SECTORS));
18518c2ecf20Sopenharmony_ci		sh->dev[sh->qd_idx].log_checksum =
18528c2ecf20Sopenharmony_ci			le32_to_cpu(payload->checksum[1]);
18538c2ecf20Sopenharmony_ci		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
18548c2ecf20Sopenharmony_ci	}
18558c2ecf20Sopenharmony_ci	clear_bit(STRIPE_R5C_CACHING, &sh->state);
18568c2ecf20Sopenharmony_ci}
18578c2ecf20Sopenharmony_ci
18588c2ecf20Sopenharmony_cistatic void r5l_recovery_reset_stripe(struct stripe_head *sh)
18598c2ecf20Sopenharmony_ci{
18608c2ecf20Sopenharmony_ci	int i;
18618c2ecf20Sopenharmony_ci
18628c2ecf20Sopenharmony_ci	sh->state = 0;
18638c2ecf20Sopenharmony_ci	sh->log_start = MaxSector;
18648c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; )
18658c2ecf20Sopenharmony_ci		sh->dev[i].flags = 0;
18668c2ecf20Sopenharmony_ci}
18678c2ecf20Sopenharmony_ci
/*
 * Write a fully journaled stripe (data + parity) back to the member disks.
 *
 * Only stripes that carry at least one data block are replayed; each
 * marked block is written synchronously to its rdev and, when present,
 * to the replacement device.  The stripe is reset afterwards either way.
 */
static void
r5l_recovery_replay_one_stripe(struct r5conf *conf,
			       struct stripe_head *sh,
			       struct r5l_recovery_ctx *ctx)
{
	struct md_rdev *rdev, *rrdev;
	int disk_index;
	int data_count = 0;

	/* count data blocks the journal scan marked valid (skip P/Q) */
	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
			continue;
		data_count++;
	}

	/*
	 * stripes that only have parity must have been flushed
	 * before the crash that we are now recovering from, so
	 * there is nothing more to recover.
	 */
	if (data_count == 0)
		goto out;

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev) {
			/* pin the rdev before dropping the RCU lock for I/O */
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rdev, rdev->mddev);
			rcu_read_lock();
		}
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev) {
			atomic_inc(&rrdev->nr_pending);
			rcu_read_unlock();
			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
				     false);
			rdev_dec_pending(rrdev, rrdev->mddev);
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	ctx->data_parity_stripes++;
out:
	r5l_recovery_reset_stripe(sh);
}
19258c2ecf20Sopenharmony_ci
19268c2ecf20Sopenharmony_cistatic struct stripe_head *
19278c2ecf20Sopenharmony_cir5c_recovery_alloc_stripe(
19288c2ecf20Sopenharmony_ci		struct r5conf *conf,
19298c2ecf20Sopenharmony_ci		sector_t stripe_sect,
19308c2ecf20Sopenharmony_ci		int noblock)
19318c2ecf20Sopenharmony_ci{
19328c2ecf20Sopenharmony_ci	struct stripe_head *sh;
19338c2ecf20Sopenharmony_ci
19348c2ecf20Sopenharmony_ci	sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
19358c2ecf20Sopenharmony_ci	if (!sh)
19368c2ecf20Sopenharmony_ci		return NULL;  /* no more stripe available */
19378c2ecf20Sopenharmony_ci
19388c2ecf20Sopenharmony_ci	r5l_recovery_reset_stripe(sh);
19398c2ecf20Sopenharmony_ci
19408c2ecf20Sopenharmony_ci	return sh;
19418c2ecf20Sopenharmony_ci}
19428c2ecf20Sopenharmony_ci
19438c2ecf20Sopenharmony_cistatic struct stripe_head *
19448c2ecf20Sopenharmony_cir5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
19458c2ecf20Sopenharmony_ci{
19468c2ecf20Sopenharmony_ci	struct stripe_head *sh;
19478c2ecf20Sopenharmony_ci
19488c2ecf20Sopenharmony_ci	list_for_each_entry(sh, list, lru)
19498c2ecf20Sopenharmony_ci		if (sh->sector == sect)
19508c2ecf20Sopenharmony_ci			return sh;
19518c2ecf20Sopenharmony_ci	return NULL;
19528c2ecf20Sopenharmony_ci}
19538c2ecf20Sopenharmony_ci
19548c2ecf20Sopenharmony_cistatic void
19558c2ecf20Sopenharmony_cir5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
19568c2ecf20Sopenharmony_ci			  struct r5l_recovery_ctx *ctx)
19578c2ecf20Sopenharmony_ci{
19588c2ecf20Sopenharmony_ci	struct stripe_head *sh, *next;
19598c2ecf20Sopenharmony_ci
19608c2ecf20Sopenharmony_ci	list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
19618c2ecf20Sopenharmony_ci		r5l_recovery_reset_stripe(sh);
19628c2ecf20Sopenharmony_ci		list_del_init(&sh->lru);
19638c2ecf20Sopenharmony_ci		raid5_release_stripe(sh);
19648c2ecf20Sopenharmony_ci	}
19658c2ecf20Sopenharmony_ci}
19668c2ecf20Sopenharmony_ci
19678c2ecf20Sopenharmony_cistatic void
19688c2ecf20Sopenharmony_cir5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
19698c2ecf20Sopenharmony_ci			    struct r5l_recovery_ctx *ctx)
19708c2ecf20Sopenharmony_ci{
19718c2ecf20Sopenharmony_ci	struct stripe_head *sh, *next;
19728c2ecf20Sopenharmony_ci
19738c2ecf20Sopenharmony_ci	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
19748c2ecf20Sopenharmony_ci		if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
19758c2ecf20Sopenharmony_ci			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
19768c2ecf20Sopenharmony_ci			list_del_init(&sh->lru);
19778c2ecf20Sopenharmony_ci			raid5_release_stripe(sh);
19788c2ecf20Sopenharmony_ci		}
19798c2ecf20Sopenharmony_ci}
19808c2ecf20Sopenharmony_ci
19818c2ecf20Sopenharmony_ci/* if matches return 0; otherwise return -EINVAL */
19828c2ecf20Sopenharmony_cistatic int
19838c2ecf20Sopenharmony_cir5l_recovery_verify_data_checksum(struct r5l_log *log,
19848c2ecf20Sopenharmony_ci				  struct r5l_recovery_ctx *ctx,
19858c2ecf20Sopenharmony_ci				  struct page *page,
19868c2ecf20Sopenharmony_ci				  sector_t log_offset, __le32 log_checksum)
19878c2ecf20Sopenharmony_ci{
19888c2ecf20Sopenharmony_ci	void *addr;
19898c2ecf20Sopenharmony_ci	u32 checksum;
19908c2ecf20Sopenharmony_ci
19918c2ecf20Sopenharmony_ci	r5l_recovery_read_page(log, ctx, page, log_offset);
19928c2ecf20Sopenharmony_ci	addr = kmap_atomic(page);
19938c2ecf20Sopenharmony_ci	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
19948c2ecf20Sopenharmony_ci	kunmap_atomic(addr);
19958c2ecf20Sopenharmony_ci	return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
19968c2ecf20Sopenharmony_ci}
19978c2ecf20Sopenharmony_ci
19988c2ecf20Sopenharmony_ci/*
19998c2ecf20Sopenharmony_ci * before loading data to stripe cache, we need verify checksum for all data,
20008c2ecf20Sopenharmony_ci * if there is mismatch for any data page, we drop all data in the mata block
20018c2ecf20Sopenharmony_ci */
20028c2ecf20Sopenharmony_cistatic int
20038c2ecf20Sopenharmony_cir5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
20048c2ecf20Sopenharmony_ci					 struct r5l_recovery_ctx *ctx)
20058c2ecf20Sopenharmony_ci{
20068c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
20078c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
20088c2ecf20Sopenharmony_ci	struct r5l_meta_block *mb = page_address(ctx->meta_page);
20098c2ecf20Sopenharmony_ci	sector_t mb_offset = sizeof(struct r5l_meta_block);
20108c2ecf20Sopenharmony_ci	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
20118c2ecf20Sopenharmony_ci	struct page *page;
20128c2ecf20Sopenharmony_ci	struct r5l_payload_data_parity *payload;
20138c2ecf20Sopenharmony_ci	struct r5l_payload_flush *payload_flush;
20148c2ecf20Sopenharmony_ci
20158c2ecf20Sopenharmony_ci	page = alloc_page(GFP_KERNEL);
20168c2ecf20Sopenharmony_ci	if (!page)
20178c2ecf20Sopenharmony_ci		return -ENOMEM;
20188c2ecf20Sopenharmony_ci
20198c2ecf20Sopenharmony_ci	while (mb_offset < le32_to_cpu(mb->meta_size)) {
20208c2ecf20Sopenharmony_ci		payload = (void *)mb + mb_offset;
20218c2ecf20Sopenharmony_ci		payload_flush = (void *)mb + mb_offset;
20228c2ecf20Sopenharmony_ci
20238c2ecf20Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
20248c2ecf20Sopenharmony_ci			if (r5l_recovery_verify_data_checksum(
20258c2ecf20Sopenharmony_ci				    log, ctx, page, log_offset,
20268c2ecf20Sopenharmony_ci				    payload->checksum[0]) < 0)
20278c2ecf20Sopenharmony_ci				goto mismatch;
20288c2ecf20Sopenharmony_ci		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
20298c2ecf20Sopenharmony_ci			if (r5l_recovery_verify_data_checksum(
20308c2ecf20Sopenharmony_ci				    log, ctx, page, log_offset,
20318c2ecf20Sopenharmony_ci				    payload->checksum[0]) < 0)
20328c2ecf20Sopenharmony_ci				goto mismatch;
20338c2ecf20Sopenharmony_ci			if (conf->max_degraded == 2 && /* q for RAID 6 */
20348c2ecf20Sopenharmony_ci			    r5l_recovery_verify_data_checksum(
20358c2ecf20Sopenharmony_ci				    log, ctx, page,
20368c2ecf20Sopenharmony_ci				    r5l_ring_add(log, log_offset,
20378c2ecf20Sopenharmony_ci						 BLOCK_SECTORS),
20388c2ecf20Sopenharmony_ci				    payload->checksum[1]) < 0)
20398c2ecf20Sopenharmony_ci				goto mismatch;
20408c2ecf20Sopenharmony_ci		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
20418c2ecf20Sopenharmony_ci			/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
20428c2ecf20Sopenharmony_ci		} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
20438c2ecf20Sopenharmony_ci			goto mismatch;
20448c2ecf20Sopenharmony_ci
20458c2ecf20Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
20468c2ecf20Sopenharmony_ci			mb_offset += sizeof(struct r5l_payload_flush) +
20478c2ecf20Sopenharmony_ci				le32_to_cpu(payload_flush->size);
20488c2ecf20Sopenharmony_ci		} else {
20498c2ecf20Sopenharmony_ci			/* DATA or PARITY payload */
20508c2ecf20Sopenharmony_ci			log_offset = r5l_ring_add(log, log_offset,
20518c2ecf20Sopenharmony_ci						  le32_to_cpu(payload->size));
20528c2ecf20Sopenharmony_ci			mb_offset += sizeof(struct r5l_payload_data_parity) +
20538c2ecf20Sopenharmony_ci				sizeof(__le32) *
20548c2ecf20Sopenharmony_ci				(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
20558c2ecf20Sopenharmony_ci		}
20568c2ecf20Sopenharmony_ci
20578c2ecf20Sopenharmony_ci	}
20588c2ecf20Sopenharmony_ci
20598c2ecf20Sopenharmony_ci	put_page(page);
20608c2ecf20Sopenharmony_ci	return 0;
20618c2ecf20Sopenharmony_ci
20628c2ecf20Sopenharmony_cimismatch:
20638c2ecf20Sopenharmony_ci	put_page(page);
20648c2ecf20Sopenharmony_ci	return -EINVAL;
20658c2ecf20Sopenharmony_ci}
20668c2ecf20Sopenharmony_ci
/*
 * Analyze all data/parity pages in one meta block
 * Returns:
 * 0 for success
 * -EINVAL for unknown payload type
 * -EAGAIN for checksum mismatch of data page
 * -ENOMEM for run out of memory (alloc_page failed or run out of stripes)
 */
static int
r5c_recovery_analyze_meta_block(struct r5l_log *log,
				struct r5l_recovery_ctx *ctx,
				struct list_head *cached_stripe_list)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct r5l_meta_block *mb;
	struct r5l_payload_data_parity *payload;
	struct r5l_payload_flush *payload_flush;
	int mb_offset;
	sector_t log_offset;
	sector_t stripe_sect;
	struct stripe_head *sh;
	int ret;

	/*
	 * for mismatch in data blocks, we will drop all data in this mb, but
	 * we will still read next mb for other data with FLUSH flag, as
	 * io_unit could finish out of order.
	 */
	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
	if (ret == -EINVAL)
		return -EAGAIN;
	else if (ret)
		return ret;   /* -ENOMEM due to alloc_page() failure */

	mb = page_address(ctx->meta_page);
	mb_offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	/* walk every payload descriptor in this meta block */
	while (mb_offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + mb_offset;
		payload_flush = (void *)mb + mb_offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
			/*
			 * FLUSH payload: the listed stripes already hit the
			 * array, so drop any cached state for them.
			 */
			int i, count;

			count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
			for (i = 0; i < count; ++i) {
				stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
				sh = r5c_recovery_lookup_stripe(cached_stripe_list,
								stripe_sect);
				if (sh) {
					WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
					r5l_recovery_reset_stripe(sh);
					list_del_init(&sh->lru);
					raid5_release_stripe(sh);
				}
			}

			mb_offset += sizeof(struct r5l_payload_flush) +
				le32_to_cpu(payload_flush->size);
			continue;
		}

		/* DATA or PARITY payload */
		stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
			raid5_compute_sector(
				conf, le64_to_cpu(payload->location), 0, &dd,
				NULL)
			: le64_to_cpu(payload->location);

		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
						stripe_sect);

		if (!sh) {
			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
			/*
			 * cannot get stripe from raid5_get_active_stripe
			 * try replay some stripes
			 */
			if (!sh) {
				r5c_recovery_replay_stripes(
					cached_stripe_list, ctx);
				sh = r5c_recovery_alloc_stripe(
					conf, stripe_sect, 1);
			}
			if (!sh) {
				/* still none: double the stripe cache and retry */
				int new_size = conf->min_nr_stripes * 2;
				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
					mdname(mddev),
					new_size);
				ret = raid5_set_cache_size(mddev, new_size);
				if (conf->min_nr_stripes <= new_size / 2) {
					pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
						mdname(mddev),
						ret,
						new_size,
						conf->min_nr_stripes,
						conf->max_nr_stripes);
					return -ENOMEM;
				}
				/* cache grew: this allocation may block */
				sh = r5c_recovery_alloc_stripe(
					conf, stripe_sect, 0);
			}
			if (!sh) {
				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
					mdname(mddev));
				return -ENOMEM;
			}
			list_add_tail(&sh->lru, cached_stripe_list);
		}

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
				/*
				 * data seen after parity: the cached image was
				 * already flushed to the array, so replay it
				 * and start a fresh epoch for this stripe.
				 */
				r5l_recovery_replay_one_stripe(conf, sh, ctx);
				list_move_tail(&sh->lru, cached_stripe_list);
			}
			r5l_recovery_load_data(log, sh, ctx, payload,
					       log_offset);
		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			r5l_recovery_load_parity(log, sh, ctx, payload,
						 log_offset);
		else
			return -EINVAL;

		log_offset = r5l_ring_add(log, log_offset,
					  le32_to_cpu(payload->size));

		mb_offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
	}

	return 0;
}
22058c2ecf20Sopenharmony_ci
22068c2ecf20Sopenharmony_ci/*
22078c2ecf20Sopenharmony_ci * Load the stripe into cache. The stripe will be written out later by
22088c2ecf20Sopenharmony_ci * the stripe cache state machine.
22098c2ecf20Sopenharmony_ci */
22108c2ecf20Sopenharmony_cistatic void r5c_recovery_load_one_stripe(struct r5l_log *log,
22118c2ecf20Sopenharmony_ci					 struct stripe_head *sh)
22128c2ecf20Sopenharmony_ci{
22138c2ecf20Sopenharmony_ci	struct r5dev *dev;
22148c2ecf20Sopenharmony_ci	int i;
22158c2ecf20Sopenharmony_ci
22168c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; ) {
22178c2ecf20Sopenharmony_ci		dev = sh->dev + i;
22188c2ecf20Sopenharmony_ci		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
22198c2ecf20Sopenharmony_ci			set_bit(R5_InJournal, &dev->flags);
22208c2ecf20Sopenharmony_ci			set_bit(R5_UPTODATE, &dev->flags);
22218c2ecf20Sopenharmony_ci		}
22228c2ecf20Sopenharmony_ci	}
22238c2ecf20Sopenharmony_ci}
22248c2ecf20Sopenharmony_ci
22258c2ecf20Sopenharmony_ci/*
22268c2ecf20Sopenharmony_ci * Scan through the log for all to-be-flushed data
22278c2ecf20Sopenharmony_ci *
22288c2ecf20Sopenharmony_ci * For stripes with data and parity, namely Data-Parity stripe
22298c2ecf20Sopenharmony_ci * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
22308c2ecf20Sopenharmony_ci *
22318c2ecf20Sopenharmony_ci * For stripes with only data, namely Data-Only stripe
22328c2ecf20Sopenharmony_ci * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
22338c2ecf20Sopenharmony_ci *
22348c2ecf20Sopenharmony_ci * For a stripe, if we see data after parity, we should discard all previous
22358c2ecf20Sopenharmony_ci * data and parity for this stripe, as these data are already flushed to
22368c2ecf20Sopenharmony_ci * the array.
22378c2ecf20Sopenharmony_ci *
22388c2ecf20Sopenharmony_ci * At the end of the scan, we return the new journal_tail, which points to
22398c2ecf20Sopenharmony_ci * first data-only stripe on the journal device, or next invalid meta block.
22408c2ecf20Sopenharmony_ci */
22418c2ecf20Sopenharmony_cistatic int r5c_recovery_flush_log(struct r5l_log *log,
22428c2ecf20Sopenharmony_ci				  struct r5l_recovery_ctx *ctx)
22438c2ecf20Sopenharmony_ci{
22448c2ecf20Sopenharmony_ci	struct stripe_head *sh;
22458c2ecf20Sopenharmony_ci	int ret = 0;
22468c2ecf20Sopenharmony_ci
22478c2ecf20Sopenharmony_ci	/* scan through the log */
22488c2ecf20Sopenharmony_ci	while (1) {
22498c2ecf20Sopenharmony_ci		if (r5l_recovery_read_meta_block(log, ctx))
22508c2ecf20Sopenharmony_ci			break;
22518c2ecf20Sopenharmony_ci
22528c2ecf20Sopenharmony_ci		ret = r5c_recovery_analyze_meta_block(log, ctx,
22538c2ecf20Sopenharmony_ci						      &ctx->cached_list);
22548c2ecf20Sopenharmony_ci		/*
22558c2ecf20Sopenharmony_ci		 * -EAGAIN means mismatch in data block, in this case, we still
22568c2ecf20Sopenharmony_ci		 * try scan the next metablock
22578c2ecf20Sopenharmony_ci		 */
22588c2ecf20Sopenharmony_ci		if (ret && ret != -EAGAIN)
22598c2ecf20Sopenharmony_ci			break;   /* ret == -EINVAL or -ENOMEM */
22608c2ecf20Sopenharmony_ci		ctx->seq++;
22618c2ecf20Sopenharmony_ci		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
22628c2ecf20Sopenharmony_ci	}
22638c2ecf20Sopenharmony_ci
22648c2ecf20Sopenharmony_ci	if (ret == -ENOMEM) {
22658c2ecf20Sopenharmony_ci		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
22668c2ecf20Sopenharmony_ci		return ret;
22678c2ecf20Sopenharmony_ci	}
22688c2ecf20Sopenharmony_ci
22698c2ecf20Sopenharmony_ci	/* replay data-parity stripes */
22708c2ecf20Sopenharmony_ci	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
22718c2ecf20Sopenharmony_ci
22728c2ecf20Sopenharmony_ci	/* load data-only stripes to stripe cache */
22738c2ecf20Sopenharmony_ci	list_for_each_entry(sh, &ctx->cached_list, lru) {
22748c2ecf20Sopenharmony_ci		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
22758c2ecf20Sopenharmony_ci		r5c_recovery_load_one_stripe(log, sh);
22768c2ecf20Sopenharmony_ci		ctx->data_only_stripes++;
22778c2ecf20Sopenharmony_ci	}
22788c2ecf20Sopenharmony_ci
22798c2ecf20Sopenharmony_ci	return 0;
22808c2ecf20Sopenharmony_ci}
22818c2ecf20Sopenharmony_ci
/*
 * We did a recovery. Now ctx.pos points to an invalid meta block. A new
 * log will start there, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid, and meta 3 could be valid. If the
 * superblock points to meta 1 and we write a new valid meta 2n, then if a
 * crash happens again, the new recovery will start from meta 1. Since
 * meta 2n is valid now, recovery would think meta 3 is valid too, which
 * is wrong. The solution is to create a new meta block at meta 2's
 * position with seq == meta 1's seq + 10000 and point the superblock at
 * it. The same recovery will then reject meta 3, because its seq doesn't
 * match.
 */
22958c2ecf20Sopenharmony_ci
22968c2ecf20Sopenharmony_ci/*
22978c2ecf20Sopenharmony_ci * Before recovery, the log looks like the following
22988c2ecf20Sopenharmony_ci *
22998c2ecf20Sopenharmony_ci *   ---------------------------------------------
23008c2ecf20Sopenharmony_ci *   |           valid log        | invalid log  |
23018c2ecf20Sopenharmony_ci *   ---------------------------------------------
23028c2ecf20Sopenharmony_ci *   ^
23038c2ecf20Sopenharmony_ci *   |- log->last_checkpoint
23048c2ecf20Sopenharmony_ci *   |- log->last_cp_seq
23058c2ecf20Sopenharmony_ci *
23068c2ecf20Sopenharmony_ci * Now we scan through the log until we see invalid entry
23078c2ecf20Sopenharmony_ci *
23088c2ecf20Sopenharmony_ci *   ---------------------------------------------
23098c2ecf20Sopenharmony_ci *   |           valid log        | invalid log  |
23108c2ecf20Sopenharmony_ci *   ---------------------------------------------
23118c2ecf20Sopenharmony_ci *   ^                            ^
23128c2ecf20Sopenharmony_ci *   |- log->last_checkpoint      |- ctx->pos
23138c2ecf20Sopenharmony_ci *   |- log->last_cp_seq          |- ctx->seq
23148c2ecf20Sopenharmony_ci *
23158c2ecf20Sopenharmony_ci * From this point, we need to increase seq number by 10 to avoid
23168c2ecf20Sopenharmony_ci * confusing next recovery.
23178c2ecf20Sopenharmony_ci *
23188c2ecf20Sopenharmony_ci *   ---------------------------------------------
23198c2ecf20Sopenharmony_ci *   |           valid log        | invalid log  |
23208c2ecf20Sopenharmony_ci *   ---------------------------------------------
23218c2ecf20Sopenharmony_ci *   ^                              ^
23228c2ecf20Sopenharmony_ci *   |- log->last_checkpoint        |- ctx->pos+1
23238c2ecf20Sopenharmony_ci *   |- log->last_cp_seq            |- ctx->seq+10001
23248c2ecf20Sopenharmony_ci *
 * However, it is not safe to start the state machine yet, because the
 * data-only stripes are not yet secured in RAID. To secure these data-only
 * stripes, we rewrite them to the journal starting from seq+10001.
23288c2ecf20Sopenharmony_ci *
23298c2ecf20Sopenharmony_ci *   -----------------------------------------------------------------
23308c2ecf20Sopenharmony_ci *   |           valid log        | data only stripes | invalid log  |
23318c2ecf20Sopenharmony_ci *   -----------------------------------------------------------------
23328c2ecf20Sopenharmony_ci *   ^                                                ^
23338c2ecf20Sopenharmony_ci *   |- log->last_checkpoint                          |- ctx->pos+n
23348c2ecf20Sopenharmony_ci *   |- log->last_cp_seq                              |- ctx->seq+10000+n
23358c2ecf20Sopenharmony_ci *
 * If failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
23388c2ecf20Sopenharmony_ci *
23398c2ecf20Sopenharmony_ci * Once data only stripes are rewritten to journal, we move log_tail
23408c2ecf20Sopenharmony_ci *
23418c2ecf20Sopenharmony_ci *   -----------------------------------------------------------------
23428c2ecf20Sopenharmony_ci *   |     old log        |    data only stripes    | invalid log  |
23438c2ecf20Sopenharmony_ci *   -----------------------------------------------------------------
23448c2ecf20Sopenharmony_ci *                        ^                         ^
23458c2ecf20Sopenharmony_ci *                        |- log->last_checkpoint   |- ctx->pos+n
23468c2ecf20Sopenharmony_ci *                        |- log->last_cp_seq       |- ctx->seq+10000+n
23478c2ecf20Sopenharmony_ci *
23488c2ecf20Sopenharmony_ci * Then we can safely start the state machine. If failure happens from this
23498c2ecf20Sopenharmony_ci * point on, the recovery will start from new log->last_checkpoint.
23508c2ecf20Sopenharmony_ci */
/*
 * Rewrite the data-only stripes collected on ctx->cached_list back into the
 * journal, one meta block plus the stripe's in-journal data pages per stripe,
 * starting at ctx->pos / ctx->seq.  This re-secures data whose parity has not
 * reached the RAID disks yet, so a crash during recovery can replay it again
 * from log->last_checkpoint.
 *
 * On success log->next_checkpoint is left at the log_start of the last stripe
 * rewritten, and ctx->pos / ctx->seq point past the rewritten region.
 *
 * Returns 0 on success, -ENOMEM if the scratch meta page cannot be allocated.
 */
static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	struct mddev *mddev = log->rdev->mddev;
	struct page *page;		/* scratch page for building meta blocks */
	sector_t next_checkpoint = MaxSector;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
		       mdname(mddev));
		return -ENOMEM;
	}

	/* callers only invoke this when data-only stripes were found */
	WARN_ON(list_empty(&ctx->cached_list));

	list_for_each_entry(sh, &ctx->cached_list, lru) {
		struct r5l_meta_block *mb;
		int i;
		int offset;
		sector_t write_pos;

		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5l_recovery_create_empty_meta_block(log, page,
						     ctx->pos, ctx->seq);
		mb = page_address(page);
		/* payloads are appended after the fixed meta header */
		offset = le32_to_cpu(mb->meta_size);
		/* data pages go into the blocks right after the meta block */
		write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

		for (i = sh->disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			struct r5l_payload_data_parity *payload;
			void *addr;

			if (test_bit(R5_InJournal, &dev->flags)) {
				/*
				 * Append a DATA payload describing this page
				 * and write the page itself synchronously.
				 */
				payload = (void *)mb + offset;
				payload->header.type = cpu_to_le16(
					R5LOG_PAYLOAD_DATA);
				payload->size = cpu_to_le32(BLOCK_SECTORS);
				payload->location = cpu_to_le64(
					raid5_compute_blocknr(sh, i, 0));
				addr = kmap_atomic(dev->page);
				payload->checksum[0] = cpu_to_le32(
					crc32c_le(log->uuid_checksum, addr,
						  PAGE_SIZE));
				kunmap_atomic(addr);
				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
					     dev->page, REQ_OP_WRITE, 0, false);
				write_pos = r5l_ring_add(log, write_pos,
							 BLOCK_SECTORS);
				offset += sizeof(__le32) +
					sizeof(struct r5l_payload_data_parity);

			}
		}
		/*
		 * Finalize and checksum the meta block; write it with
		 * REQ_FUA only after all of its data pages are on disk,
		 * so a torn rewrite is never seen as valid.
		 */
		mb->meta_size = cpu_to_le32(offset);
		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
						     mb, PAGE_SIZE));
		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
			     REQ_OP_WRITE, REQ_SYNC | REQ_FUA, false);
		/* the stripe now lives in the journal at the old ctx->pos */
		sh->log_start = ctx->pos;
		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
		atomic_inc(&log->stripe_in_journal_count);
		ctx->pos = write_pos;
		ctx->seq += 1;
		next_checkpoint = sh->log_start;
	}
	log->next_checkpoint = next_checkpoint;
	__free_page(page);
	return 0;
}
24248c2ecf20Sopenharmony_ci
/*
 * Push the recovered data-only stripes (ctx->cached_list) out to the RAID
 * disks and wait until they are all written.  No-op when there are none.
 */
static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
						 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct stripe_head *sh, *next;
	bool cleared_pending = false;

	if (ctx->data_only_stripes == 0)
		return;

	/*
	 * Temporarily drop MD_SB_CHANGE_PENDING for the duration of the
	 * flush (presumably so the write-out below is not held back by a
	 * pending superblock update); it is restored at the end.
	 */
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		cleared_pending = true;
		clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
	}
	/* run the flush in write-back mode, switched back below */
	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;

	/* hand every cached stripe to the state machine for write-out */
	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
	}

	/* reuse conf->wait_for_quiescent in recovery */
	wait_event(conf->wait_for_quiescent,
		   atomic_read(&conf->active_stripes) == 0);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	if (cleared_pending)
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
}
24578c2ecf20Sopenharmony_ci
/*
 * Journal recovery entry point: scan the log from log->last_checkpoint,
 * replay it, rewrite any data-only stripes back into the journal (or write
 * a single empty meta block after a clean shutdown), update the superblock
 * checkpoint, and finally flush the data-only stripes to the RAID disks.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EIO if the stripe
 * rewrite fails, or the error from r5c_recovery_flush_log().
 */
static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5l_recovery_ctx *ctx;
	int ret;
	sector_t pos;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	/* start scanning at the last known-good checkpoint */
	ctx->pos = log->last_checkpoint;
	ctx->seq = log->last_cp_seq;
	INIT_LIST_HEAD(&ctx->cached_list);
	ctx->meta_page = alloc_page(GFP_KERNEL);

	if (!ctx->meta_page) {
		ret =  -ENOMEM;
		goto meta_page;
	}

	if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
		ret = -ENOMEM;
		goto ra_pool;
	}

	ret = r5c_recovery_flush_log(log, ctx);

	if (ret)
		goto error;

	/*
	 * Remember where the valid log ended; bump seq by 10000 so stale
	 * meta blocks beyond this point can never look valid again.
	 */
	pos = ctx->pos;
	ctx->seq += 10000;

	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
		pr_info("md/raid:%s: starting from clean shutdown\n",
			 mdname(mddev));
	else
		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
			 mdname(mddev), ctx->data_only_stripes,
			 ctx->data_parity_stripes);

	if (ctx->data_only_stripes == 0) {
		/* nothing to rewrite: just terminate the log cleanly */
		log->next_checkpoint = ctx->pos;
		r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
		ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
	} else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
		       mdname(mddev));
		ret =  -EIO;
		goto error;
	}

	/* publish the new log head/tail and checkpoint in the superblock */
	log->log_start = ctx->pos;
	log->seq = ctx->seq;
	log->last_checkpoint = pos;
	r5l_write_super(log, pos);

	r5c_recovery_flush_data_only_stripes(log, ctx);
	ret = 0;
error:
	r5l_recovery_free_ra_pool(log, ctx);
ra_pool:
	__free_page(ctx->meta_page);
meta_page:
	kfree(ctx);
	return ret;
}
25268c2ecf20Sopenharmony_ci
25278c2ecf20Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp)
25288c2ecf20Sopenharmony_ci{
25298c2ecf20Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
25308c2ecf20Sopenharmony_ci
25318c2ecf20Sopenharmony_ci	log->rdev->journal_tail = cp;
25328c2ecf20Sopenharmony_ci	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
25338c2ecf20Sopenharmony_ci}
25348c2ecf20Sopenharmony_ci
25358c2ecf20Sopenharmony_cistatic ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
25368c2ecf20Sopenharmony_ci{
25378c2ecf20Sopenharmony_ci	struct r5conf *conf;
25388c2ecf20Sopenharmony_ci	int ret;
25398c2ecf20Sopenharmony_ci
25408c2ecf20Sopenharmony_ci	spin_lock(&mddev->lock);
25418c2ecf20Sopenharmony_ci	conf = mddev->private;
25428c2ecf20Sopenharmony_ci	if (!conf || !conf->log) {
25438c2ecf20Sopenharmony_ci		spin_unlock(&mddev->lock);
25448c2ecf20Sopenharmony_ci		return 0;
25458c2ecf20Sopenharmony_ci	}
25468c2ecf20Sopenharmony_ci
25478c2ecf20Sopenharmony_ci	switch (conf->log->r5c_journal_mode) {
25488c2ecf20Sopenharmony_ci	case R5C_JOURNAL_MODE_WRITE_THROUGH:
25498c2ecf20Sopenharmony_ci		ret = snprintf(
25508c2ecf20Sopenharmony_ci			page, PAGE_SIZE, "[%s] %s\n",
25518c2ecf20Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
25528c2ecf20Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
25538c2ecf20Sopenharmony_ci		break;
25548c2ecf20Sopenharmony_ci	case R5C_JOURNAL_MODE_WRITE_BACK:
25558c2ecf20Sopenharmony_ci		ret = snprintf(
25568c2ecf20Sopenharmony_ci			page, PAGE_SIZE, "%s [%s]\n",
25578c2ecf20Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
25588c2ecf20Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
25598c2ecf20Sopenharmony_ci		break;
25608c2ecf20Sopenharmony_ci	default:
25618c2ecf20Sopenharmony_ci		ret = 0;
25628c2ecf20Sopenharmony_ci	}
25638c2ecf20Sopenharmony_ci	spin_unlock(&mddev->lock);
25648c2ecf20Sopenharmony_ci	return ret;
25658c2ecf20Sopenharmony_ci}
25668c2ecf20Sopenharmony_ci
25678c2ecf20Sopenharmony_ci/*
25688c2ecf20Sopenharmony_ci * Set journal cache mode on @mddev (external API initially needed by dm-raid).
25698c2ecf20Sopenharmony_ci *
25708c2ecf20Sopenharmony_ci * @mode as defined in 'enum r5c_journal_mode'.
25718c2ecf20Sopenharmony_ci *
25728c2ecf20Sopenharmony_ci */
25738c2ecf20Sopenharmony_ciint r5c_journal_mode_set(struct mddev *mddev, int mode)
25748c2ecf20Sopenharmony_ci{
25758c2ecf20Sopenharmony_ci	struct r5conf *conf;
25768c2ecf20Sopenharmony_ci
25778c2ecf20Sopenharmony_ci	if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
25788c2ecf20Sopenharmony_ci	    mode > R5C_JOURNAL_MODE_WRITE_BACK)
25798c2ecf20Sopenharmony_ci		return -EINVAL;
25808c2ecf20Sopenharmony_ci
25818c2ecf20Sopenharmony_ci	conf = mddev->private;
25828c2ecf20Sopenharmony_ci	if (!conf || !conf->log)
25838c2ecf20Sopenharmony_ci		return -ENODEV;
25848c2ecf20Sopenharmony_ci
25858c2ecf20Sopenharmony_ci	if (raid5_calc_degraded(conf) > 0 &&
25868c2ecf20Sopenharmony_ci	    mode == R5C_JOURNAL_MODE_WRITE_BACK)
25878c2ecf20Sopenharmony_ci		return -EINVAL;
25888c2ecf20Sopenharmony_ci
25898c2ecf20Sopenharmony_ci	mddev_suspend(mddev);
25908c2ecf20Sopenharmony_ci	conf->log->r5c_journal_mode = mode;
25918c2ecf20Sopenharmony_ci	mddev_resume(mddev);
25928c2ecf20Sopenharmony_ci
25938c2ecf20Sopenharmony_ci	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
25948c2ecf20Sopenharmony_ci		 mdname(mddev), mode, r5c_journal_mode_str[mode]);
25958c2ecf20Sopenharmony_ci	return 0;
25968c2ecf20Sopenharmony_ci}
25978c2ecf20Sopenharmony_ciEXPORT_SYMBOL(r5c_journal_mode_set);
25988c2ecf20Sopenharmony_ci
25998c2ecf20Sopenharmony_cistatic ssize_t r5c_journal_mode_store(struct mddev *mddev,
26008c2ecf20Sopenharmony_ci				      const char *page, size_t length)
26018c2ecf20Sopenharmony_ci{
26028c2ecf20Sopenharmony_ci	int mode = ARRAY_SIZE(r5c_journal_mode_str);
26038c2ecf20Sopenharmony_ci	size_t len = length;
26048c2ecf20Sopenharmony_ci	int ret;
26058c2ecf20Sopenharmony_ci
26068c2ecf20Sopenharmony_ci	if (len < 2)
26078c2ecf20Sopenharmony_ci		return -EINVAL;
26088c2ecf20Sopenharmony_ci
26098c2ecf20Sopenharmony_ci	if (page[len - 1] == '\n')
26108c2ecf20Sopenharmony_ci		len--;
26118c2ecf20Sopenharmony_ci
26128c2ecf20Sopenharmony_ci	while (mode--)
26138c2ecf20Sopenharmony_ci		if (strlen(r5c_journal_mode_str[mode]) == len &&
26148c2ecf20Sopenharmony_ci		    !strncmp(page, r5c_journal_mode_str[mode], len))
26158c2ecf20Sopenharmony_ci			break;
26168c2ecf20Sopenharmony_ci	ret = mddev_lock(mddev);
26178c2ecf20Sopenharmony_ci	if (ret)
26188c2ecf20Sopenharmony_ci		return ret;
26198c2ecf20Sopenharmony_ci	ret = r5c_journal_mode_set(mddev, mode);
26208c2ecf20Sopenharmony_ci	mddev_unlock(mddev);
26218c2ecf20Sopenharmony_ci	return ret ?: length;
26228c2ecf20Sopenharmony_ci}
26238c2ecf20Sopenharmony_ci
/* sysfs attribute "journal_mode" (0644), wired to the show/store handlers */
struct md_sysfs_entry
r5c_journal_mode = __ATTR(journal_mode, 0644,
			  r5c_journal_mode_show, r5c_journal_mode_store);
26278c2ecf20Sopenharmony_ci
26288c2ecf20Sopenharmony_ci/*
26298c2ecf20Sopenharmony_ci * Try handle write operation in caching phase. This function should only
26308c2ecf20Sopenharmony_ci * be called in write-back mode.
26318c2ecf20Sopenharmony_ci *
26328c2ecf20Sopenharmony_ci * If all outstanding writes can be handled in caching phase, returns 0
26338c2ecf20Sopenharmony_ci * If writes requires write-out phase, call r5c_make_stripe_write_out()
26348c2ecf20Sopenharmony_ci * and returns -EAGAIN
26358c2ecf20Sopenharmony_ci */
int r5c_try_caching_write(struct r5conf *conf,
			  struct stripe_head *sh,
			  struct stripe_head_state *s,
			  int disks)
{
	struct r5l_log *log = conf->log;
	int i;
	struct r5dev *dev;
	int to_cache = 0;	/* number of devices with writes to cache */
	void **pslot;
	sector_t tree_index;
	int ret;
	uintptr_t refcount;

	BUG_ON(!r5c_is_writeback(log));

	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 *  1. The stripe has some data cached, and it is sent to
		 *     write-out phase for reclaim
		 *  2. The stripe is clean, and this is the first write
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with caching
		 * write.
		 */

		/* case 1: anything injournal or anything in written */
		if (s->injournal > 0 || s->written > 0)
			return -EAGAIN;
		/* case 2 */
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	/*
	 * When run in degraded mode, array is set to write-through mode.
	 * This check helps drain pending write safely in the transition to
	 * write-through mode.
	 *
	 * When a stripe is syncing, the write is also handled in write
	 * through mode.
	 */
	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
		r5c_make_stripe_write_out(sh);
		return -EAGAIN;
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use writing-out phase */
		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
		    !test_bit(R5_InJournal, &dev->flags)) {
			r5c_make_stripe_write_out(sh);
			return -EAGAIN;
		}
	}

	/* if the stripe is not counted in big_stripe_tree, add it now */
	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
	    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		tree_index = r5c_tree_index(conf, sh->sector);
		spin_lock(&log->tree_lock);
		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
					       tree_index);
		if (pslot) {
			/*
			 * Slot exists: the refcount lives in the upper bits
			 * of the slot value (above R5C_RADIX_COUNT_SHIFT);
			 * bump it by one.
			 */
			refcount = (uintptr_t)radix_tree_deref_slot_protected(
				pslot, &log->tree_lock) >>
				R5C_RADIX_COUNT_SHIFT;
			radix_tree_replace_slot(
				&log->big_stripe_tree, pslot,
				(void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
		} else {
			/*
			 * this radix_tree_insert can fail safely, so no
			 * need to call radix_tree_preload()
			 */
			ret = radix_tree_insert(
				&log->big_stripe_tree, tree_index,
				(void *)(1 << R5C_RADIX_COUNT_SHIFT));
			if (ret) {
				/* fall back to write-out on insert failure */
				spin_unlock(&log->tree_lock);
				r5c_make_stripe_write_out(sh);
				return -EAGAIN;
			}
		}
		spin_unlock(&log->tree_lock);

		/*
		 * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
		 * counted in the radix tree
		 */
		set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
		atomic_inc(&conf->r5c_cached_partial_stripes);
	}

	/* lock every device with pending writes and mark it for draining */
	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->towrite) {
			set_bit(R5_Wantwrite, &dev->flags);
			set_bit(R5_Wantdrain, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			to_cache++;
		}
	}

	if (to_cache) {
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		/*
		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
		 * r5c_handle_data_cached()
		 */
		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	}

	return 0;
}
27568c2ecf20Sopenharmony_ci
27578c2ecf20Sopenharmony_ci/*
27588c2ecf20Sopenharmony_ci * free extra pages (orig_page) we allocated for prexor
27598c2ecf20Sopenharmony_ci */
27608c2ecf20Sopenharmony_civoid r5c_release_extra_page(struct stripe_head *sh)
27618c2ecf20Sopenharmony_ci{
27628c2ecf20Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
27638c2ecf20Sopenharmony_ci	int i;
27648c2ecf20Sopenharmony_ci	bool using_disk_info_extra_page;
27658c2ecf20Sopenharmony_ci
27668c2ecf20Sopenharmony_ci	using_disk_info_extra_page =
27678c2ecf20Sopenharmony_ci		sh->dev[0].orig_page == conf->disks[0].extra_page;
27688c2ecf20Sopenharmony_ci
27698c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; )
27708c2ecf20Sopenharmony_ci		if (sh->dev[i].page != sh->dev[i].orig_page) {
27718c2ecf20Sopenharmony_ci			struct page *p = sh->dev[i].orig_page;
27728c2ecf20Sopenharmony_ci
27738c2ecf20Sopenharmony_ci			sh->dev[i].orig_page = sh->dev[i].page;
27748c2ecf20Sopenharmony_ci			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
27758c2ecf20Sopenharmony_ci
27768c2ecf20Sopenharmony_ci			if (!using_disk_info_extra_page)
27778c2ecf20Sopenharmony_ci				put_page(p);
27788c2ecf20Sopenharmony_ci		}
27798c2ecf20Sopenharmony_ci
27808c2ecf20Sopenharmony_ci	if (using_disk_info_extra_page) {
27818c2ecf20Sopenharmony_ci		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
27828c2ecf20Sopenharmony_ci		md_wakeup_thread(conf->mddev->thread);
27838c2ecf20Sopenharmony_ci	}
27848c2ecf20Sopenharmony_ci}
27858c2ecf20Sopenharmony_ci
27868c2ecf20Sopenharmony_civoid r5c_use_extra_page(struct stripe_head *sh)
27878c2ecf20Sopenharmony_ci{
27888c2ecf20Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
27898c2ecf20Sopenharmony_ci	int i;
27908c2ecf20Sopenharmony_ci	struct r5dev *dev;
27918c2ecf20Sopenharmony_ci
27928c2ecf20Sopenharmony_ci	for (i = sh->disks; i--; ) {
27938c2ecf20Sopenharmony_ci		dev = &sh->dev[i];
27948c2ecf20Sopenharmony_ci		if (dev->orig_page != dev->page)
27958c2ecf20Sopenharmony_ci			put_page(dev->orig_page);
27968c2ecf20Sopenharmony_ci		dev->orig_page = conf->disks[i].extra_page;
27978c2ecf20Sopenharmony_ci	}
27988c2ecf20Sopenharmony_ci}
27998c2ecf20Sopenharmony_ci
28008c2ecf20Sopenharmony_ci/*
28018c2ecf20Sopenharmony_ci * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
28028c2ecf20Sopenharmony_ci * stripe is committed to RAID disks.
28038c2ecf20Sopenharmony_ci */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	struct r5l_log *log = conf->log;
	int i;
	int do_wakeup = 0;
	sector_t tree_index;
	void **pslot;
	uintptr_t refcount;

	/* nothing to do unless the parity of this stripe was journaled */
	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	/* the rest of the teardown only applies to write-back caching */
	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() runs before r5c_finish_stripe_write_out(),
	 * We updated R5_InJournal, so we also update s->injournal.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	/* the stripe no longer occupies journal space */
	spin_lock_irq(&log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&log->stripe_in_journal_lock);
	sh->log_start = MaxSector;

	atomic_dec(&log->stripe_in_journal_count);
	r5c_update_log_state(log);

	/* stop counting this stripe in big_stripe_tree */
	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		tree_index = r5c_tree_index(conf, sh->sector);
		spin_lock(&log->tree_lock);
		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
					       tree_index);
		BUG_ON(pslot == NULL);
		/*
		 * The refcount is kept in the upper bits of the slot value
		 * (above R5C_RADIX_COUNT_SHIFT); drop one reference and
		 * delete the slot when it was the last one.
		 */
		refcount = (uintptr_t)radix_tree_deref_slot_protected(
			pslot, &log->tree_lock) >>
			R5C_RADIX_COUNT_SHIFT;
		if (refcount == 1)
			radix_tree_delete(&log->big_stripe_tree, tree_index);
		else
			radix_tree_replace_slot(
				&log->big_stripe_tree, pslot,
				(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
		spin_unlock(&log->tree_lock);
	}

	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
		atomic_dec(&conf->r5c_flushing_partial_stripes);
		atomic_dec(&conf->r5c_cached_partial_stripes);
	}

	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
		atomic_dec(&conf->r5c_flushing_full_stripes);
		atomic_dec(&conf->r5c_cached_full_stripes);
	}

	r5l_append_flush_payload(log, sh->sector);
	/* stripe is flushed to raid disks, we can do resync now */
	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
		set_bit(STRIPE_HANDLE, &sh->state);
}
28888c2ecf20Sopenharmony_ci
/*
 * Write the dirty data pages of a caching-phase stripe into the journal
 * (no parity is written in this phase; see STRIPE_LOG_TRAPPED handling in
 * r5c_try_caching_write()).  Always returns 0: when the journal has no
 * space or r5l_log_stripe() fails, the stripe is queued on an internal
 * list and retried later instead of reporting an error to the caller.
 */
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;		/* number of pages that will be journaled */
	int reserve;		/* journal space needed, in sectors */
	int i;
	int ret = 0;

	BUG_ON(!log);

	/* checksum every page that is about to be written */
	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

	/*
	 * The stripe must enter state machine again to call endio, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		/* log critical and stripe not yet in journal: defer it */
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		/*
		 * A stripe whose log_start is the last checkpoint would
		 * deadlock reclaim here, hence the BUG().
		 */
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			/* retried later from the no_mem_stripes list */
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}
29438c2ecf20Sopenharmony_ci
29448c2ecf20Sopenharmony_ci/* check whether this big stripe is in write back cache. */
29458c2ecf20Sopenharmony_cibool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
29468c2ecf20Sopenharmony_ci{
29478c2ecf20Sopenharmony_ci	struct r5l_log *log = conf->log;
29488c2ecf20Sopenharmony_ci	sector_t tree_index;
29498c2ecf20Sopenharmony_ci	void *slot;
29508c2ecf20Sopenharmony_ci
29518c2ecf20Sopenharmony_ci	if (!log)
29528c2ecf20Sopenharmony_ci		return false;
29538c2ecf20Sopenharmony_ci
29548c2ecf20Sopenharmony_ci	WARN_ON_ONCE(!rcu_read_lock_held());
29558c2ecf20Sopenharmony_ci	tree_index = r5c_tree_index(conf, sect);
29568c2ecf20Sopenharmony_ci	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
29578c2ecf20Sopenharmony_ci	return slot != NULL;
29588c2ecf20Sopenharmony_ci}
29598c2ecf20Sopenharmony_ci
/*
 * r5l_load_log() - read the meta block at the recorded journal tail and
 * either resume the existing log (running recovery) or, when nothing
 * valid is found, initialize a fresh empty log at sector 0.
 */
static int r5l_load_log(struct r5l_log *log)
{
	struct md_rdev *rdev = log->rdev;
	struct page *page;
	struct r5l_meta_block *mb;
	sector_t cp = log->rdev->journal_tail;
	u32 stored_crc, expected_crc;
	bool create_super = false;
	int ret = 0;

	/* Make sure it's valid */
	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
		cp = 0;
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
		ret = -EIO;
		goto ioerr;
	}
	mb = page_address(page);

	/* unrecognized magic/version: treat the journal as uninitialized */
	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    mb->version != R5LOG_VERSION) {
		create_super = true;
		goto create;
	}
	/* the stored crc was computed with the checksum field zeroed */
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;
	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != expected_crc) {
		create_super = true;
		goto create;
	}
	/* a valid meta block records the sector it was written at */
	if (le64_to_cpu(mb->position) != cp) {
		create_super = true;
		goto create;
	}
create:
	if (create_super) {
		/* fresh log: random starting seq, empty meta block at 0 */
		log->last_cp_seq = prandom_u32();
		cp = 0;
		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
		/*
		 * Make sure super points to correct address. Log might have
		 * data very soon. If super hasn't correct log tail address,
		 * recovery can't find the log
		 */
		r5l_write_super(log, cp);
	} else
		log->last_cp_seq = le64_to_cpu(mb->seq);

	/* usable log size and reclaim trigger (see RECLAIM_MAX_FREE_SPACE) */
	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
	log->last_checkpoint = cp;

	__free_page(page);

	if (create_super) {
		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
		log->seq = log->last_cp_seq + 1;
		log->next_checkpoint = cp;
	} else
		/* replay log contents recorded since the checkpoint */
		ret = r5l_recovery_log(log);

	r5c_update_log_state(log);
	return ret;
ioerr:
	__free_page(page);
	return ret;
}
30348c2ecf20Sopenharmony_ci
30358c2ecf20Sopenharmony_ciint r5l_start(struct r5l_log *log)
30368c2ecf20Sopenharmony_ci{
30378c2ecf20Sopenharmony_ci	int ret;
30388c2ecf20Sopenharmony_ci
30398c2ecf20Sopenharmony_ci	if (!log)
30408c2ecf20Sopenharmony_ci		return 0;
30418c2ecf20Sopenharmony_ci
30428c2ecf20Sopenharmony_ci	ret = r5l_load_log(log);
30438c2ecf20Sopenharmony_ci	if (ret) {
30448c2ecf20Sopenharmony_ci		struct mddev *mddev = log->rdev->mddev;
30458c2ecf20Sopenharmony_ci		struct r5conf *conf = mddev->private;
30468c2ecf20Sopenharmony_ci
30478c2ecf20Sopenharmony_ci		r5l_exit_log(conf);
30488c2ecf20Sopenharmony_ci	}
30498c2ecf20Sopenharmony_ci	return ret;
30508c2ecf20Sopenharmony_ci}
30518c2ecf20Sopenharmony_ci
30528c2ecf20Sopenharmony_civoid r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
30538c2ecf20Sopenharmony_ci{
30548c2ecf20Sopenharmony_ci	struct r5conf *conf = mddev->private;
30558c2ecf20Sopenharmony_ci	struct r5l_log *log = conf->log;
30568c2ecf20Sopenharmony_ci
30578c2ecf20Sopenharmony_ci	if (!log)
30588c2ecf20Sopenharmony_ci		return;
30598c2ecf20Sopenharmony_ci
30608c2ecf20Sopenharmony_ci	if ((raid5_calc_degraded(conf) > 0 ||
30618c2ecf20Sopenharmony_ci	     test_bit(Journal, &rdev->flags)) &&
30628c2ecf20Sopenharmony_ci	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
30638c2ecf20Sopenharmony_ci		schedule_work(&log->disable_writeback_work);
30648c2ecf20Sopenharmony_ci}
30658c2ecf20Sopenharmony_ci
30668c2ecf20Sopenharmony_ciint r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
30678c2ecf20Sopenharmony_ci{
30688c2ecf20Sopenharmony_ci	struct request_queue *q = bdev_get_queue(rdev->bdev);
30698c2ecf20Sopenharmony_ci	struct r5l_log *log;
30708c2ecf20Sopenharmony_ci	char b[BDEVNAME_SIZE];
30718c2ecf20Sopenharmony_ci	int ret;
30728c2ecf20Sopenharmony_ci
30738c2ecf20Sopenharmony_ci	pr_debug("md/raid:%s: using device %s as journal\n",
30748c2ecf20Sopenharmony_ci		 mdname(conf->mddev), bdevname(rdev->bdev, b));
30758c2ecf20Sopenharmony_ci
30768c2ecf20Sopenharmony_ci	if (PAGE_SIZE != 4096)
30778c2ecf20Sopenharmony_ci		return -EINVAL;
30788c2ecf20Sopenharmony_ci
30798c2ecf20Sopenharmony_ci	/*
30808c2ecf20Sopenharmony_ci	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
30818c2ecf20Sopenharmony_ci	 * raid_disks r5l_payload_data_parity.
30828c2ecf20Sopenharmony_ci	 *
30838c2ecf20Sopenharmony_ci	 * Write journal and cache does not work for very big array
30848c2ecf20Sopenharmony_ci	 * (raid_disks > 203)
30858c2ecf20Sopenharmony_ci	 */
30868c2ecf20Sopenharmony_ci	if (sizeof(struct r5l_meta_block) +
30878c2ecf20Sopenharmony_ci	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
30888c2ecf20Sopenharmony_ci	     conf->raid_disks) > PAGE_SIZE) {
30898c2ecf20Sopenharmony_ci		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
30908c2ecf20Sopenharmony_ci		       mdname(conf->mddev), conf->raid_disks);
30918c2ecf20Sopenharmony_ci		return -EINVAL;
30928c2ecf20Sopenharmony_ci	}
30938c2ecf20Sopenharmony_ci
30948c2ecf20Sopenharmony_ci	log = kzalloc(sizeof(*log), GFP_KERNEL);
30958c2ecf20Sopenharmony_ci	if (!log)
30968c2ecf20Sopenharmony_ci		return -ENOMEM;
30978c2ecf20Sopenharmony_ci	log->rdev = rdev;
30988c2ecf20Sopenharmony_ci
30998c2ecf20Sopenharmony_ci	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
31008c2ecf20Sopenharmony_ci
31018c2ecf20Sopenharmony_ci	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
31028c2ecf20Sopenharmony_ci				       sizeof(rdev->mddev->uuid));
31038c2ecf20Sopenharmony_ci
31048c2ecf20Sopenharmony_ci	mutex_init(&log->io_mutex);
31058c2ecf20Sopenharmony_ci
31068c2ecf20Sopenharmony_ci	spin_lock_init(&log->io_list_lock);
31078c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->running_ios);
31088c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->io_end_ios);
31098c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->flushing_ios);
31108c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->finished_ios);
31118c2ecf20Sopenharmony_ci	bio_init(&log->flush_bio, NULL, 0);
31128c2ecf20Sopenharmony_ci
31138c2ecf20Sopenharmony_ci	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
31148c2ecf20Sopenharmony_ci	if (!log->io_kc)
31158c2ecf20Sopenharmony_ci		goto io_kc;
31168c2ecf20Sopenharmony_ci
31178c2ecf20Sopenharmony_ci	ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
31188c2ecf20Sopenharmony_ci	if (ret)
31198c2ecf20Sopenharmony_ci		goto io_pool;
31208c2ecf20Sopenharmony_ci
31218c2ecf20Sopenharmony_ci	ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
31228c2ecf20Sopenharmony_ci	if (ret)
31238c2ecf20Sopenharmony_ci		goto io_bs;
31248c2ecf20Sopenharmony_ci
31258c2ecf20Sopenharmony_ci	ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
31268c2ecf20Sopenharmony_ci	if (ret)
31278c2ecf20Sopenharmony_ci		goto out_mempool;
31288c2ecf20Sopenharmony_ci
31298c2ecf20Sopenharmony_ci	spin_lock_init(&log->tree_lock);
31308c2ecf20Sopenharmony_ci	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
31318c2ecf20Sopenharmony_ci
31328c2ecf20Sopenharmony_ci	log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
31338c2ecf20Sopenharmony_ci						 log->rdev->mddev, "reclaim");
31348c2ecf20Sopenharmony_ci	if (!log->reclaim_thread)
31358c2ecf20Sopenharmony_ci		goto reclaim_thread;
31368c2ecf20Sopenharmony_ci	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
31378c2ecf20Sopenharmony_ci
31388c2ecf20Sopenharmony_ci	init_waitqueue_head(&log->iounit_wait);
31398c2ecf20Sopenharmony_ci
31408c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->no_mem_stripes);
31418c2ecf20Sopenharmony_ci
31428c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->no_space_stripes);
31438c2ecf20Sopenharmony_ci	spin_lock_init(&log->no_space_stripes_lock);
31448c2ecf20Sopenharmony_ci
31458c2ecf20Sopenharmony_ci	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
31468c2ecf20Sopenharmony_ci	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
31478c2ecf20Sopenharmony_ci
31488c2ecf20Sopenharmony_ci	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
31498c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&log->stripe_in_journal_list);
31508c2ecf20Sopenharmony_ci	spin_lock_init(&log->stripe_in_journal_lock);
31518c2ecf20Sopenharmony_ci	atomic_set(&log->stripe_in_journal_count, 0);
31528c2ecf20Sopenharmony_ci
31538c2ecf20Sopenharmony_ci	rcu_assign_pointer(conf->log, log);
31548c2ecf20Sopenharmony_ci
31558c2ecf20Sopenharmony_ci	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
31568c2ecf20Sopenharmony_ci	return 0;
31578c2ecf20Sopenharmony_ci
31588c2ecf20Sopenharmony_cireclaim_thread:
31598c2ecf20Sopenharmony_ci	mempool_exit(&log->meta_pool);
31608c2ecf20Sopenharmony_ciout_mempool:
31618c2ecf20Sopenharmony_ci	bioset_exit(&log->bs);
31628c2ecf20Sopenharmony_ciio_bs:
31638c2ecf20Sopenharmony_ci	mempool_exit(&log->io_pool);
31648c2ecf20Sopenharmony_ciio_pool:
31658c2ecf20Sopenharmony_ci	kmem_cache_destroy(log->io_kc);
31668c2ecf20Sopenharmony_ciio_kc:
31678c2ecf20Sopenharmony_ci	kfree(log);
31688c2ecf20Sopenharmony_ci	return -EINVAL;
31698c2ecf20Sopenharmony_ci}
31708c2ecf20Sopenharmony_ci
/*
 * r5l_exit_log() - detach and free the journal.
 *
 * conf->log is cleared first and an RCU grace period is waited out so
 * lock-free readers (e.g. r5c_big_stripe_cached()) can no longer reach
 * the log; only then are the worker, reclaim thread and pools torn
 * down.  Teardown order mirrors r5l_init_log() in reverse.
 */
void r5l_exit_log(struct r5conf *conf)
{
	struct r5l_log *log = conf->log;

	conf->log = NULL;
	synchronize_rcu();

	/* Ensure disable_writeback_work wakes up and exits */
	wake_up(&conf->mddev->sb_wait);
	flush_work(&log->disable_writeback_work);
	md_unregister_thread(&log->reclaim_thread);
	mempool_exit(&log->meta_pool);
	bioset_exit(&log->bs);
	mempool_exit(&log->io_pool);
	kmem_cache_destroy(log->io_kc);
	kfree(log);
}
3188