162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright (C) 2015 Shaohua Li <shli@fb.com>
462306a36Sopenharmony_ci * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
562306a36Sopenharmony_ci */
662306a36Sopenharmony_ci#include <linux/kernel.h>
762306a36Sopenharmony_ci#include <linux/wait.h>
862306a36Sopenharmony_ci#include <linux/blkdev.h>
962306a36Sopenharmony_ci#include <linux/slab.h>
1062306a36Sopenharmony_ci#include <linux/raid/md_p.h>
1162306a36Sopenharmony_ci#include <linux/crc32c.h>
1262306a36Sopenharmony_ci#include <linux/random.h>
1362306a36Sopenharmony_ci#include <linux/kthread.h>
1462306a36Sopenharmony_ci#include <linux/types.h>
1562306a36Sopenharmony_ci#include "md.h"
1662306a36Sopenharmony_ci#include "raid5.h"
1762306a36Sopenharmony_ci#include "md-bitmap.h"
1862306a36Sopenharmony_ci#include "raid5-log.h"
1962306a36Sopenharmony_ci
/*
 * Metadata and data are stored on disk in 4k units (a block), regardless of
 * the underlying hardware sector size. This only works with PAGE_SIZE == 4096.
 */
#define BLOCK_SECTORS (8)
#define BLOCK_SECTOR_SHIFT (3)

/*
 * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
 *
 * In write through mode, the reclaim runs every log->max_free_space.
 * This keeps the recovery scan from taking too long.
 */
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

/* wake up reclaim thread periodically */
#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
/*
 * start flush with these full stripes; the argument is parenthesized so
 * that any pointer expression (not only a plain identifier) may be passed
 */
#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) ((conf)->max_nr_stripes / 4)
/* reclaim stripes in groups */
#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)

/*
 * We only need 2 bios per I/O unit to make progress, but ensure we
 * have a few more available to not get too tight.
 */
#define R5L_POOL_SIZE	4
4862306a36Sopenharmony_ci
/* printable journal mode names: entry 0 is write-through, entry 1 is write-back */
static char *r5c_journal_mode_str[] = {
	"write-through",
	"write-back",
};
5162306a36Sopenharmony_ci/*
5262306a36Sopenharmony_ci * raid5 cache state machine
5362306a36Sopenharmony_ci *
5462306a36Sopenharmony_ci * With the RAID cache, each stripe works in two phases:
5562306a36Sopenharmony_ci *	- caching phase
5662306a36Sopenharmony_ci *	- writing-out phase
5762306a36Sopenharmony_ci *
5862306a36Sopenharmony_ci * These two phases are controlled by bit STRIPE_R5C_CACHING:
5962306a36Sopenharmony_ci *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
6062306a36Sopenharmony_ci *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
6162306a36Sopenharmony_ci *
6262306a36Sopenharmony_ci * When there is no journal, or the journal is in write-through mode,
6362306a36Sopenharmony_ci * the stripe is always in writing-out phase.
6462306a36Sopenharmony_ci *
6562306a36Sopenharmony_ci * For write-back journal, the stripe is sent to caching phase on write
6662306a36Sopenharmony_ci * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
6762306a36Sopenharmony_ci * the write-out phase by clearing STRIPE_R5C_CACHING.
6862306a36Sopenharmony_ci *
6962306a36Sopenharmony_ci * Stripes in caching phase do not write the raid disks. Instead, all
7062306a36Sopenharmony_ci * writes are committed from the log device. Therefore, a stripe in
7162306a36Sopenharmony_ci * caching phase handles writes as:
7262306a36Sopenharmony_ci *	- write to log device
7362306a36Sopenharmony_ci *	- return IO
7462306a36Sopenharmony_ci *
7562306a36Sopenharmony_ci * Stripes in writing-out phase handle writes as:
7662306a36Sopenharmony_ci *	- calculate parity
7762306a36Sopenharmony_ci *	- write pending data and parity to journal
7862306a36Sopenharmony_ci *	- write data and parity to raid disks
7962306a36Sopenharmony_ci *	- return IO for pending writes
8062306a36Sopenharmony_ci */
8162306a36Sopenharmony_ci
/*
 * Bookkeeping for one raid5 log device: ring positions (head/tail), the
 * lists io_units move through, memory pools, reclaim state and the
 * write-back cache tracking.
 */
struct r5l_log {
	struct md_rdev *rdev;

	u32 uuid_checksum;

	sector_t device_size;		/* log device size, rounded to
					 * BLOCK_SECTORS */
	sector_t max_free_space;	/* reclaim runs if free space is at
					 * this size */

	sector_t last_checkpoint;	/* log tail. where the recovery scan
					 * starts from */
	u64 last_cp_seq;		/* log tail sequence */

	sector_t log_start;		/* log head. where new data is appended */
	u64 seq;			/* log head sequence */

	sector_t next_checkpoint;

	struct mutex io_mutex;
	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */

	/* held while walking/moving entries of the io_unit lists below */
	spinlock_t io_list_lock;
	struct list_head running_ios;	/* io_units which are still running,
					 * and have not yet been completely
					 * written to the log */
	struct list_head io_end_ios;	/* io_units which have been completely
					 * written to the log but not yet written
					 * to the RAID */
	struct list_head flushing_ios;	/* io_units which are waiting for log
					 * cache flush */
	struct list_head finished_ios;	/* io_units which have settled down on
					 * the log disk */
	struct bio flush_bio;

	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */

	struct kmem_cache *io_kc;
	mempool_t io_pool;
	struct bio_set bs;
	mempool_t meta_pool;		/* backs io_unit->meta_page */

	struct md_thread __rcu *reclaim_thread;
	unsigned long reclaim_target;	/* amount of space that needs to be
					 * reclaimed.  if it's 0, reclaim space
					 * used by io_units which are in
					 * IO_UNIT_STRIPE_END state (i.e. reclaim
					 * doesn't wait for a specific io_unit
					 * to switch to IO_UNIT_STRIPE_END
					 * state) */
	wait_queue_head_t iounit_wait;

	struct list_head no_space_stripes; /* pending stripes, log has no space */
	spinlock_t no_space_stripes_lock;

	bool need_cache_flush;

	/* for r5c_cache */
	enum r5c_journal_mode r5c_journal_mode;

	/* all stripes in r5cache, in the order of seq at sh->log_start */
	struct list_head stripe_in_journal_list;

	spinlock_t stripe_in_journal_lock;
	atomic_t stripe_in_journal_count;

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write back while in degraded mode */
	struct work_struct disable_writeback_work;

	/* for chunk_aligned_read in writeback mode, details below */
	spinlock_t tree_lock;
	struct radix_tree_root big_stripe_tree;
};
15662306a36Sopenharmony_ci
/*
 * Enable chunk_aligned_read() with write back cache.
 *
 * Each chunk may contain more than one stripe (for example, a 256kB
 * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
 * chunk_aligned_read, these stripes are grouped into one "big_stripe".
 * For each big_stripe, we count how many stripes of this big_stripe
 * are in the write back cache. This data is tracked in a radix tree
 * (big_stripe_tree). We use the radix tree item pointer as the counter.
 * r5c_tree_index() is used to calculate keys for the radix tree.
 *
 * chunk_aligned_read() calls r5c_big_stripe_cached() to look up the
 * big_stripe of each chunk in the tree. If this big_stripe is in the
 * tree, chunk_aligned_read() aborts. This lookup is protected by
 * rcu_read_lock().
 *
 * It is necessary to remember whether a stripe is counted in
 * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
 * two flags is set, the stripe is counted in big_stripe_tree. This
 * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
 * r5c_try_caching_write(); and moving clear_bit of
 * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
 * r5c_finish_stripe_write_out().
 */
18262306a36Sopenharmony_ci
/*
 * The radix tree requires the lowest 2 bits of a data pointer to be 2b'00.
 * So it is necessary to left shift the counter by 2 bits before using it
 * as a data pointer of the tree.
 */
18862306a36Sopenharmony_ci#define R5C_RADIX_COUNT_SHIFT 2
18962306a36Sopenharmony_ci
19062306a36Sopenharmony_ci/*
19162306a36Sopenharmony_ci * calculate key for big_stripe_tree
19262306a36Sopenharmony_ci *
19362306a36Sopenharmony_ci * sect: align_bi->bi_iter.bi_sector or sh->sector
19462306a36Sopenharmony_ci */
19562306a36Sopenharmony_cistatic inline sector_t r5c_tree_index(struct r5conf *conf,
19662306a36Sopenharmony_ci				      sector_t sect)
19762306a36Sopenharmony_ci{
19862306a36Sopenharmony_ci	sector_div(sect, conf->chunk_sectors);
19962306a36Sopenharmony_ci	return sect;
20062306a36Sopenharmony_ci}
20162306a36Sopenharmony_ci
/*
 * An IO range starts from a meta data block and ends at the next meta data
 * block. The io unit's meta data block tracks the data/parity that follows
 * it. An io unit is written to the log disk with a normal write; as we always
 * flush the log disk first and then start moving data to the raid disks,
 * there is no requirement to write an io unit with FLUSH/FUA.
 */
struct r5l_io_unit {
	struct r5l_log *log;

	struct page *meta_page;	/* store meta block */
	int meta_offset;	/* current offset in meta_page */

	struct bio *current_bio;/* current_bio accepting new data */

	atomic_t pending_stripe;/* how many stripes not flushed to raid */
	u64 seq;		/* seq number of the metablock */
	sector_t log_start;	/* where the io_unit starts */
	sector_t log_end;	/* where the io_unit ends */
	struct list_head log_sibling; /* log->running_ios */
	struct list_head stripe_list; /* stripes added to the io_unit */

	int state;		/* one of enum r5l_io_unit_state */
	bool need_split_bio;
	struct bio *split_bio;

	unsigned int has_flush:1;		/* include flush request */
	unsigned int has_fua:1;			/* include fua request */
	unsigned int has_null_flush:1;		/* include null flush request */
	unsigned int has_flush_payload:1;	/* include flush payload  */
	/*
	 * io isn't sent yet; a flush/fua request can only be submitted once
	 * it is the first IO in the running_ios list
	 */
	unsigned int io_deferred:1;

	struct bio_list flush_barriers;   /* size == 0 flush bios */
};
24062306a36Sopenharmony_ci
/*
 * r5l_io_unit state. Values are ordered; __r5l_set_io_unit_state() only
 * accepts a transition to a strictly larger value.
 */
enum r5l_io_unit_state {
	IO_UNIT_RUNNING = 0,	/* accepting new IO */
	IO_UNIT_IO_START = 1,	/* io_unit bio has started writing to the log,
				 * no longer accepting new bios */
	IO_UNIT_IO_END = 2,	/* io_unit bio finished writing to the log */
	IO_UNIT_STRIPE_END = 3,	/* stripe data finished writing to raid */
};
24962306a36Sopenharmony_ci
25062306a36Sopenharmony_cibool r5c_is_writeback(struct r5l_log *log)
25162306a36Sopenharmony_ci{
25262306a36Sopenharmony_ci	return (log != NULL &&
25362306a36Sopenharmony_ci		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
25462306a36Sopenharmony_ci}
25562306a36Sopenharmony_ci
25662306a36Sopenharmony_cistatic sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
25762306a36Sopenharmony_ci{
25862306a36Sopenharmony_ci	start += inc;
25962306a36Sopenharmony_ci	if (start >= log->device_size)
26062306a36Sopenharmony_ci		start = start - log->device_size;
26162306a36Sopenharmony_ci	return start;
26262306a36Sopenharmony_ci}
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_cistatic sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
26562306a36Sopenharmony_ci				  sector_t end)
26662306a36Sopenharmony_ci{
26762306a36Sopenharmony_ci	if (end >= start)
26862306a36Sopenharmony_ci		return end - start;
26962306a36Sopenharmony_ci	else
27062306a36Sopenharmony_ci		return end + log->device_size - start;
27162306a36Sopenharmony_ci}
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_cistatic bool r5l_has_free_space(struct r5l_log *log, sector_t size)
27462306a36Sopenharmony_ci{
27562306a36Sopenharmony_ci	sector_t used_size;
27662306a36Sopenharmony_ci
27762306a36Sopenharmony_ci	used_size = r5l_ring_distance(log, log->last_checkpoint,
27862306a36Sopenharmony_ci					log->log_start);
27962306a36Sopenharmony_ci
28062306a36Sopenharmony_ci	return log->device_size > used_size + size;
28162306a36Sopenharmony_ci}
28262306a36Sopenharmony_ci
28362306a36Sopenharmony_cistatic void __r5l_set_io_unit_state(struct r5l_io_unit *io,
28462306a36Sopenharmony_ci				    enum r5l_io_unit_state state)
28562306a36Sopenharmony_ci{
28662306a36Sopenharmony_ci	if (WARN_ON(io->state >= state))
28762306a36Sopenharmony_ci		return;
28862306a36Sopenharmony_ci	io->state = state;
28962306a36Sopenharmony_ci}
29062306a36Sopenharmony_ci
29162306a36Sopenharmony_cistatic void
29262306a36Sopenharmony_cir5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
29362306a36Sopenharmony_ci{
29462306a36Sopenharmony_ci	struct bio *wbi, *wbi2;
29562306a36Sopenharmony_ci
29662306a36Sopenharmony_ci	wbi = dev->written;
29762306a36Sopenharmony_ci	dev->written = NULL;
29862306a36Sopenharmony_ci	while (wbi && wbi->bi_iter.bi_sector <
29962306a36Sopenharmony_ci	       dev->sector + RAID5_STRIPE_SECTORS(conf)) {
30062306a36Sopenharmony_ci		wbi2 = r5_next_bio(conf, wbi, dev->sector);
30162306a36Sopenharmony_ci		md_write_end(conf->mddev);
30262306a36Sopenharmony_ci		bio_endio(wbi);
30362306a36Sopenharmony_ci		wbi = wbi2;
30462306a36Sopenharmony_ci	}
30562306a36Sopenharmony_ci}
30662306a36Sopenharmony_ci
30762306a36Sopenharmony_civoid r5c_handle_cached_data_endio(struct r5conf *conf,
30862306a36Sopenharmony_ci				  struct stripe_head *sh, int disks)
30962306a36Sopenharmony_ci{
31062306a36Sopenharmony_ci	int i;
31162306a36Sopenharmony_ci
31262306a36Sopenharmony_ci	for (i = sh->disks; i--; ) {
31362306a36Sopenharmony_ci		if (sh->dev[i].written) {
31462306a36Sopenharmony_ci			set_bit(R5_UPTODATE, &sh->dev[i].flags);
31562306a36Sopenharmony_ci			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
31662306a36Sopenharmony_ci			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
31762306a36Sopenharmony_ci					   RAID5_STRIPE_SECTORS(conf),
31862306a36Sopenharmony_ci					   !test_bit(STRIPE_DEGRADED, &sh->state),
31962306a36Sopenharmony_ci					   0);
32062306a36Sopenharmony_ci		}
32162306a36Sopenharmony_ci	}
32262306a36Sopenharmony_ci}
32362306a36Sopenharmony_ci
32462306a36Sopenharmony_civoid r5l_wake_reclaim(struct r5l_log *log, sector_t space);
32562306a36Sopenharmony_ci
32662306a36Sopenharmony_ci/* Check whether we should flush some stripes to free up stripe cache */
32762306a36Sopenharmony_civoid r5c_check_stripe_cache_usage(struct r5conf *conf)
32862306a36Sopenharmony_ci{
32962306a36Sopenharmony_ci	int total_cached;
33062306a36Sopenharmony_ci
33162306a36Sopenharmony_ci	if (!r5c_is_writeback(conf->log))
33262306a36Sopenharmony_ci		return;
33362306a36Sopenharmony_ci
33462306a36Sopenharmony_ci	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
33562306a36Sopenharmony_ci		atomic_read(&conf->r5c_cached_full_stripes);
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	/*
33862306a36Sopenharmony_ci	 * The following condition is true for either of the following:
33962306a36Sopenharmony_ci	 *   - stripe cache pressure high:
34062306a36Sopenharmony_ci	 *          total_cached > 3/4 min_nr_stripes ||
34162306a36Sopenharmony_ci	 *          empty_inactive_list_nr > 0
34262306a36Sopenharmony_ci	 *   - stripe cache pressure moderate:
34362306a36Sopenharmony_ci	 *          total_cached > 1/2 min_nr_stripes
34462306a36Sopenharmony_ci	 */
34562306a36Sopenharmony_ci	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
34662306a36Sopenharmony_ci	    atomic_read(&conf->empty_inactive_list_nr) > 0)
34762306a36Sopenharmony_ci		r5l_wake_reclaim(conf->log, 0);
34862306a36Sopenharmony_ci}
34962306a36Sopenharmony_ci
35062306a36Sopenharmony_ci/*
35162306a36Sopenharmony_ci * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
35262306a36Sopenharmony_ci * stripes in the cache
35362306a36Sopenharmony_ci */
35462306a36Sopenharmony_civoid r5c_check_cached_full_stripe(struct r5conf *conf)
35562306a36Sopenharmony_ci{
35662306a36Sopenharmony_ci	if (!r5c_is_writeback(conf->log))
35762306a36Sopenharmony_ci		return;
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci	/*
36062306a36Sopenharmony_ci	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
36162306a36Sopenharmony_ci	 * or a full stripe (chunk size / 4k stripes).
36262306a36Sopenharmony_ci	 */
36362306a36Sopenharmony_ci	if (atomic_read(&conf->r5c_cached_full_stripes) >=
36462306a36Sopenharmony_ci	    min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
36562306a36Sopenharmony_ci		conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
36662306a36Sopenharmony_ci		r5l_wake_reclaim(conf->log, 0);
36762306a36Sopenharmony_ci}
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci/*
37062306a36Sopenharmony_ci * Total log space (in sectors) needed to flush all data in cache
37162306a36Sopenharmony_ci *
 * To avoid deadlock due to log space, it is necessary to reserve log
 * space to flush critical stripes (stripes that occupy log space near
 * last_checkpoint). This function helps check how much log space is
 * required to flush all cached stripes.
37662306a36Sopenharmony_ci *
37762306a36Sopenharmony_ci * To reduce log space requirements, two mechanisms are used to give cache
37862306a36Sopenharmony_ci * flush higher priorities:
37962306a36Sopenharmony_ci *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
38062306a36Sopenharmony_ci *       stripes ALREADY in journal can be flushed w/o pending writes;
38162306a36Sopenharmony_ci *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
38262306a36Sopenharmony_ci *       can be delayed (r5l_add_no_space_stripe).
38362306a36Sopenharmony_ci *
 * In cache flush, the stripe goes through 1 and then 2. For a stripe that
 * has already passed 1, flushing it requires at most (conf->max_degraded + 1)
 * pages of journal space. For a stripe that has not passed 1, flushing it
 * requires (conf->raid_disks + 1) pages of journal space. There are at
 * most (conf->group_cnt + 1) stripes that have passed 1. So the total journal
 * space required to flush all cached stripes (in pages) is:
39062306a36Sopenharmony_ci *
39162306a36Sopenharmony_ci *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
39262306a36Sopenharmony_ci *     (group_cnt + 1) * (raid_disks + 1)
39362306a36Sopenharmony_ci * or
39462306a36Sopenharmony_ci *     (stripe_in_journal_count) * (max_degraded + 1) +
39562306a36Sopenharmony_ci *     (group_cnt + 1) * (raid_disks - max_degraded)
39662306a36Sopenharmony_ci */
39762306a36Sopenharmony_cistatic sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
39862306a36Sopenharmony_ci{
39962306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
40062306a36Sopenharmony_ci
40162306a36Sopenharmony_ci	if (!r5c_is_writeback(log))
40262306a36Sopenharmony_ci		return 0;
40362306a36Sopenharmony_ci
40462306a36Sopenharmony_ci	return BLOCK_SECTORS *
40562306a36Sopenharmony_ci		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
40662306a36Sopenharmony_ci		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
40762306a36Sopenharmony_ci}
40862306a36Sopenharmony_ci
40962306a36Sopenharmony_ci/*
41062306a36Sopenharmony_ci * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
41162306a36Sopenharmony_ci *
41262306a36Sopenharmony_ci * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
41362306a36Sopenharmony_ci * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
41462306a36Sopenharmony_ci * device is less than 2x of reclaim_required_space.
41562306a36Sopenharmony_ci */
41662306a36Sopenharmony_cistatic inline void r5c_update_log_state(struct r5l_log *log)
41762306a36Sopenharmony_ci{
41862306a36Sopenharmony_ci	struct r5conf *conf = log->rdev->mddev->private;
41962306a36Sopenharmony_ci	sector_t free_space;
42062306a36Sopenharmony_ci	sector_t reclaim_space;
42162306a36Sopenharmony_ci	bool wake_reclaim = false;
42262306a36Sopenharmony_ci
42362306a36Sopenharmony_ci	if (!r5c_is_writeback(log))
42462306a36Sopenharmony_ci		return;
42562306a36Sopenharmony_ci
42662306a36Sopenharmony_ci	free_space = r5l_ring_distance(log, log->log_start,
42762306a36Sopenharmony_ci				       log->last_checkpoint);
42862306a36Sopenharmony_ci	reclaim_space = r5c_log_required_to_flush_cache(conf);
42962306a36Sopenharmony_ci	if (free_space < 2 * reclaim_space)
43062306a36Sopenharmony_ci		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
43162306a36Sopenharmony_ci	else {
43262306a36Sopenharmony_ci		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
43362306a36Sopenharmony_ci			wake_reclaim = true;
43462306a36Sopenharmony_ci		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
43562306a36Sopenharmony_ci	}
43662306a36Sopenharmony_ci	if (free_space < 3 * reclaim_space)
43762306a36Sopenharmony_ci		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
43862306a36Sopenharmony_ci	else
43962306a36Sopenharmony_ci		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	if (wake_reclaim)
44262306a36Sopenharmony_ci		r5l_wake_reclaim(log, 0);
44362306a36Sopenharmony_ci}
44462306a36Sopenharmony_ci
44562306a36Sopenharmony_ci/*
44662306a36Sopenharmony_ci * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
44762306a36Sopenharmony_ci * This function should only be called in write-back mode.
44862306a36Sopenharmony_ci */
44962306a36Sopenharmony_civoid r5c_make_stripe_write_out(struct stripe_head *sh)
45062306a36Sopenharmony_ci{
45162306a36Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
45262306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
45362306a36Sopenharmony_ci
45462306a36Sopenharmony_ci	BUG_ON(!r5c_is_writeback(log));
45562306a36Sopenharmony_ci
45662306a36Sopenharmony_ci	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
45762306a36Sopenharmony_ci	clear_bit(STRIPE_R5C_CACHING, &sh->state);
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
46062306a36Sopenharmony_ci		atomic_inc(&conf->preread_active_stripes);
46162306a36Sopenharmony_ci}
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_cistatic void r5c_handle_data_cached(struct stripe_head *sh)
46462306a36Sopenharmony_ci{
46562306a36Sopenharmony_ci	int i;
46662306a36Sopenharmony_ci
46762306a36Sopenharmony_ci	for (i = sh->disks; i--; )
46862306a36Sopenharmony_ci		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
46962306a36Sopenharmony_ci			set_bit(R5_InJournal, &sh->dev[i].flags);
47062306a36Sopenharmony_ci			clear_bit(R5_LOCKED, &sh->dev[i].flags);
47162306a36Sopenharmony_ci		}
47262306a36Sopenharmony_ci	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
47362306a36Sopenharmony_ci}
47462306a36Sopenharmony_ci
47562306a36Sopenharmony_ci/*
47662306a36Sopenharmony_ci * this journal write must contain full parity,
47762306a36Sopenharmony_ci * it may also contain some data pages
47862306a36Sopenharmony_ci */
47962306a36Sopenharmony_cistatic void r5c_handle_parity_cached(struct stripe_head *sh)
48062306a36Sopenharmony_ci{
48162306a36Sopenharmony_ci	int i;
48262306a36Sopenharmony_ci
48362306a36Sopenharmony_ci	for (i = sh->disks; i--; )
48462306a36Sopenharmony_ci		if (test_bit(R5_InJournal, &sh->dev[i].flags))
48562306a36Sopenharmony_ci			set_bit(R5_Wantwrite, &sh->dev[i].flags);
48662306a36Sopenharmony_ci}
48762306a36Sopenharmony_ci
48862306a36Sopenharmony_ci/*
48962306a36Sopenharmony_ci * Setting proper flags after writing (or flushing) data and/or parity to the
49062306a36Sopenharmony_ci * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
49162306a36Sopenharmony_ci */
49262306a36Sopenharmony_cistatic void r5c_finish_cache_stripe(struct stripe_head *sh)
49362306a36Sopenharmony_ci{
49462306a36Sopenharmony_ci	struct r5l_log *log = sh->raid_conf->log;
49562306a36Sopenharmony_ci
49662306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
49762306a36Sopenharmony_ci		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
49862306a36Sopenharmony_ci		/*
49962306a36Sopenharmony_ci		 * Set R5_InJournal for parity dev[pd_idx]. This means
50062306a36Sopenharmony_ci		 * all data AND parity in the journal. For RAID 6, it is
50162306a36Sopenharmony_ci		 * NOT necessary to set the flag for dev[qd_idx], as the
50262306a36Sopenharmony_ci		 * two parities are written out together.
50362306a36Sopenharmony_ci		 */
50462306a36Sopenharmony_ci		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
50562306a36Sopenharmony_ci	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
50662306a36Sopenharmony_ci		r5c_handle_data_cached(sh);
50762306a36Sopenharmony_ci	} else {
50862306a36Sopenharmony_ci		r5c_handle_parity_cached(sh);
50962306a36Sopenharmony_ci		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
51062306a36Sopenharmony_ci	}
51162306a36Sopenharmony_ci}
51262306a36Sopenharmony_ci
51362306a36Sopenharmony_cistatic void r5l_io_run_stripes(struct r5l_io_unit *io)
51462306a36Sopenharmony_ci{
51562306a36Sopenharmony_ci	struct stripe_head *sh, *next;
51662306a36Sopenharmony_ci
51762306a36Sopenharmony_ci	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
51862306a36Sopenharmony_ci		list_del_init(&sh->log_list);
51962306a36Sopenharmony_ci
52062306a36Sopenharmony_ci		r5c_finish_cache_stripe(sh);
52162306a36Sopenharmony_ci
52262306a36Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
52362306a36Sopenharmony_ci		raid5_release_stripe(sh);
52462306a36Sopenharmony_ci	}
52562306a36Sopenharmony_ci}
52662306a36Sopenharmony_ci
52762306a36Sopenharmony_cistatic void r5l_log_run_stripes(struct r5l_log *log)
52862306a36Sopenharmony_ci{
52962306a36Sopenharmony_ci	struct r5l_io_unit *io, *next;
53062306a36Sopenharmony_ci
53162306a36Sopenharmony_ci	lockdep_assert_held(&log->io_list_lock);
53262306a36Sopenharmony_ci
53362306a36Sopenharmony_ci	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
53462306a36Sopenharmony_ci		/* don't change list order */
53562306a36Sopenharmony_ci		if (io->state < IO_UNIT_IO_END)
53662306a36Sopenharmony_ci			break;
53762306a36Sopenharmony_ci
53862306a36Sopenharmony_ci		list_move_tail(&io->log_sibling, &log->finished_ios);
53962306a36Sopenharmony_ci		r5l_io_run_stripes(io);
54062306a36Sopenharmony_ci	}
54162306a36Sopenharmony_ci}
54262306a36Sopenharmony_ci
54362306a36Sopenharmony_cistatic void r5l_move_to_end_ios(struct r5l_log *log)
54462306a36Sopenharmony_ci{
54562306a36Sopenharmony_ci	struct r5l_io_unit *io, *next;
54662306a36Sopenharmony_ci
54762306a36Sopenharmony_ci	lockdep_assert_held(&log->io_list_lock);
54862306a36Sopenharmony_ci
54962306a36Sopenharmony_ci	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
55062306a36Sopenharmony_ci		/* don't change list order */
55162306a36Sopenharmony_ci		if (io->state < IO_UNIT_IO_END)
55262306a36Sopenharmony_ci			break;
55362306a36Sopenharmony_ci		list_move_tail(&io->log_sibling, &log->io_end_ios);
55462306a36Sopenharmony_ci	}
55562306a36Sopenharmony_ci}
55662306a36Sopenharmony_ci
55762306a36Sopenharmony_cistatic void __r5l_stripe_write_finished(struct r5l_io_unit *io);
55862306a36Sopenharmony_cistatic void r5l_log_endio(struct bio *bio)
55962306a36Sopenharmony_ci{
56062306a36Sopenharmony_ci	struct r5l_io_unit *io = bio->bi_private;
56162306a36Sopenharmony_ci	struct r5l_io_unit *io_deferred;
56262306a36Sopenharmony_ci	struct r5l_log *log = io->log;
56362306a36Sopenharmony_ci	unsigned long flags;
56462306a36Sopenharmony_ci	bool has_null_flush;
56562306a36Sopenharmony_ci	bool has_flush_payload;
56662306a36Sopenharmony_ci
56762306a36Sopenharmony_ci	if (bio->bi_status)
56862306a36Sopenharmony_ci		md_error(log->rdev->mddev, log->rdev);
56962306a36Sopenharmony_ci
57062306a36Sopenharmony_ci	bio_put(bio);
57162306a36Sopenharmony_ci	mempool_free(io->meta_page, &log->meta_pool);
57262306a36Sopenharmony_ci
57362306a36Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
57462306a36Sopenharmony_ci	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
57562306a36Sopenharmony_ci
57662306a36Sopenharmony_ci	/*
57762306a36Sopenharmony_ci	 * if the io doesn't not have null_flush or flush payload,
57862306a36Sopenharmony_ci	 * it is not safe to access it after releasing io_list_lock.
57962306a36Sopenharmony_ci	 * Therefore, it is necessary to check the condition with
58062306a36Sopenharmony_ci	 * the lock held.
58162306a36Sopenharmony_ci	 */
58262306a36Sopenharmony_ci	has_null_flush = io->has_null_flush;
58362306a36Sopenharmony_ci	has_flush_payload = io->has_flush_payload;
58462306a36Sopenharmony_ci
58562306a36Sopenharmony_ci	if (log->need_cache_flush && !list_empty(&io->stripe_list))
58662306a36Sopenharmony_ci		r5l_move_to_end_ios(log);
58762306a36Sopenharmony_ci	else
58862306a36Sopenharmony_ci		r5l_log_run_stripes(log);
58962306a36Sopenharmony_ci	if (!list_empty(&log->running_ios)) {
59062306a36Sopenharmony_ci		/*
59162306a36Sopenharmony_ci		 * FLUSH/FUA io_unit is deferred because of ordering, now we
59262306a36Sopenharmony_ci		 * can dispatch it
59362306a36Sopenharmony_ci		 */
59462306a36Sopenharmony_ci		io_deferred = list_first_entry(&log->running_ios,
59562306a36Sopenharmony_ci					       struct r5l_io_unit, log_sibling);
59662306a36Sopenharmony_ci		if (io_deferred->io_deferred)
59762306a36Sopenharmony_ci			schedule_work(&log->deferred_io_work);
59862306a36Sopenharmony_ci	}
59962306a36Sopenharmony_ci
60062306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
60162306a36Sopenharmony_ci
60262306a36Sopenharmony_ci	if (log->need_cache_flush)
60362306a36Sopenharmony_ci		md_wakeup_thread(log->rdev->mddev->thread);
60462306a36Sopenharmony_ci
60562306a36Sopenharmony_ci	/* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
60662306a36Sopenharmony_ci	if (has_null_flush) {
60762306a36Sopenharmony_ci		struct bio *bi;
60862306a36Sopenharmony_ci
60962306a36Sopenharmony_ci		WARN_ON(bio_list_empty(&io->flush_barriers));
61062306a36Sopenharmony_ci		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
61162306a36Sopenharmony_ci			bio_endio(bi);
61262306a36Sopenharmony_ci			if (atomic_dec_and_test(&io->pending_stripe)) {
61362306a36Sopenharmony_ci				__r5l_stripe_write_finished(io);
61462306a36Sopenharmony_ci				return;
61562306a36Sopenharmony_ci			}
61662306a36Sopenharmony_ci		}
61762306a36Sopenharmony_ci	}
61862306a36Sopenharmony_ci	/* decrease pending_stripe for flush payload */
61962306a36Sopenharmony_ci	if (has_flush_payload)
62062306a36Sopenharmony_ci		if (atomic_dec_and_test(&io->pending_stripe))
62162306a36Sopenharmony_ci			__r5l_stripe_write_finished(io);
62262306a36Sopenharmony_ci}
62362306a36Sopenharmony_ci
62462306a36Sopenharmony_cistatic void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
62562306a36Sopenharmony_ci{
62662306a36Sopenharmony_ci	unsigned long flags;
62762306a36Sopenharmony_ci
62862306a36Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
62962306a36Sopenharmony_ci	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
63062306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
63162306a36Sopenharmony_ci
63262306a36Sopenharmony_ci	/*
63362306a36Sopenharmony_ci	 * In case of journal device failures, submit_bio will get error
63462306a36Sopenharmony_ci	 * and calls endio, then active stripes will continue write
63562306a36Sopenharmony_ci	 * process. Therefore, it is not necessary to check Faulty bit
63662306a36Sopenharmony_ci	 * of journal device here.
63762306a36Sopenharmony_ci	 *
63862306a36Sopenharmony_ci	 * We can't check split_bio after current_bio is submitted. If
63962306a36Sopenharmony_ci	 * io->split_bio is null, after current_bio is submitted, current_bio
64062306a36Sopenharmony_ci	 * might already be completed and the io_unit is freed. We submit
64162306a36Sopenharmony_ci	 * split_bio first to avoid the issue.
64262306a36Sopenharmony_ci	 */
64362306a36Sopenharmony_ci	if (io->split_bio) {
64462306a36Sopenharmony_ci		if (io->has_flush)
64562306a36Sopenharmony_ci			io->split_bio->bi_opf |= REQ_PREFLUSH;
64662306a36Sopenharmony_ci		if (io->has_fua)
64762306a36Sopenharmony_ci			io->split_bio->bi_opf |= REQ_FUA;
64862306a36Sopenharmony_ci		submit_bio(io->split_bio);
64962306a36Sopenharmony_ci	}
65062306a36Sopenharmony_ci
65162306a36Sopenharmony_ci	if (io->has_flush)
65262306a36Sopenharmony_ci		io->current_bio->bi_opf |= REQ_PREFLUSH;
65362306a36Sopenharmony_ci	if (io->has_fua)
65462306a36Sopenharmony_ci		io->current_bio->bi_opf |= REQ_FUA;
65562306a36Sopenharmony_ci	submit_bio(io->current_bio);
65662306a36Sopenharmony_ci}
65762306a36Sopenharmony_ci
65862306a36Sopenharmony_ci/* deferred io_unit will be dispatched here */
65962306a36Sopenharmony_cistatic void r5l_submit_io_async(struct work_struct *work)
66062306a36Sopenharmony_ci{
66162306a36Sopenharmony_ci	struct r5l_log *log = container_of(work, struct r5l_log,
66262306a36Sopenharmony_ci					   deferred_io_work);
66362306a36Sopenharmony_ci	struct r5l_io_unit *io = NULL;
66462306a36Sopenharmony_ci	unsigned long flags;
66562306a36Sopenharmony_ci
66662306a36Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
66762306a36Sopenharmony_ci	if (!list_empty(&log->running_ios)) {
66862306a36Sopenharmony_ci		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
66962306a36Sopenharmony_ci				      log_sibling);
67062306a36Sopenharmony_ci		if (!io->io_deferred)
67162306a36Sopenharmony_ci			io = NULL;
67262306a36Sopenharmony_ci		else
67362306a36Sopenharmony_ci			io->io_deferred = 0;
67462306a36Sopenharmony_ci	}
67562306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
67662306a36Sopenharmony_ci	if (io)
67762306a36Sopenharmony_ci		r5l_do_submit_io(log, io);
67862306a36Sopenharmony_ci}
67962306a36Sopenharmony_ci
68062306a36Sopenharmony_cistatic void r5c_disable_writeback_async(struct work_struct *work)
68162306a36Sopenharmony_ci{
68262306a36Sopenharmony_ci	struct r5l_log *log = container_of(work, struct r5l_log,
68362306a36Sopenharmony_ci					   disable_writeback_work);
68462306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
68562306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
68662306a36Sopenharmony_ci	int locked = 0;
68762306a36Sopenharmony_ci
68862306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
68962306a36Sopenharmony_ci		return;
69062306a36Sopenharmony_ci	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
69162306a36Sopenharmony_ci		mdname(mddev));
69262306a36Sopenharmony_ci
69362306a36Sopenharmony_ci	/* wait superblock change before suspend */
69462306a36Sopenharmony_ci	wait_event(mddev->sb_wait,
69562306a36Sopenharmony_ci		   conf->log == NULL ||
69662306a36Sopenharmony_ci		   (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
69762306a36Sopenharmony_ci		    (locked = mddev_trylock(mddev))));
69862306a36Sopenharmony_ci	if (locked) {
69962306a36Sopenharmony_ci		mddev_suspend(mddev);
70062306a36Sopenharmony_ci		log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
70162306a36Sopenharmony_ci		mddev_resume(mddev);
70262306a36Sopenharmony_ci		mddev_unlock(mddev);
70362306a36Sopenharmony_ci	}
70462306a36Sopenharmony_ci}
70562306a36Sopenharmony_ci
70662306a36Sopenharmony_cistatic void r5l_submit_current_io(struct r5l_log *log)
70762306a36Sopenharmony_ci{
70862306a36Sopenharmony_ci	struct r5l_io_unit *io = log->current_io;
70962306a36Sopenharmony_ci	struct r5l_meta_block *block;
71062306a36Sopenharmony_ci	unsigned long flags;
71162306a36Sopenharmony_ci	u32 crc;
71262306a36Sopenharmony_ci	bool do_submit = true;
71362306a36Sopenharmony_ci
71462306a36Sopenharmony_ci	if (!io)
71562306a36Sopenharmony_ci		return;
71662306a36Sopenharmony_ci
71762306a36Sopenharmony_ci	block = page_address(io->meta_page);
71862306a36Sopenharmony_ci	block->meta_size = cpu_to_le32(io->meta_offset);
71962306a36Sopenharmony_ci	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
72062306a36Sopenharmony_ci	block->checksum = cpu_to_le32(crc);
72162306a36Sopenharmony_ci
72262306a36Sopenharmony_ci	log->current_io = NULL;
72362306a36Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
72462306a36Sopenharmony_ci	if (io->has_flush || io->has_fua) {
72562306a36Sopenharmony_ci		if (io != list_first_entry(&log->running_ios,
72662306a36Sopenharmony_ci					   struct r5l_io_unit, log_sibling)) {
72762306a36Sopenharmony_ci			io->io_deferred = 1;
72862306a36Sopenharmony_ci			do_submit = false;
72962306a36Sopenharmony_ci		}
73062306a36Sopenharmony_ci	}
73162306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
73262306a36Sopenharmony_ci	if (do_submit)
73362306a36Sopenharmony_ci		r5l_do_submit_io(log, io);
73462306a36Sopenharmony_ci}
73562306a36Sopenharmony_ci
73662306a36Sopenharmony_cistatic struct bio *r5l_bio_alloc(struct r5l_log *log)
73762306a36Sopenharmony_ci{
73862306a36Sopenharmony_ci	struct bio *bio = bio_alloc_bioset(log->rdev->bdev, BIO_MAX_VECS,
73962306a36Sopenharmony_ci					   REQ_OP_WRITE, GFP_NOIO, &log->bs);
74062306a36Sopenharmony_ci
74162306a36Sopenharmony_ci	bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
74262306a36Sopenharmony_ci
74362306a36Sopenharmony_ci	return bio;
74462306a36Sopenharmony_ci}
74562306a36Sopenharmony_ci
74662306a36Sopenharmony_cistatic void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
74762306a36Sopenharmony_ci{
74862306a36Sopenharmony_ci	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
74962306a36Sopenharmony_ci
75062306a36Sopenharmony_ci	r5c_update_log_state(log);
75162306a36Sopenharmony_ci	/*
75262306a36Sopenharmony_ci	 * If we filled up the log device start from the beginning again,
75362306a36Sopenharmony_ci	 * which will require a new bio.
75462306a36Sopenharmony_ci	 *
75562306a36Sopenharmony_ci	 * Note: for this to work properly the log size needs to me a multiple
75662306a36Sopenharmony_ci	 * of BLOCK_SECTORS.
75762306a36Sopenharmony_ci	 */
75862306a36Sopenharmony_ci	if (log->log_start == 0)
75962306a36Sopenharmony_ci		io->need_split_bio = true;
76062306a36Sopenharmony_ci
76162306a36Sopenharmony_ci	io->log_end = log->log_start;
76262306a36Sopenharmony_ci}
76362306a36Sopenharmony_ci
76462306a36Sopenharmony_cistatic struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
76562306a36Sopenharmony_ci{
76662306a36Sopenharmony_ci	struct r5l_io_unit *io;
76762306a36Sopenharmony_ci	struct r5l_meta_block *block;
76862306a36Sopenharmony_ci
76962306a36Sopenharmony_ci	io = mempool_alloc(&log->io_pool, GFP_ATOMIC);
77062306a36Sopenharmony_ci	if (!io)
77162306a36Sopenharmony_ci		return NULL;
77262306a36Sopenharmony_ci	memset(io, 0, sizeof(*io));
77362306a36Sopenharmony_ci
77462306a36Sopenharmony_ci	io->log = log;
77562306a36Sopenharmony_ci	INIT_LIST_HEAD(&io->log_sibling);
77662306a36Sopenharmony_ci	INIT_LIST_HEAD(&io->stripe_list);
77762306a36Sopenharmony_ci	bio_list_init(&io->flush_barriers);
77862306a36Sopenharmony_ci	io->state = IO_UNIT_RUNNING;
77962306a36Sopenharmony_ci
78062306a36Sopenharmony_ci	io->meta_page = mempool_alloc(&log->meta_pool, GFP_NOIO);
78162306a36Sopenharmony_ci	block = page_address(io->meta_page);
78262306a36Sopenharmony_ci	clear_page(block);
78362306a36Sopenharmony_ci	block->magic = cpu_to_le32(R5LOG_MAGIC);
78462306a36Sopenharmony_ci	block->version = R5LOG_VERSION;
78562306a36Sopenharmony_ci	block->seq = cpu_to_le64(log->seq);
78662306a36Sopenharmony_ci	block->position = cpu_to_le64(log->log_start);
78762306a36Sopenharmony_ci
78862306a36Sopenharmony_ci	io->log_start = log->log_start;
78962306a36Sopenharmony_ci	io->meta_offset = sizeof(struct r5l_meta_block);
79062306a36Sopenharmony_ci	io->seq = log->seq++;
79162306a36Sopenharmony_ci
79262306a36Sopenharmony_ci	io->current_bio = r5l_bio_alloc(log);
79362306a36Sopenharmony_ci	io->current_bio->bi_end_io = r5l_log_endio;
79462306a36Sopenharmony_ci	io->current_bio->bi_private = io;
79562306a36Sopenharmony_ci	__bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
79662306a36Sopenharmony_ci
79762306a36Sopenharmony_ci	r5_reserve_log_entry(log, io);
79862306a36Sopenharmony_ci
79962306a36Sopenharmony_ci	spin_lock_irq(&log->io_list_lock);
80062306a36Sopenharmony_ci	list_add_tail(&io->log_sibling, &log->running_ios);
80162306a36Sopenharmony_ci	spin_unlock_irq(&log->io_list_lock);
80262306a36Sopenharmony_ci
80362306a36Sopenharmony_ci	return io;
80462306a36Sopenharmony_ci}
80562306a36Sopenharmony_ci
80662306a36Sopenharmony_cistatic int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
80762306a36Sopenharmony_ci{
80862306a36Sopenharmony_ci	if (log->current_io &&
80962306a36Sopenharmony_ci	    log->current_io->meta_offset + payload_size > PAGE_SIZE)
81062306a36Sopenharmony_ci		r5l_submit_current_io(log);
81162306a36Sopenharmony_ci
81262306a36Sopenharmony_ci	if (!log->current_io) {
81362306a36Sopenharmony_ci		log->current_io = r5l_new_meta(log);
81462306a36Sopenharmony_ci		if (!log->current_io)
81562306a36Sopenharmony_ci			return -ENOMEM;
81662306a36Sopenharmony_ci	}
81762306a36Sopenharmony_ci
81862306a36Sopenharmony_ci	return 0;
81962306a36Sopenharmony_ci}
82062306a36Sopenharmony_ci
82162306a36Sopenharmony_cistatic void r5l_append_payload_meta(struct r5l_log *log, u16 type,
82262306a36Sopenharmony_ci				    sector_t location,
82362306a36Sopenharmony_ci				    u32 checksum1, u32 checksum2,
82462306a36Sopenharmony_ci				    bool checksum2_valid)
82562306a36Sopenharmony_ci{
82662306a36Sopenharmony_ci	struct r5l_io_unit *io = log->current_io;
82762306a36Sopenharmony_ci	struct r5l_payload_data_parity *payload;
82862306a36Sopenharmony_ci
82962306a36Sopenharmony_ci	payload = page_address(io->meta_page) + io->meta_offset;
83062306a36Sopenharmony_ci	payload->header.type = cpu_to_le16(type);
83162306a36Sopenharmony_ci	payload->header.flags = cpu_to_le16(0);
83262306a36Sopenharmony_ci	payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
83362306a36Sopenharmony_ci				    (PAGE_SHIFT - 9));
83462306a36Sopenharmony_ci	payload->location = cpu_to_le64(location);
83562306a36Sopenharmony_ci	payload->checksum[0] = cpu_to_le32(checksum1);
83662306a36Sopenharmony_ci	if (checksum2_valid)
83762306a36Sopenharmony_ci		payload->checksum[1] = cpu_to_le32(checksum2);
83862306a36Sopenharmony_ci
83962306a36Sopenharmony_ci	io->meta_offset += sizeof(struct r5l_payload_data_parity) +
84062306a36Sopenharmony_ci		sizeof(__le32) * (1 + !!checksum2_valid);
84162306a36Sopenharmony_ci}
84262306a36Sopenharmony_ci
84362306a36Sopenharmony_cistatic void r5l_append_payload_page(struct r5l_log *log, struct page *page)
84462306a36Sopenharmony_ci{
84562306a36Sopenharmony_ci	struct r5l_io_unit *io = log->current_io;
84662306a36Sopenharmony_ci
84762306a36Sopenharmony_ci	if (io->need_split_bio) {
84862306a36Sopenharmony_ci		BUG_ON(io->split_bio);
84962306a36Sopenharmony_ci		io->split_bio = io->current_bio;
85062306a36Sopenharmony_ci		io->current_bio = r5l_bio_alloc(log);
85162306a36Sopenharmony_ci		bio_chain(io->current_bio, io->split_bio);
85262306a36Sopenharmony_ci		io->need_split_bio = false;
85362306a36Sopenharmony_ci	}
85462306a36Sopenharmony_ci
85562306a36Sopenharmony_ci	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
85662306a36Sopenharmony_ci		BUG();
85762306a36Sopenharmony_ci
85862306a36Sopenharmony_ci	r5_reserve_log_entry(log, io);
85962306a36Sopenharmony_ci}
86062306a36Sopenharmony_ci
86162306a36Sopenharmony_cistatic void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
86262306a36Sopenharmony_ci{
86362306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
86462306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
86562306a36Sopenharmony_ci	struct r5l_io_unit *io;
86662306a36Sopenharmony_ci	struct r5l_payload_flush *payload;
86762306a36Sopenharmony_ci	int meta_size;
86862306a36Sopenharmony_ci
86962306a36Sopenharmony_ci	/*
87062306a36Sopenharmony_ci	 * payload_flush requires extra writes to the journal.
87162306a36Sopenharmony_ci	 * To avoid handling the extra IO in quiesce, just skip
87262306a36Sopenharmony_ci	 * flush_payload
87362306a36Sopenharmony_ci	 */
87462306a36Sopenharmony_ci	if (conf->quiesce)
87562306a36Sopenharmony_ci		return;
87662306a36Sopenharmony_ci
87762306a36Sopenharmony_ci	mutex_lock(&log->io_mutex);
87862306a36Sopenharmony_ci	meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
87962306a36Sopenharmony_ci
88062306a36Sopenharmony_ci	if (r5l_get_meta(log, meta_size)) {
88162306a36Sopenharmony_ci		mutex_unlock(&log->io_mutex);
88262306a36Sopenharmony_ci		return;
88362306a36Sopenharmony_ci	}
88462306a36Sopenharmony_ci
88562306a36Sopenharmony_ci	/* current implementation is one stripe per flush payload */
88662306a36Sopenharmony_ci	io = log->current_io;
88762306a36Sopenharmony_ci	payload = page_address(io->meta_page) + io->meta_offset;
88862306a36Sopenharmony_ci	payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
88962306a36Sopenharmony_ci	payload->header.flags = cpu_to_le16(0);
89062306a36Sopenharmony_ci	payload->size = cpu_to_le32(sizeof(__le64));
89162306a36Sopenharmony_ci	payload->flush_stripes[0] = cpu_to_le64(sect);
89262306a36Sopenharmony_ci	io->meta_offset += meta_size;
89362306a36Sopenharmony_ci	/* multiple flush payloads count as one pending_stripe */
89462306a36Sopenharmony_ci	if (!io->has_flush_payload) {
89562306a36Sopenharmony_ci		io->has_flush_payload = 1;
89662306a36Sopenharmony_ci		atomic_inc(&io->pending_stripe);
89762306a36Sopenharmony_ci	}
89862306a36Sopenharmony_ci	mutex_unlock(&log->io_mutex);
89962306a36Sopenharmony_ci}
90062306a36Sopenharmony_ci
90162306a36Sopenharmony_cistatic int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
90262306a36Sopenharmony_ci			   int data_pages, int parity_pages)
90362306a36Sopenharmony_ci{
90462306a36Sopenharmony_ci	int i;
90562306a36Sopenharmony_ci	int meta_size;
90662306a36Sopenharmony_ci	int ret;
90762306a36Sopenharmony_ci	struct r5l_io_unit *io;
90862306a36Sopenharmony_ci
90962306a36Sopenharmony_ci	meta_size =
91062306a36Sopenharmony_ci		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
91162306a36Sopenharmony_ci		 * data_pages) +
91262306a36Sopenharmony_ci		sizeof(struct r5l_payload_data_parity) +
91362306a36Sopenharmony_ci		sizeof(__le32) * parity_pages;
91462306a36Sopenharmony_ci
91562306a36Sopenharmony_ci	ret = r5l_get_meta(log, meta_size);
91662306a36Sopenharmony_ci	if (ret)
91762306a36Sopenharmony_ci		return ret;
91862306a36Sopenharmony_ci
91962306a36Sopenharmony_ci	io = log->current_io;
92062306a36Sopenharmony_ci
92162306a36Sopenharmony_ci	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
92262306a36Sopenharmony_ci		io->has_flush = 1;
92362306a36Sopenharmony_ci
92462306a36Sopenharmony_ci	for (i = 0; i < sh->disks; i++) {
92562306a36Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
92662306a36Sopenharmony_ci		    test_bit(R5_InJournal, &sh->dev[i].flags))
92762306a36Sopenharmony_ci			continue;
92862306a36Sopenharmony_ci		if (i == sh->pd_idx || i == sh->qd_idx)
92962306a36Sopenharmony_ci			continue;
93062306a36Sopenharmony_ci		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
93162306a36Sopenharmony_ci		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
93262306a36Sopenharmony_ci			io->has_fua = 1;
93362306a36Sopenharmony_ci			/*
93462306a36Sopenharmony_ci			 * we need to flush journal to make sure recovery can
93562306a36Sopenharmony_ci			 * reach the data with fua flag
93662306a36Sopenharmony_ci			 */
93762306a36Sopenharmony_ci			io->has_flush = 1;
93862306a36Sopenharmony_ci		}
93962306a36Sopenharmony_ci		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
94062306a36Sopenharmony_ci					raid5_compute_blocknr(sh, i, 0),
94162306a36Sopenharmony_ci					sh->dev[i].log_checksum, 0, false);
94262306a36Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[i].page);
94362306a36Sopenharmony_ci	}
94462306a36Sopenharmony_ci
94562306a36Sopenharmony_ci	if (parity_pages == 2) {
94662306a36Sopenharmony_ci		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
94762306a36Sopenharmony_ci					sh->sector, sh->dev[sh->pd_idx].log_checksum,
94862306a36Sopenharmony_ci					sh->dev[sh->qd_idx].log_checksum, true);
94962306a36Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
95062306a36Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
95162306a36Sopenharmony_ci	} else if (parity_pages == 1) {
95262306a36Sopenharmony_ci		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
95362306a36Sopenharmony_ci					sh->sector, sh->dev[sh->pd_idx].log_checksum,
95462306a36Sopenharmony_ci					0, false);
95562306a36Sopenharmony_ci		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
95662306a36Sopenharmony_ci	} else  /* Just writing data, not parity, in caching phase */
95762306a36Sopenharmony_ci		BUG_ON(parity_pages != 0);
95862306a36Sopenharmony_ci
95962306a36Sopenharmony_ci	list_add_tail(&sh->log_list, &io->stripe_list);
96062306a36Sopenharmony_ci	atomic_inc(&io->pending_stripe);
96162306a36Sopenharmony_ci	sh->log_io = io;
96262306a36Sopenharmony_ci
96362306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
96462306a36Sopenharmony_ci		return 0;
96562306a36Sopenharmony_ci
96662306a36Sopenharmony_ci	if (sh->log_start == MaxSector) {
96762306a36Sopenharmony_ci		BUG_ON(!list_empty(&sh->r5c));
96862306a36Sopenharmony_ci		sh->log_start = io->log_start;
96962306a36Sopenharmony_ci		spin_lock_irq(&log->stripe_in_journal_lock);
97062306a36Sopenharmony_ci		list_add_tail(&sh->r5c,
97162306a36Sopenharmony_ci			      &log->stripe_in_journal_list);
97262306a36Sopenharmony_ci		spin_unlock_irq(&log->stripe_in_journal_lock);
97362306a36Sopenharmony_ci		atomic_inc(&log->stripe_in_journal_count);
97462306a36Sopenharmony_ci	}
97562306a36Sopenharmony_ci	return 0;
97662306a36Sopenharmony_ci}
97762306a36Sopenharmony_ci
97862306a36Sopenharmony_ci/* add stripe to no_space_stripes, and then wake up reclaim */
97962306a36Sopenharmony_cistatic inline void r5l_add_no_space_stripe(struct r5l_log *log,
98062306a36Sopenharmony_ci					   struct stripe_head *sh)
98162306a36Sopenharmony_ci{
98262306a36Sopenharmony_ci	spin_lock(&log->no_space_stripes_lock);
98362306a36Sopenharmony_ci	list_add_tail(&sh->log_list, &log->no_space_stripes);
98462306a36Sopenharmony_ci	spin_unlock(&log->no_space_stripes_lock);
98562306a36Sopenharmony_ci}
98662306a36Sopenharmony_ci
98762306a36Sopenharmony_ci/*
98862306a36Sopenharmony_ci * running in raid5d, where reclaim could wait for raid5d too (when it flushes
98962306a36Sopenharmony_ci * data from log to raid disks), so we shouldn't wait for reclaim here
99062306a36Sopenharmony_ci */
99162306a36Sopenharmony_ciint r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
99262306a36Sopenharmony_ci{
99362306a36Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
99462306a36Sopenharmony_ci	int write_disks = 0;
99562306a36Sopenharmony_ci	int data_pages, parity_pages;
99662306a36Sopenharmony_ci	int reserve;
99762306a36Sopenharmony_ci	int i;
99862306a36Sopenharmony_ci	int ret = 0;
99962306a36Sopenharmony_ci	bool wake_reclaim = false;
100062306a36Sopenharmony_ci
100162306a36Sopenharmony_ci	if (!log)
100262306a36Sopenharmony_ci		return -EAGAIN;
100362306a36Sopenharmony_ci	/* Don't support stripe batch */
100462306a36Sopenharmony_ci	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
100562306a36Sopenharmony_ci	    test_bit(STRIPE_SYNCING, &sh->state)) {
100662306a36Sopenharmony_ci		/* the stripe is written to log, we start writing it to raid */
100762306a36Sopenharmony_ci		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
100862306a36Sopenharmony_ci		return -EAGAIN;
100962306a36Sopenharmony_ci	}
101062306a36Sopenharmony_ci
101162306a36Sopenharmony_ci	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
101262306a36Sopenharmony_ci
101362306a36Sopenharmony_ci	for (i = 0; i < sh->disks; i++) {
101462306a36Sopenharmony_ci		void *addr;
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
101762306a36Sopenharmony_ci		    test_bit(R5_InJournal, &sh->dev[i].flags))
101862306a36Sopenharmony_ci			continue;
101962306a36Sopenharmony_ci
102062306a36Sopenharmony_ci		write_disks++;
102162306a36Sopenharmony_ci		/* checksum is already calculated in last run */
102262306a36Sopenharmony_ci		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
102362306a36Sopenharmony_ci			continue;
102462306a36Sopenharmony_ci		addr = kmap_atomic(sh->dev[i].page);
102562306a36Sopenharmony_ci		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
102662306a36Sopenharmony_ci						    addr, PAGE_SIZE);
102762306a36Sopenharmony_ci		kunmap_atomic(addr);
102862306a36Sopenharmony_ci	}
102962306a36Sopenharmony_ci	parity_pages = 1 + !!(sh->qd_idx >= 0);
103062306a36Sopenharmony_ci	data_pages = write_disks - parity_pages;
103162306a36Sopenharmony_ci
103262306a36Sopenharmony_ci	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
103362306a36Sopenharmony_ci	/*
103462306a36Sopenharmony_ci	 * The stripe must enter state machine again to finish the write, so
103562306a36Sopenharmony_ci	 * don't delay.
103662306a36Sopenharmony_ci	 */
103762306a36Sopenharmony_ci	clear_bit(STRIPE_DELAYED, &sh->state);
103862306a36Sopenharmony_ci	atomic_inc(&sh->count);
103962306a36Sopenharmony_ci
104062306a36Sopenharmony_ci	mutex_lock(&log->io_mutex);
104162306a36Sopenharmony_ci	/* meta + data */
104262306a36Sopenharmony_ci	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
104362306a36Sopenharmony_ci
104462306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
104562306a36Sopenharmony_ci		if (!r5l_has_free_space(log, reserve)) {
104662306a36Sopenharmony_ci			r5l_add_no_space_stripe(log, sh);
104762306a36Sopenharmony_ci			wake_reclaim = true;
104862306a36Sopenharmony_ci		} else {
104962306a36Sopenharmony_ci			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
105062306a36Sopenharmony_ci			if (ret) {
105162306a36Sopenharmony_ci				spin_lock_irq(&log->io_list_lock);
105262306a36Sopenharmony_ci				list_add_tail(&sh->log_list,
105362306a36Sopenharmony_ci					      &log->no_mem_stripes);
105462306a36Sopenharmony_ci				spin_unlock_irq(&log->io_list_lock);
105562306a36Sopenharmony_ci			}
105662306a36Sopenharmony_ci		}
105762306a36Sopenharmony_ci	} else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
105862306a36Sopenharmony_ci		/*
105962306a36Sopenharmony_ci		 * log space critical, do not process stripes that are
106062306a36Sopenharmony_ci		 * not in cache yet (sh->log_start == MaxSector).
106162306a36Sopenharmony_ci		 */
106262306a36Sopenharmony_ci		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
106362306a36Sopenharmony_ci		    sh->log_start == MaxSector) {
106462306a36Sopenharmony_ci			r5l_add_no_space_stripe(log, sh);
106562306a36Sopenharmony_ci			wake_reclaim = true;
106662306a36Sopenharmony_ci			reserve = 0;
106762306a36Sopenharmony_ci		} else if (!r5l_has_free_space(log, reserve)) {
106862306a36Sopenharmony_ci			if (sh->log_start == log->last_checkpoint)
106962306a36Sopenharmony_ci				BUG();
107062306a36Sopenharmony_ci			else
107162306a36Sopenharmony_ci				r5l_add_no_space_stripe(log, sh);
107262306a36Sopenharmony_ci		} else {
107362306a36Sopenharmony_ci			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
107462306a36Sopenharmony_ci			if (ret) {
107562306a36Sopenharmony_ci				spin_lock_irq(&log->io_list_lock);
107662306a36Sopenharmony_ci				list_add_tail(&sh->log_list,
107762306a36Sopenharmony_ci					      &log->no_mem_stripes);
107862306a36Sopenharmony_ci				spin_unlock_irq(&log->io_list_lock);
107962306a36Sopenharmony_ci			}
108062306a36Sopenharmony_ci		}
108162306a36Sopenharmony_ci	}
108262306a36Sopenharmony_ci
108362306a36Sopenharmony_ci	mutex_unlock(&log->io_mutex);
108462306a36Sopenharmony_ci	if (wake_reclaim)
108562306a36Sopenharmony_ci		r5l_wake_reclaim(log, reserve);
108662306a36Sopenharmony_ci	return 0;
108762306a36Sopenharmony_ci}
108862306a36Sopenharmony_ci
108962306a36Sopenharmony_civoid r5l_write_stripe_run(struct r5l_log *log)
109062306a36Sopenharmony_ci{
109162306a36Sopenharmony_ci	if (!log)
109262306a36Sopenharmony_ci		return;
109362306a36Sopenharmony_ci	mutex_lock(&log->io_mutex);
109462306a36Sopenharmony_ci	r5l_submit_current_io(log);
109562306a36Sopenharmony_ci	mutex_unlock(&log->io_mutex);
109662306a36Sopenharmony_ci}
109762306a36Sopenharmony_ci
109862306a36Sopenharmony_ciint r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
109962306a36Sopenharmony_ci{
110062306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
110162306a36Sopenharmony_ci		/*
110262306a36Sopenharmony_ci		 * in write through (journal only)
110362306a36Sopenharmony_ci		 * we flush log disk cache first, then write stripe data to
110462306a36Sopenharmony_ci		 * raid disks. So if bio is finished, the log disk cache is
110562306a36Sopenharmony_ci		 * flushed already. The recovery guarantees we can recovery
110662306a36Sopenharmony_ci		 * the bio from log disk, so we don't need to flush again
110762306a36Sopenharmony_ci		 */
110862306a36Sopenharmony_ci		if (bio->bi_iter.bi_size == 0) {
110962306a36Sopenharmony_ci			bio_endio(bio);
111062306a36Sopenharmony_ci			return 0;
111162306a36Sopenharmony_ci		}
111262306a36Sopenharmony_ci		bio->bi_opf &= ~REQ_PREFLUSH;
111362306a36Sopenharmony_ci	} else {
111462306a36Sopenharmony_ci		/* write back (with cache) */
111562306a36Sopenharmony_ci		if (bio->bi_iter.bi_size == 0) {
111662306a36Sopenharmony_ci			mutex_lock(&log->io_mutex);
111762306a36Sopenharmony_ci			r5l_get_meta(log, 0);
111862306a36Sopenharmony_ci			bio_list_add(&log->current_io->flush_barriers, bio);
111962306a36Sopenharmony_ci			log->current_io->has_flush = 1;
112062306a36Sopenharmony_ci			log->current_io->has_null_flush = 1;
112162306a36Sopenharmony_ci			atomic_inc(&log->current_io->pending_stripe);
112262306a36Sopenharmony_ci			r5l_submit_current_io(log);
112362306a36Sopenharmony_ci			mutex_unlock(&log->io_mutex);
112462306a36Sopenharmony_ci			return 0;
112562306a36Sopenharmony_ci		}
112662306a36Sopenharmony_ci	}
112762306a36Sopenharmony_ci	return -EAGAIN;
112862306a36Sopenharmony_ci}
112962306a36Sopenharmony_ci
113062306a36Sopenharmony_ci/* This will run after log space is reclaimed */
113162306a36Sopenharmony_cistatic void r5l_run_no_space_stripes(struct r5l_log *log)
113262306a36Sopenharmony_ci{
113362306a36Sopenharmony_ci	struct stripe_head *sh;
113462306a36Sopenharmony_ci
113562306a36Sopenharmony_ci	spin_lock(&log->no_space_stripes_lock);
113662306a36Sopenharmony_ci	while (!list_empty(&log->no_space_stripes)) {
113762306a36Sopenharmony_ci		sh = list_first_entry(&log->no_space_stripes,
113862306a36Sopenharmony_ci				      struct stripe_head, log_list);
113962306a36Sopenharmony_ci		list_del_init(&sh->log_list);
114062306a36Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
114162306a36Sopenharmony_ci		raid5_release_stripe(sh);
114262306a36Sopenharmony_ci	}
114362306a36Sopenharmony_ci	spin_unlock(&log->no_space_stripes_lock);
114462306a36Sopenharmony_ci}
114562306a36Sopenharmony_ci
114662306a36Sopenharmony_ci/*
114762306a36Sopenharmony_ci * calculate new last_checkpoint
114862306a36Sopenharmony_ci * for write through mode, returns log->next_checkpoint
114962306a36Sopenharmony_ci * for write back, returns log_start of first sh in stripe_in_journal_list
115062306a36Sopenharmony_ci */
115162306a36Sopenharmony_cistatic sector_t r5c_calculate_new_cp(struct r5conf *conf)
115262306a36Sopenharmony_ci{
115362306a36Sopenharmony_ci	struct stripe_head *sh;
115462306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
115562306a36Sopenharmony_ci	sector_t new_cp;
115662306a36Sopenharmony_ci	unsigned long flags;
115762306a36Sopenharmony_ci
115862306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
115962306a36Sopenharmony_ci		return log->next_checkpoint;
116062306a36Sopenharmony_ci
116162306a36Sopenharmony_ci	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
116262306a36Sopenharmony_ci	if (list_empty(&conf->log->stripe_in_journal_list)) {
116362306a36Sopenharmony_ci		/* all stripes flushed */
116462306a36Sopenharmony_ci		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
116562306a36Sopenharmony_ci		return log->next_checkpoint;
116662306a36Sopenharmony_ci	}
116762306a36Sopenharmony_ci	sh = list_first_entry(&conf->log->stripe_in_journal_list,
116862306a36Sopenharmony_ci			      struct stripe_head, r5c);
116962306a36Sopenharmony_ci	new_cp = sh->log_start;
117062306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
117162306a36Sopenharmony_ci	return new_cp;
117262306a36Sopenharmony_ci}
117362306a36Sopenharmony_ci
117462306a36Sopenharmony_cistatic sector_t r5l_reclaimable_space(struct r5l_log *log)
117562306a36Sopenharmony_ci{
117662306a36Sopenharmony_ci	struct r5conf *conf = log->rdev->mddev->private;
117762306a36Sopenharmony_ci
117862306a36Sopenharmony_ci	return r5l_ring_distance(log, log->last_checkpoint,
117962306a36Sopenharmony_ci				 r5c_calculate_new_cp(conf));
118062306a36Sopenharmony_ci}
118162306a36Sopenharmony_ci
118262306a36Sopenharmony_cistatic void r5l_run_no_mem_stripe(struct r5l_log *log)
118362306a36Sopenharmony_ci{
118462306a36Sopenharmony_ci	struct stripe_head *sh;
118562306a36Sopenharmony_ci
118662306a36Sopenharmony_ci	lockdep_assert_held(&log->io_list_lock);
118762306a36Sopenharmony_ci
118862306a36Sopenharmony_ci	if (!list_empty(&log->no_mem_stripes)) {
118962306a36Sopenharmony_ci		sh = list_first_entry(&log->no_mem_stripes,
119062306a36Sopenharmony_ci				      struct stripe_head, log_list);
119162306a36Sopenharmony_ci		list_del_init(&sh->log_list);
119262306a36Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
119362306a36Sopenharmony_ci		raid5_release_stripe(sh);
119462306a36Sopenharmony_ci	}
119562306a36Sopenharmony_ci}
119662306a36Sopenharmony_ci
119762306a36Sopenharmony_cistatic bool r5l_complete_finished_ios(struct r5l_log *log)
119862306a36Sopenharmony_ci{
119962306a36Sopenharmony_ci	struct r5l_io_unit *io, *next;
120062306a36Sopenharmony_ci	bool found = false;
120162306a36Sopenharmony_ci
120262306a36Sopenharmony_ci	lockdep_assert_held(&log->io_list_lock);
120362306a36Sopenharmony_ci
120462306a36Sopenharmony_ci	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
120562306a36Sopenharmony_ci		/* don't change list order */
120662306a36Sopenharmony_ci		if (io->state < IO_UNIT_STRIPE_END)
120762306a36Sopenharmony_ci			break;
120862306a36Sopenharmony_ci
120962306a36Sopenharmony_ci		log->next_checkpoint = io->log_start;
121062306a36Sopenharmony_ci
121162306a36Sopenharmony_ci		list_del(&io->log_sibling);
121262306a36Sopenharmony_ci		mempool_free(io, &log->io_pool);
121362306a36Sopenharmony_ci		r5l_run_no_mem_stripe(log);
121462306a36Sopenharmony_ci
121562306a36Sopenharmony_ci		found = true;
121662306a36Sopenharmony_ci	}
121762306a36Sopenharmony_ci
121862306a36Sopenharmony_ci	return found;
121962306a36Sopenharmony_ci}
122062306a36Sopenharmony_ci
122162306a36Sopenharmony_cistatic void __r5l_stripe_write_finished(struct r5l_io_unit *io)
122262306a36Sopenharmony_ci{
122362306a36Sopenharmony_ci	struct r5l_log *log = io->log;
122462306a36Sopenharmony_ci	struct r5conf *conf = log->rdev->mddev->private;
122562306a36Sopenharmony_ci	unsigned long flags;
122662306a36Sopenharmony_ci
122762306a36Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
122862306a36Sopenharmony_ci	__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
122962306a36Sopenharmony_ci
123062306a36Sopenharmony_ci	if (!r5l_complete_finished_ios(log)) {
123162306a36Sopenharmony_ci		spin_unlock_irqrestore(&log->io_list_lock, flags);
123262306a36Sopenharmony_ci		return;
123362306a36Sopenharmony_ci	}
123462306a36Sopenharmony_ci
123562306a36Sopenharmony_ci	if (r5l_reclaimable_space(log) > log->max_free_space ||
123662306a36Sopenharmony_ci	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
123762306a36Sopenharmony_ci		r5l_wake_reclaim(log, 0);
123862306a36Sopenharmony_ci
123962306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
124062306a36Sopenharmony_ci	wake_up(&log->iounit_wait);
124162306a36Sopenharmony_ci}
124262306a36Sopenharmony_ci
124362306a36Sopenharmony_civoid r5l_stripe_write_finished(struct stripe_head *sh)
124462306a36Sopenharmony_ci{
124562306a36Sopenharmony_ci	struct r5l_io_unit *io;
124662306a36Sopenharmony_ci
124762306a36Sopenharmony_ci	io = sh->log_io;
124862306a36Sopenharmony_ci	sh->log_io = NULL;
124962306a36Sopenharmony_ci
125062306a36Sopenharmony_ci	if (io && atomic_dec_and_test(&io->pending_stripe))
125162306a36Sopenharmony_ci		__r5l_stripe_write_finished(io);
125262306a36Sopenharmony_ci}
125362306a36Sopenharmony_ci
125462306a36Sopenharmony_cistatic void r5l_log_flush_endio(struct bio *bio)
125562306a36Sopenharmony_ci{
125662306a36Sopenharmony_ci	struct r5l_log *log = container_of(bio, struct r5l_log,
125762306a36Sopenharmony_ci		flush_bio);
125862306a36Sopenharmony_ci	unsigned long flags;
125962306a36Sopenharmony_ci	struct r5l_io_unit *io;
126062306a36Sopenharmony_ci
126162306a36Sopenharmony_ci	if (bio->bi_status)
126262306a36Sopenharmony_ci		md_error(log->rdev->mddev, log->rdev);
126362306a36Sopenharmony_ci	bio_uninit(bio);
126462306a36Sopenharmony_ci
126562306a36Sopenharmony_ci	spin_lock_irqsave(&log->io_list_lock, flags);
126662306a36Sopenharmony_ci	list_for_each_entry(io, &log->flushing_ios, log_sibling)
126762306a36Sopenharmony_ci		r5l_io_run_stripes(io);
126862306a36Sopenharmony_ci	list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
126962306a36Sopenharmony_ci	spin_unlock_irqrestore(&log->io_list_lock, flags);
127062306a36Sopenharmony_ci}
127162306a36Sopenharmony_ci
127262306a36Sopenharmony_ci/*
127362306a36Sopenharmony_ci * Starting dispatch IO to raid.
127462306a36Sopenharmony_ci * io_unit(meta) consists of a log. There is one situation we want to avoid. A
127562306a36Sopenharmony_ci * broken meta in the middle of a log causes recovery can't find meta at the
127662306a36Sopenharmony_ci * head of log. If operations require meta at the head persistent in log, we
127762306a36Sopenharmony_ci * must make sure meta before it persistent in log too. A case is:
127862306a36Sopenharmony_ci *
127962306a36Sopenharmony_ci * stripe data/parity is in log, we start write stripe to raid disks. stripe
128062306a36Sopenharmony_ci * data/parity must be persistent in log before we do the write to raid disks.
128162306a36Sopenharmony_ci *
128262306a36Sopenharmony_ci * The solution is we restrictly maintain io_unit list order. In this case, we
128362306a36Sopenharmony_ci * only write stripes of an io_unit to raid disks till the io_unit is the first
128462306a36Sopenharmony_ci * one whose data/parity is in log.
128562306a36Sopenharmony_ci */
128662306a36Sopenharmony_civoid r5l_flush_stripe_to_raid(struct r5l_log *log)
128762306a36Sopenharmony_ci{
128862306a36Sopenharmony_ci	bool do_flush;
128962306a36Sopenharmony_ci
129062306a36Sopenharmony_ci	if (!log || !log->need_cache_flush)
129162306a36Sopenharmony_ci		return;
129262306a36Sopenharmony_ci
129362306a36Sopenharmony_ci	spin_lock_irq(&log->io_list_lock);
129462306a36Sopenharmony_ci	/* flush bio is running */
129562306a36Sopenharmony_ci	if (!list_empty(&log->flushing_ios)) {
129662306a36Sopenharmony_ci		spin_unlock_irq(&log->io_list_lock);
129762306a36Sopenharmony_ci		return;
129862306a36Sopenharmony_ci	}
129962306a36Sopenharmony_ci	list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
130062306a36Sopenharmony_ci	do_flush = !list_empty(&log->flushing_ios);
130162306a36Sopenharmony_ci	spin_unlock_irq(&log->io_list_lock);
130262306a36Sopenharmony_ci
130362306a36Sopenharmony_ci	if (!do_flush)
130462306a36Sopenharmony_ci		return;
130562306a36Sopenharmony_ci	bio_init(&log->flush_bio, log->rdev->bdev, NULL, 0,
130662306a36Sopenharmony_ci		  REQ_OP_WRITE | REQ_PREFLUSH);
130762306a36Sopenharmony_ci	log->flush_bio.bi_end_io = r5l_log_flush_endio;
130862306a36Sopenharmony_ci	submit_bio(&log->flush_bio);
130962306a36Sopenharmony_ci}
131062306a36Sopenharmony_ci
131162306a36Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp);
131262306a36Sopenharmony_cistatic void r5l_write_super_and_discard_space(struct r5l_log *log,
131362306a36Sopenharmony_ci	sector_t end)
131462306a36Sopenharmony_ci{
131562306a36Sopenharmony_ci	struct block_device *bdev = log->rdev->bdev;
131662306a36Sopenharmony_ci	struct mddev *mddev;
131762306a36Sopenharmony_ci
131862306a36Sopenharmony_ci	r5l_write_super(log, end);
131962306a36Sopenharmony_ci
132062306a36Sopenharmony_ci	if (!bdev_max_discard_sectors(bdev))
132162306a36Sopenharmony_ci		return;
132262306a36Sopenharmony_ci
132362306a36Sopenharmony_ci	mddev = log->rdev->mddev;
132462306a36Sopenharmony_ci	/*
132562306a36Sopenharmony_ci	 * Discard could zero data, so before discard we must make sure
132662306a36Sopenharmony_ci	 * superblock is updated to new log tail. Updating superblock (either
132762306a36Sopenharmony_ci	 * directly call md_update_sb() or depend on md thread) must hold
132862306a36Sopenharmony_ci	 * reconfig mutex. On the other hand, raid5_quiesce is called with
132962306a36Sopenharmony_ci	 * reconfig_mutex hold. The first step of raid5_quiesce() is waiting
133062306a36Sopenharmony_ci	 * for all IO finish, hence waiting for reclaim thread, while reclaim
133162306a36Sopenharmony_ci	 * thread is calling this function and waiting for reconfig mutex. So
133262306a36Sopenharmony_ci	 * there is a deadlock. We workaround this issue with a trylock.
133362306a36Sopenharmony_ci	 * FIXME: we could miss discard if we can't take reconfig mutex
133462306a36Sopenharmony_ci	 */
133562306a36Sopenharmony_ci	set_mask_bits(&mddev->sb_flags, 0,
133662306a36Sopenharmony_ci		BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
133762306a36Sopenharmony_ci	if (!mddev_trylock(mddev))
133862306a36Sopenharmony_ci		return;
133962306a36Sopenharmony_ci	md_update_sb(mddev, 1);
134062306a36Sopenharmony_ci	mddev_unlock(mddev);
134162306a36Sopenharmony_ci
134262306a36Sopenharmony_ci	/* discard IO error really doesn't matter, ignore it */
134362306a36Sopenharmony_ci	if (log->last_checkpoint < end) {
134462306a36Sopenharmony_ci		blkdev_issue_discard(bdev,
134562306a36Sopenharmony_ci				log->last_checkpoint + log->rdev->data_offset,
134662306a36Sopenharmony_ci				end - log->last_checkpoint, GFP_NOIO);
134762306a36Sopenharmony_ci	} else {
134862306a36Sopenharmony_ci		blkdev_issue_discard(bdev,
134962306a36Sopenharmony_ci				log->last_checkpoint + log->rdev->data_offset,
135062306a36Sopenharmony_ci				log->device_size - log->last_checkpoint,
135162306a36Sopenharmony_ci				GFP_NOIO);
135262306a36Sopenharmony_ci		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
135362306a36Sopenharmony_ci				GFP_NOIO);
135462306a36Sopenharmony_ci	}
135562306a36Sopenharmony_ci}
135662306a36Sopenharmony_ci
135762306a36Sopenharmony_ci/*
135862306a36Sopenharmony_ci * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
135962306a36Sopenharmony_ci * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
136062306a36Sopenharmony_ci *
136162306a36Sopenharmony_ci * must hold conf->device_lock
136262306a36Sopenharmony_ci */
136362306a36Sopenharmony_cistatic void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
136462306a36Sopenharmony_ci{
136562306a36Sopenharmony_ci	BUG_ON(list_empty(&sh->lru));
136662306a36Sopenharmony_ci	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
136762306a36Sopenharmony_ci	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
136862306a36Sopenharmony_ci
136962306a36Sopenharmony_ci	/*
137062306a36Sopenharmony_ci	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
137162306a36Sopenharmony_ci	 * raid5_release_stripe() while holding conf->device_lock
137262306a36Sopenharmony_ci	 */
137362306a36Sopenharmony_ci	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
137462306a36Sopenharmony_ci	lockdep_assert_held(&conf->device_lock);
137562306a36Sopenharmony_ci
137662306a36Sopenharmony_ci	list_del_init(&sh->lru);
137762306a36Sopenharmony_ci	atomic_inc(&sh->count);
137862306a36Sopenharmony_ci
137962306a36Sopenharmony_ci	set_bit(STRIPE_HANDLE, &sh->state);
138062306a36Sopenharmony_ci	atomic_inc(&conf->active_stripes);
138162306a36Sopenharmony_ci	r5c_make_stripe_write_out(sh);
138262306a36Sopenharmony_ci
138362306a36Sopenharmony_ci	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
138462306a36Sopenharmony_ci		atomic_inc(&conf->r5c_flushing_partial_stripes);
138562306a36Sopenharmony_ci	else
138662306a36Sopenharmony_ci		atomic_inc(&conf->r5c_flushing_full_stripes);
138762306a36Sopenharmony_ci	raid5_release_stripe(sh);
138862306a36Sopenharmony_ci}
138962306a36Sopenharmony_ci
139062306a36Sopenharmony_ci/*
139162306a36Sopenharmony_ci * if num == 0, flush all full stripes
139262306a36Sopenharmony_ci * if num > 0, flush all full stripes. If less than num full stripes are
139362306a36Sopenharmony_ci *             flushed, flush some partial stripes until totally num stripes are
139462306a36Sopenharmony_ci *             flushed or there is no more cached stripes.
139562306a36Sopenharmony_ci */
139662306a36Sopenharmony_civoid r5c_flush_cache(struct r5conf *conf, int num)
139762306a36Sopenharmony_ci{
139862306a36Sopenharmony_ci	int count;
139962306a36Sopenharmony_ci	struct stripe_head *sh, *next;
140062306a36Sopenharmony_ci
140162306a36Sopenharmony_ci	lockdep_assert_held(&conf->device_lock);
140262306a36Sopenharmony_ci	if (!conf->log)
140362306a36Sopenharmony_ci		return;
140462306a36Sopenharmony_ci
140562306a36Sopenharmony_ci	count = 0;
140662306a36Sopenharmony_ci	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
140762306a36Sopenharmony_ci		r5c_flush_stripe(conf, sh);
140862306a36Sopenharmony_ci		count++;
140962306a36Sopenharmony_ci	}
141062306a36Sopenharmony_ci
141162306a36Sopenharmony_ci	if (count >= num)
141262306a36Sopenharmony_ci		return;
141362306a36Sopenharmony_ci	list_for_each_entry_safe(sh, next,
141462306a36Sopenharmony_ci				 &conf->r5c_partial_stripe_list, lru) {
141562306a36Sopenharmony_ci		r5c_flush_stripe(conf, sh);
141662306a36Sopenharmony_ci		if (++count >= num)
141762306a36Sopenharmony_ci			break;
141862306a36Sopenharmony_ci	}
141962306a36Sopenharmony_ci}
142062306a36Sopenharmony_ci
142162306a36Sopenharmony_cistatic void r5c_do_reclaim(struct r5conf *conf)
142262306a36Sopenharmony_ci{
142362306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
142462306a36Sopenharmony_ci	struct stripe_head *sh;
142562306a36Sopenharmony_ci	int count = 0;
142662306a36Sopenharmony_ci	unsigned long flags;
142762306a36Sopenharmony_ci	int total_cached;
142862306a36Sopenharmony_ci	int stripes_to_flush;
142962306a36Sopenharmony_ci	int flushing_partial, flushing_full;
143062306a36Sopenharmony_ci
143162306a36Sopenharmony_ci	if (!r5c_is_writeback(log))
143262306a36Sopenharmony_ci		return;
143362306a36Sopenharmony_ci
143462306a36Sopenharmony_ci	flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
143562306a36Sopenharmony_ci	flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
143662306a36Sopenharmony_ci	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
143762306a36Sopenharmony_ci		atomic_read(&conf->r5c_cached_full_stripes) -
143862306a36Sopenharmony_ci		flushing_full - flushing_partial;
143962306a36Sopenharmony_ci
144062306a36Sopenharmony_ci	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
144162306a36Sopenharmony_ci	    atomic_read(&conf->empty_inactive_list_nr) > 0)
144262306a36Sopenharmony_ci		/*
144362306a36Sopenharmony_ci		 * if stripe cache pressure high, flush all full stripes and
144462306a36Sopenharmony_ci		 * some partial stripes
144562306a36Sopenharmony_ci		 */
144662306a36Sopenharmony_ci		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
144762306a36Sopenharmony_ci	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
144862306a36Sopenharmony_ci		 atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
144962306a36Sopenharmony_ci		 R5C_FULL_STRIPE_FLUSH_BATCH(conf))
145062306a36Sopenharmony_ci		/*
145162306a36Sopenharmony_ci		 * if stripe cache pressure moderate, or if there is many full
145262306a36Sopenharmony_ci		 * stripes,flush all full stripes
145362306a36Sopenharmony_ci		 */
145462306a36Sopenharmony_ci		stripes_to_flush = 0;
145562306a36Sopenharmony_ci	else
145662306a36Sopenharmony_ci		/* no need to flush */
145762306a36Sopenharmony_ci		stripes_to_flush = -1;
145862306a36Sopenharmony_ci
145962306a36Sopenharmony_ci	if (stripes_to_flush >= 0) {
146062306a36Sopenharmony_ci		spin_lock_irqsave(&conf->device_lock, flags);
146162306a36Sopenharmony_ci		r5c_flush_cache(conf, stripes_to_flush);
146262306a36Sopenharmony_ci		spin_unlock_irqrestore(&conf->device_lock, flags);
146362306a36Sopenharmony_ci	}
146462306a36Sopenharmony_ci
146562306a36Sopenharmony_ci	/* if log space is tight, flush stripes on stripe_in_journal_list */
146662306a36Sopenharmony_ci	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
146762306a36Sopenharmony_ci		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
146862306a36Sopenharmony_ci		spin_lock(&conf->device_lock);
146962306a36Sopenharmony_ci		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
147062306a36Sopenharmony_ci			/*
147162306a36Sopenharmony_ci			 * stripes on stripe_in_journal_list could be in any
147262306a36Sopenharmony_ci			 * state of the stripe_cache state machine. In this
147362306a36Sopenharmony_ci			 * case, we only want to flush stripe on
147462306a36Sopenharmony_ci			 * r5c_cached_full/partial_stripes. The following
147562306a36Sopenharmony_ci			 * condition makes sure the stripe is on one of the
147662306a36Sopenharmony_ci			 * two lists.
147762306a36Sopenharmony_ci			 */
147862306a36Sopenharmony_ci			if (!list_empty(&sh->lru) &&
147962306a36Sopenharmony_ci			    !test_bit(STRIPE_HANDLE, &sh->state) &&
148062306a36Sopenharmony_ci			    atomic_read(&sh->count) == 0) {
148162306a36Sopenharmony_ci				r5c_flush_stripe(conf, sh);
148262306a36Sopenharmony_ci				if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
148362306a36Sopenharmony_ci					break;
148462306a36Sopenharmony_ci			}
148562306a36Sopenharmony_ci		}
148662306a36Sopenharmony_ci		spin_unlock(&conf->device_lock);
148762306a36Sopenharmony_ci		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
148862306a36Sopenharmony_ci	}
148962306a36Sopenharmony_ci
149062306a36Sopenharmony_ci	if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
149162306a36Sopenharmony_ci		r5l_run_no_space_stripes(log);
149262306a36Sopenharmony_ci
149362306a36Sopenharmony_ci	md_wakeup_thread(conf->mddev->thread);
149462306a36Sopenharmony_ci}
149562306a36Sopenharmony_ci
149662306a36Sopenharmony_cistatic void r5l_do_reclaim(struct r5l_log *log)
149762306a36Sopenharmony_ci{
149862306a36Sopenharmony_ci	struct r5conf *conf = log->rdev->mddev->private;
149962306a36Sopenharmony_ci	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
150062306a36Sopenharmony_ci	sector_t reclaimable;
150162306a36Sopenharmony_ci	sector_t next_checkpoint;
150262306a36Sopenharmony_ci	bool write_super;
150362306a36Sopenharmony_ci
150462306a36Sopenharmony_ci	spin_lock_irq(&log->io_list_lock);
150562306a36Sopenharmony_ci	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
150662306a36Sopenharmony_ci		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
150762306a36Sopenharmony_ci	/*
150862306a36Sopenharmony_ci	 * move proper io_unit to reclaim list. We should not change the order.
150962306a36Sopenharmony_ci	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
151062306a36Sopenharmony_ci	 * shouldn't reuse space of an unreclaimable io_unit
151162306a36Sopenharmony_ci	 */
151262306a36Sopenharmony_ci	while (1) {
151362306a36Sopenharmony_ci		reclaimable = r5l_reclaimable_space(log);
151462306a36Sopenharmony_ci		if (reclaimable >= reclaim_target ||
151562306a36Sopenharmony_ci		    (list_empty(&log->running_ios) &&
151662306a36Sopenharmony_ci		     list_empty(&log->io_end_ios) &&
151762306a36Sopenharmony_ci		     list_empty(&log->flushing_ios) &&
151862306a36Sopenharmony_ci		     list_empty(&log->finished_ios)))
151962306a36Sopenharmony_ci			break;
152062306a36Sopenharmony_ci
152162306a36Sopenharmony_ci		md_wakeup_thread(log->rdev->mddev->thread);
152262306a36Sopenharmony_ci		wait_event_lock_irq(log->iounit_wait,
152362306a36Sopenharmony_ci				    r5l_reclaimable_space(log) > reclaimable,
152462306a36Sopenharmony_ci				    log->io_list_lock);
152562306a36Sopenharmony_ci	}
152662306a36Sopenharmony_ci
152762306a36Sopenharmony_ci	next_checkpoint = r5c_calculate_new_cp(conf);
152862306a36Sopenharmony_ci	spin_unlock_irq(&log->io_list_lock);
152962306a36Sopenharmony_ci
153062306a36Sopenharmony_ci	if (reclaimable == 0 || !write_super)
153162306a36Sopenharmony_ci		return;
153262306a36Sopenharmony_ci
153362306a36Sopenharmony_ci	/*
153462306a36Sopenharmony_ci	 * write_super will flush cache of each raid disk. We must write super
153562306a36Sopenharmony_ci	 * here, because the log area might be reused soon and we don't want to
153662306a36Sopenharmony_ci	 * confuse recovery
153762306a36Sopenharmony_ci	 */
153862306a36Sopenharmony_ci	r5l_write_super_and_discard_space(log, next_checkpoint);
153962306a36Sopenharmony_ci
154062306a36Sopenharmony_ci	mutex_lock(&log->io_mutex);
154162306a36Sopenharmony_ci	log->last_checkpoint = next_checkpoint;
154262306a36Sopenharmony_ci	r5c_update_log_state(log);
154362306a36Sopenharmony_ci	mutex_unlock(&log->io_mutex);
154462306a36Sopenharmony_ci
154562306a36Sopenharmony_ci	r5l_run_no_space_stripes(log);
154662306a36Sopenharmony_ci}
154762306a36Sopenharmony_ci
154862306a36Sopenharmony_cistatic void r5l_reclaim_thread(struct md_thread *thread)
154962306a36Sopenharmony_ci{
155062306a36Sopenharmony_ci	struct mddev *mddev = thread->mddev;
155162306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
155262306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
155362306a36Sopenharmony_ci
155462306a36Sopenharmony_ci	if (!log)
155562306a36Sopenharmony_ci		return;
155662306a36Sopenharmony_ci	r5c_do_reclaim(conf);
155762306a36Sopenharmony_ci	r5l_do_reclaim(log);
155862306a36Sopenharmony_ci}
155962306a36Sopenharmony_ci
156062306a36Sopenharmony_civoid r5l_wake_reclaim(struct r5l_log *log, sector_t space)
156162306a36Sopenharmony_ci{
156262306a36Sopenharmony_ci	unsigned long target;
156362306a36Sopenharmony_ci	unsigned long new = (unsigned long)space; /* overflow in theory */
156462306a36Sopenharmony_ci
156562306a36Sopenharmony_ci	if (!log)
156662306a36Sopenharmony_ci		return;
156762306a36Sopenharmony_ci
156862306a36Sopenharmony_ci	target = READ_ONCE(log->reclaim_target);
156962306a36Sopenharmony_ci	do {
157062306a36Sopenharmony_ci		if (new < target)
157162306a36Sopenharmony_ci			return;
157262306a36Sopenharmony_ci	} while (!try_cmpxchg(&log->reclaim_target, &target, new));
157362306a36Sopenharmony_ci	md_wakeup_thread(log->reclaim_thread);
157462306a36Sopenharmony_ci}
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_civoid r5l_quiesce(struct r5l_log *log, int quiesce)
157762306a36Sopenharmony_ci{
157862306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
157962306a36Sopenharmony_ci	struct md_thread *thread = rcu_dereference_protected(
158062306a36Sopenharmony_ci		log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex));
158162306a36Sopenharmony_ci
158262306a36Sopenharmony_ci	if (quiesce) {
158362306a36Sopenharmony_ci		/* make sure r5l_write_super_and_discard_space exits */
158462306a36Sopenharmony_ci		wake_up(&mddev->sb_wait);
158562306a36Sopenharmony_ci		kthread_park(thread->tsk);
158662306a36Sopenharmony_ci		r5l_wake_reclaim(log, MaxSector);
158762306a36Sopenharmony_ci		r5l_do_reclaim(log);
158862306a36Sopenharmony_ci	} else
158962306a36Sopenharmony_ci		kthread_unpark(thread->tsk);
159062306a36Sopenharmony_ci}
159162306a36Sopenharmony_ci
159262306a36Sopenharmony_cibool r5l_log_disk_error(struct r5conf *conf)
159362306a36Sopenharmony_ci{
159462306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
159562306a36Sopenharmony_ci
159662306a36Sopenharmony_ci	/* don't allow write if journal disk is missing */
159762306a36Sopenharmony_ci	if (!log)
159862306a36Sopenharmony_ci		return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
159962306a36Sopenharmony_ci	else
160062306a36Sopenharmony_ci		return test_bit(Faulty, &log->rdev->flags);
160162306a36Sopenharmony_ci}
160262306a36Sopenharmony_ci
160362306a36Sopenharmony_ci#define R5L_RECOVERY_PAGE_POOL_SIZE 256
160462306a36Sopenharmony_ci
160562306a36Sopenharmony_cistruct r5l_recovery_ctx {
160662306a36Sopenharmony_ci	struct page *meta_page;		/* current meta */
160762306a36Sopenharmony_ci	sector_t meta_total_blocks;	/* total size of current meta and data */
160862306a36Sopenharmony_ci	sector_t pos;			/* recovery position */
160962306a36Sopenharmony_ci	u64 seq;			/* recovery position seq */
161062306a36Sopenharmony_ci	int data_parity_stripes;	/* number of data_parity stripes */
161162306a36Sopenharmony_ci	int data_only_stripes;		/* number of data_only stripes */
161262306a36Sopenharmony_ci	struct list_head cached_list;
161362306a36Sopenharmony_ci
161462306a36Sopenharmony_ci	/*
161562306a36Sopenharmony_ci	 * read ahead page pool (ra_pool)
161662306a36Sopenharmony_ci	 * in recovery, log is read sequentially. It is not efficient to
161762306a36Sopenharmony_ci	 * read every page with sync_page_io(). The read ahead page pool
161862306a36Sopenharmony_ci	 * reads multiple pages with one IO, so further log read can
161962306a36Sopenharmony_ci	 * just copy data from the pool.
162062306a36Sopenharmony_ci	 */
162162306a36Sopenharmony_ci	struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
162262306a36Sopenharmony_ci	struct bio_vec ra_bvec[R5L_RECOVERY_PAGE_POOL_SIZE];
162362306a36Sopenharmony_ci	sector_t pool_offset;	/* offset of first page in the pool */
162462306a36Sopenharmony_ci	int total_pages;	/* total allocated pages */
162562306a36Sopenharmony_ci	int valid_pages;	/* pages with valid data */
162662306a36Sopenharmony_ci};
162762306a36Sopenharmony_ci
162862306a36Sopenharmony_cistatic int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
162962306a36Sopenharmony_ci					    struct r5l_recovery_ctx *ctx)
163062306a36Sopenharmony_ci{
163162306a36Sopenharmony_ci	struct page *page;
163262306a36Sopenharmony_ci
163362306a36Sopenharmony_ci	ctx->valid_pages = 0;
163462306a36Sopenharmony_ci	ctx->total_pages = 0;
163562306a36Sopenharmony_ci	while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
163662306a36Sopenharmony_ci		page = alloc_page(GFP_KERNEL);
163762306a36Sopenharmony_ci
163862306a36Sopenharmony_ci		if (!page)
163962306a36Sopenharmony_ci			break;
164062306a36Sopenharmony_ci		ctx->ra_pool[ctx->total_pages] = page;
164162306a36Sopenharmony_ci		ctx->total_pages += 1;
164262306a36Sopenharmony_ci	}
164362306a36Sopenharmony_ci
164462306a36Sopenharmony_ci	if (ctx->total_pages == 0)
164562306a36Sopenharmony_ci		return -ENOMEM;
164662306a36Sopenharmony_ci
164762306a36Sopenharmony_ci	ctx->pool_offset = 0;
164862306a36Sopenharmony_ci	return 0;
164962306a36Sopenharmony_ci}
165062306a36Sopenharmony_ci
165162306a36Sopenharmony_cistatic void r5l_recovery_free_ra_pool(struct r5l_log *log,
165262306a36Sopenharmony_ci					struct r5l_recovery_ctx *ctx)
165362306a36Sopenharmony_ci{
165462306a36Sopenharmony_ci	int i;
165562306a36Sopenharmony_ci
165662306a36Sopenharmony_ci	for (i = 0; i < ctx->total_pages; ++i)
165762306a36Sopenharmony_ci		put_page(ctx->ra_pool[i]);
165862306a36Sopenharmony_ci}
165962306a36Sopenharmony_ci
166062306a36Sopenharmony_ci/*
166162306a36Sopenharmony_ci * fetch ctx->valid_pages pages from offset
166262306a36Sopenharmony_ci * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
166362306a36Sopenharmony_ci * However, if the offset is close to the end of the journal device,
166462306a36Sopenharmony_ci * ctx->valid_pages could be smaller than ctx->total_pages
166562306a36Sopenharmony_ci */
166662306a36Sopenharmony_cistatic int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
166762306a36Sopenharmony_ci				      struct r5l_recovery_ctx *ctx,
166862306a36Sopenharmony_ci				      sector_t offset)
166962306a36Sopenharmony_ci{
167062306a36Sopenharmony_ci	struct bio bio;
167162306a36Sopenharmony_ci	int ret;
167262306a36Sopenharmony_ci
167362306a36Sopenharmony_ci	bio_init(&bio, log->rdev->bdev, ctx->ra_bvec,
167462306a36Sopenharmony_ci		 R5L_RECOVERY_PAGE_POOL_SIZE, REQ_OP_READ);
167562306a36Sopenharmony_ci	bio.bi_iter.bi_sector = log->rdev->data_offset + offset;
167662306a36Sopenharmony_ci
167762306a36Sopenharmony_ci	ctx->valid_pages = 0;
167862306a36Sopenharmony_ci	ctx->pool_offset = offset;
167962306a36Sopenharmony_ci
168062306a36Sopenharmony_ci	while (ctx->valid_pages < ctx->total_pages) {
168162306a36Sopenharmony_ci		__bio_add_page(&bio, ctx->ra_pool[ctx->valid_pages], PAGE_SIZE,
168262306a36Sopenharmony_ci			       0);
168362306a36Sopenharmony_ci		ctx->valid_pages += 1;
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_ci		offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
168662306a36Sopenharmony_ci
168762306a36Sopenharmony_ci		if (offset == 0)  /* reached end of the device */
168862306a36Sopenharmony_ci			break;
168962306a36Sopenharmony_ci	}
169062306a36Sopenharmony_ci
169162306a36Sopenharmony_ci	ret = submit_bio_wait(&bio);
169262306a36Sopenharmony_ci	bio_uninit(&bio);
169362306a36Sopenharmony_ci	return ret;
169462306a36Sopenharmony_ci}
169562306a36Sopenharmony_ci
169662306a36Sopenharmony_ci/*
169762306a36Sopenharmony_ci * try read a page from the read ahead page pool, if the page is not in the
169862306a36Sopenharmony_ci * pool, call r5l_recovery_fetch_ra_pool
169962306a36Sopenharmony_ci */
170062306a36Sopenharmony_cistatic int r5l_recovery_read_page(struct r5l_log *log,
170162306a36Sopenharmony_ci				  struct r5l_recovery_ctx *ctx,
170262306a36Sopenharmony_ci				  struct page *page,
170362306a36Sopenharmony_ci				  sector_t offset)
170462306a36Sopenharmony_ci{
170562306a36Sopenharmony_ci	int ret;
170662306a36Sopenharmony_ci
170762306a36Sopenharmony_ci	if (offset < ctx->pool_offset ||
170862306a36Sopenharmony_ci	    offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
170962306a36Sopenharmony_ci		ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
171062306a36Sopenharmony_ci		if (ret)
171162306a36Sopenharmony_ci			return ret;
171262306a36Sopenharmony_ci	}
171362306a36Sopenharmony_ci
171462306a36Sopenharmony_ci	BUG_ON(offset < ctx->pool_offset ||
171562306a36Sopenharmony_ci	       offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
171662306a36Sopenharmony_ci
171762306a36Sopenharmony_ci	memcpy(page_address(page),
171862306a36Sopenharmony_ci	       page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
171962306a36Sopenharmony_ci					 BLOCK_SECTOR_SHIFT]),
172062306a36Sopenharmony_ci	       PAGE_SIZE);
172162306a36Sopenharmony_ci	return 0;
172262306a36Sopenharmony_ci}
172362306a36Sopenharmony_ci
172462306a36Sopenharmony_cistatic int r5l_recovery_read_meta_block(struct r5l_log *log,
172562306a36Sopenharmony_ci					struct r5l_recovery_ctx *ctx)
172662306a36Sopenharmony_ci{
172762306a36Sopenharmony_ci	struct page *page = ctx->meta_page;
172862306a36Sopenharmony_ci	struct r5l_meta_block *mb;
172962306a36Sopenharmony_ci	u32 crc, stored_crc;
173062306a36Sopenharmony_ci	int ret;
173162306a36Sopenharmony_ci
173262306a36Sopenharmony_ci	ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
173362306a36Sopenharmony_ci	if (ret != 0)
173462306a36Sopenharmony_ci		return ret;
173562306a36Sopenharmony_ci
173662306a36Sopenharmony_ci	mb = page_address(page);
173762306a36Sopenharmony_ci	stored_crc = le32_to_cpu(mb->checksum);
173862306a36Sopenharmony_ci	mb->checksum = 0;
173962306a36Sopenharmony_ci
174062306a36Sopenharmony_ci	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
174162306a36Sopenharmony_ci	    le64_to_cpu(mb->seq) != ctx->seq ||
174262306a36Sopenharmony_ci	    mb->version != R5LOG_VERSION ||
174362306a36Sopenharmony_ci	    le64_to_cpu(mb->position) != ctx->pos)
174462306a36Sopenharmony_ci		return -EINVAL;
174562306a36Sopenharmony_ci
174662306a36Sopenharmony_ci	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
174762306a36Sopenharmony_ci	if (stored_crc != crc)
174862306a36Sopenharmony_ci		return -EINVAL;
174962306a36Sopenharmony_ci
175062306a36Sopenharmony_ci	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
175162306a36Sopenharmony_ci		return -EINVAL;
175262306a36Sopenharmony_ci
175362306a36Sopenharmony_ci	ctx->meta_total_blocks = BLOCK_SECTORS;
175462306a36Sopenharmony_ci
175562306a36Sopenharmony_ci	return 0;
175662306a36Sopenharmony_ci}
175762306a36Sopenharmony_ci
175862306a36Sopenharmony_cistatic void
175962306a36Sopenharmony_cir5l_recovery_create_empty_meta_block(struct r5l_log *log,
176062306a36Sopenharmony_ci				     struct page *page,
176162306a36Sopenharmony_ci				     sector_t pos, u64 seq)
176262306a36Sopenharmony_ci{
176362306a36Sopenharmony_ci	struct r5l_meta_block *mb;
176462306a36Sopenharmony_ci
176562306a36Sopenharmony_ci	mb = page_address(page);
176662306a36Sopenharmony_ci	clear_page(mb);
176762306a36Sopenharmony_ci	mb->magic = cpu_to_le32(R5LOG_MAGIC);
176862306a36Sopenharmony_ci	mb->version = R5LOG_VERSION;
176962306a36Sopenharmony_ci	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
177062306a36Sopenharmony_ci	mb->seq = cpu_to_le64(seq);
177162306a36Sopenharmony_ci	mb->position = cpu_to_le64(pos);
177262306a36Sopenharmony_ci}
177362306a36Sopenharmony_ci
177462306a36Sopenharmony_cistatic int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
177562306a36Sopenharmony_ci					  u64 seq)
177662306a36Sopenharmony_ci{
177762306a36Sopenharmony_ci	struct page *page;
177862306a36Sopenharmony_ci	struct r5l_meta_block *mb;
177962306a36Sopenharmony_ci
178062306a36Sopenharmony_ci	page = alloc_page(GFP_KERNEL);
178162306a36Sopenharmony_ci	if (!page)
178262306a36Sopenharmony_ci		return -ENOMEM;
178362306a36Sopenharmony_ci	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
178462306a36Sopenharmony_ci	mb = page_address(page);
178562306a36Sopenharmony_ci	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
178662306a36Sopenharmony_ci					     mb, PAGE_SIZE));
178762306a36Sopenharmony_ci	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE |
178862306a36Sopenharmony_ci			  REQ_SYNC | REQ_FUA, false)) {
178962306a36Sopenharmony_ci		__free_page(page);
179062306a36Sopenharmony_ci		return -EIO;
179162306a36Sopenharmony_ci	}
179262306a36Sopenharmony_ci	__free_page(page);
179362306a36Sopenharmony_ci	return 0;
179462306a36Sopenharmony_ci}
179562306a36Sopenharmony_ci
179662306a36Sopenharmony_ci/*
179762306a36Sopenharmony_ci * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
179862306a36Sopenharmony_ci * to mark valid (potentially not flushed) data in the journal.
179962306a36Sopenharmony_ci *
180062306a36Sopenharmony_ci * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
180162306a36Sopenharmony_ci * so there should not be any mismatch here.
180262306a36Sopenharmony_ci */
180362306a36Sopenharmony_cistatic void r5l_recovery_load_data(struct r5l_log *log,
180462306a36Sopenharmony_ci				   struct stripe_head *sh,
180562306a36Sopenharmony_ci				   struct r5l_recovery_ctx *ctx,
180662306a36Sopenharmony_ci				   struct r5l_payload_data_parity *payload,
180762306a36Sopenharmony_ci				   sector_t log_offset)
180862306a36Sopenharmony_ci{
180962306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
181062306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
181162306a36Sopenharmony_ci	int dd_idx;
181262306a36Sopenharmony_ci
181362306a36Sopenharmony_ci	raid5_compute_sector(conf,
181462306a36Sopenharmony_ci			     le64_to_cpu(payload->location), 0,
181562306a36Sopenharmony_ci			     &dd_idx, sh);
181662306a36Sopenharmony_ci	r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
181762306a36Sopenharmony_ci	sh->dev[dd_idx].log_checksum =
181862306a36Sopenharmony_ci		le32_to_cpu(payload->checksum[0]);
181962306a36Sopenharmony_ci	ctx->meta_total_blocks += BLOCK_SECTORS;
182062306a36Sopenharmony_ci
182162306a36Sopenharmony_ci	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
182262306a36Sopenharmony_ci	set_bit(STRIPE_R5C_CACHING, &sh->state);
182362306a36Sopenharmony_ci}
182462306a36Sopenharmony_ci
182562306a36Sopenharmony_cistatic void r5l_recovery_load_parity(struct r5l_log *log,
182662306a36Sopenharmony_ci				     struct stripe_head *sh,
182762306a36Sopenharmony_ci				     struct r5l_recovery_ctx *ctx,
182862306a36Sopenharmony_ci				     struct r5l_payload_data_parity *payload,
182962306a36Sopenharmony_ci				     sector_t log_offset)
183062306a36Sopenharmony_ci{
183162306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
183262306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
183362306a36Sopenharmony_ci
183462306a36Sopenharmony_ci	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
183562306a36Sopenharmony_ci	r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
183662306a36Sopenharmony_ci	sh->dev[sh->pd_idx].log_checksum =
183762306a36Sopenharmony_ci		le32_to_cpu(payload->checksum[0]);
183862306a36Sopenharmony_ci	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
183962306a36Sopenharmony_ci
184062306a36Sopenharmony_ci	if (sh->qd_idx >= 0) {
184162306a36Sopenharmony_ci		r5l_recovery_read_page(
184262306a36Sopenharmony_ci			log, ctx, sh->dev[sh->qd_idx].page,
184362306a36Sopenharmony_ci			r5l_ring_add(log, log_offset, BLOCK_SECTORS));
184462306a36Sopenharmony_ci		sh->dev[sh->qd_idx].log_checksum =
184562306a36Sopenharmony_ci			le32_to_cpu(payload->checksum[1]);
184662306a36Sopenharmony_ci		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
184762306a36Sopenharmony_ci	}
184862306a36Sopenharmony_ci	clear_bit(STRIPE_R5C_CACHING, &sh->state);
184962306a36Sopenharmony_ci}
185062306a36Sopenharmony_ci
185162306a36Sopenharmony_cistatic void r5l_recovery_reset_stripe(struct stripe_head *sh)
185262306a36Sopenharmony_ci{
185362306a36Sopenharmony_ci	int i;
185462306a36Sopenharmony_ci
185562306a36Sopenharmony_ci	sh->state = 0;
185662306a36Sopenharmony_ci	sh->log_start = MaxSector;
185762306a36Sopenharmony_ci	for (i = sh->disks; i--; )
185862306a36Sopenharmony_ci		sh->dev[i].flags = 0;
185962306a36Sopenharmony_ci}
186062306a36Sopenharmony_ci
186162306a36Sopenharmony_cistatic void
186262306a36Sopenharmony_cir5l_recovery_replay_one_stripe(struct r5conf *conf,
186362306a36Sopenharmony_ci			       struct stripe_head *sh,
186462306a36Sopenharmony_ci			       struct r5l_recovery_ctx *ctx)
186562306a36Sopenharmony_ci{
186662306a36Sopenharmony_ci	struct md_rdev *rdev, *rrdev;
186762306a36Sopenharmony_ci	int disk_index;
186862306a36Sopenharmony_ci	int data_count = 0;
186962306a36Sopenharmony_ci
187062306a36Sopenharmony_ci	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
187162306a36Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
187262306a36Sopenharmony_ci			continue;
187362306a36Sopenharmony_ci		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
187462306a36Sopenharmony_ci			continue;
187562306a36Sopenharmony_ci		data_count++;
187662306a36Sopenharmony_ci	}
187762306a36Sopenharmony_ci
187862306a36Sopenharmony_ci	/*
187962306a36Sopenharmony_ci	 * stripes that only have parity must have been flushed
188062306a36Sopenharmony_ci	 * before the crash that we are now recovering from, so
188162306a36Sopenharmony_ci	 * there is nothing more to recovery.
188262306a36Sopenharmony_ci	 */
188362306a36Sopenharmony_ci	if (data_count == 0)
188462306a36Sopenharmony_ci		goto out;
188562306a36Sopenharmony_ci
188662306a36Sopenharmony_ci	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
188762306a36Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
188862306a36Sopenharmony_ci			continue;
188962306a36Sopenharmony_ci
189062306a36Sopenharmony_ci		/* in case device is broken */
189162306a36Sopenharmony_ci		rcu_read_lock();
189262306a36Sopenharmony_ci		rdev = rcu_dereference(conf->disks[disk_index].rdev);
189362306a36Sopenharmony_ci		if (rdev) {
189462306a36Sopenharmony_ci			atomic_inc(&rdev->nr_pending);
189562306a36Sopenharmony_ci			rcu_read_unlock();
189662306a36Sopenharmony_ci			sync_page_io(rdev, sh->sector, PAGE_SIZE,
189762306a36Sopenharmony_ci				     sh->dev[disk_index].page, REQ_OP_WRITE,
189862306a36Sopenharmony_ci				     false);
189962306a36Sopenharmony_ci			rdev_dec_pending(rdev, rdev->mddev);
190062306a36Sopenharmony_ci			rcu_read_lock();
190162306a36Sopenharmony_ci		}
190262306a36Sopenharmony_ci		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
190362306a36Sopenharmony_ci		if (rrdev) {
190462306a36Sopenharmony_ci			atomic_inc(&rrdev->nr_pending);
190562306a36Sopenharmony_ci			rcu_read_unlock();
190662306a36Sopenharmony_ci			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
190762306a36Sopenharmony_ci				     sh->dev[disk_index].page, REQ_OP_WRITE,
190862306a36Sopenharmony_ci				     false);
190962306a36Sopenharmony_ci			rdev_dec_pending(rrdev, rrdev->mddev);
191062306a36Sopenharmony_ci			rcu_read_lock();
191162306a36Sopenharmony_ci		}
191262306a36Sopenharmony_ci		rcu_read_unlock();
191362306a36Sopenharmony_ci	}
191462306a36Sopenharmony_ci	ctx->data_parity_stripes++;
191562306a36Sopenharmony_ciout:
191662306a36Sopenharmony_ci	r5l_recovery_reset_stripe(sh);
191762306a36Sopenharmony_ci}
191862306a36Sopenharmony_ci
191962306a36Sopenharmony_cistatic struct stripe_head *
192062306a36Sopenharmony_cir5c_recovery_alloc_stripe(
192162306a36Sopenharmony_ci		struct r5conf *conf,
192262306a36Sopenharmony_ci		sector_t stripe_sect,
192362306a36Sopenharmony_ci		int noblock)
192462306a36Sopenharmony_ci{
192562306a36Sopenharmony_ci	struct stripe_head *sh;
192662306a36Sopenharmony_ci
192762306a36Sopenharmony_ci	sh = raid5_get_active_stripe(conf, NULL, stripe_sect,
192862306a36Sopenharmony_ci				     noblock ? R5_GAS_NOBLOCK : 0);
192962306a36Sopenharmony_ci	if (!sh)
193062306a36Sopenharmony_ci		return NULL;  /* no more stripe available */
193162306a36Sopenharmony_ci
193262306a36Sopenharmony_ci	r5l_recovery_reset_stripe(sh);
193362306a36Sopenharmony_ci
193462306a36Sopenharmony_ci	return sh;
193562306a36Sopenharmony_ci}
193662306a36Sopenharmony_ci
193762306a36Sopenharmony_cistatic struct stripe_head *
193862306a36Sopenharmony_cir5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
193962306a36Sopenharmony_ci{
194062306a36Sopenharmony_ci	struct stripe_head *sh;
194162306a36Sopenharmony_ci
194262306a36Sopenharmony_ci	list_for_each_entry(sh, list, lru)
194362306a36Sopenharmony_ci		if (sh->sector == sect)
194462306a36Sopenharmony_ci			return sh;
194562306a36Sopenharmony_ci	return NULL;
194662306a36Sopenharmony_ci}
194762306a36Sopenharmony_ci
194862306a36Sopenharmony_cistatic void
194962306a36Sopenharmony_cir5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
195062306a36Sopenharmony_ci			  struct r5l_recovery_ctx *ctx)
195162306a36Sopenharmony_ci{
195262306a36Sopenharmony_ci	struct stripe_head *sh, *next;
195362306a36Sopenharmony_ci
195462306a36Sopenharmony_ci	list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
195562306a36Sopenharmony_ci		r5l_recovery_reset_stripe(sh);
195662306a36Sopenharmony_ci		list_del_init(&sh->lru);
195762306a36Sopenharmony_ci		raid5_release_stripe(sh);
195862306a36Sopenharmony_ci	}
195962306a36Sopenharmony_ci}
196062306a36Sopenharmony_ci
196162306a36Sopenharmony_cistatic void
196262306a36Sopenharmony_cir5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
196362306a36Sopenharmony_ci			    struct r5l_recovery_ctx *ctx)
196462306a36Sopenharmony_ci{
196562306a36Sopenharmony_ci	struct stripe_head *sh, *next;
196662306a36Sopenharmony_ci
196762306a36Sopenharmony_ci	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
196862306a36Sopenharmony_ci		if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
196962306a36Sopenharmony_ci			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
197062306a36Sopenharmony_ci			list_del_init(&sh->lru);
197162306a36Sopenharmony_ci			raid5_release_stripe(sh);
197262306a36Sopenharmony_ci		}
197362306a36Sopenharmony_ci}
197462306a36Sopenharmony_ci
197562306a36Sopenharmony_ci/* if matches return 0; otherwise return -EINVAL */
197662306a36Sopenharmony_cistatic int
197762306a36Sopenharmony_cir5l_recovery_verify_data_checksum(struct r5l_log *log,
197862306a36Sopenharmony_ci				  struct r5l_recovery_ctx *ctx,
197962306a36Sopenharmony_ci				  struct page *page,
198062306a36Sopenharmony_ci				  sector_t log_offset, __le32 log_checksum)
198162306a36Sopenharmony_ci{
198262306a36Sopenharmony_ci	void *addr;
198362306a36Sopenharmony_ci	u32 checksum;
198462306a36Sopenharmony_ci
198562306a36Sopenharmony_ci	r5l_recovery_read_page(log, ctx, page, log_offset);
198662306a36Sopenharmony_ci	addr = kmap_atomic(page);
198762306a36Sopenharmony_ci	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
198862306a36Sopenharmony_ci	kunmap_atomic(addr);
198962306a36Sopenharmony_ci	return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
199062306a36Sopenharmony_ci}
199162306a36Sopenharmony_ci
199262306a36Sopenharmony_ci/*
199362306a36Sopenharmony_ci * before loading data to stripe cache, we need verify checksum for all data,
199462306a36Sopenharmony_ci * if there is mismatch for any data page, we drop all data in the mata block
199562306a36Sopenharmony_ci */
199662306a36Sopenharmony_cistatic int
199762306a36Sopenharmony_cir5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
199862306a36Sopenharmony_ci					 struct r5l_recovery_ctx *ctx)
199962306a36Sopenharmony_ci{
200062306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
200162306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
200262306a36Sopenharmony_ci	struct r5l_meta_block *mb = page_address(ctx->meta_page);
200362306a36Sopenharmony_ci	sector_t mb_offset = sizeof(struct r5l_meta_block);
200462306a36Sopenharmony_ci	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
200562306a36Sopenharmony_ci	struct page *page;
200662306a36Sopenharmony_ci	struct r5l_payload_data_parity *payload;
200762306a36Sopenharmony_ci	struct r5l_payload_flush *payload_flush;
200862306a36Sopenharmony_ci
200962306a36Sopenharmony_ci	page = alloc_page(GFP_KERNEL);
201062306a36Sopenharmony_ci	if (!page)
201162306a36Sopenharmony_ci		return -ENOMEM;
201262306a36Sopenharmony_ci
201362306a36Sopenharmony_ci	while (mb_offset < le32_to_cpu(mb->meta_size)) {
201462306a36Sopenharmony_ci		payload = (void *)mb + mb_offset;
201562306a36Sopenharmony_ci		payload_flush = (void *)mb + mb_offset;
201662306a36Sopenharmony_ci
201762306a36Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
201862306a36Sopenharmony_ci			if (r5l_recovery_verify_data_checksum(
201962306a36Sopenharmony_ci				    log, ctx, page, log_offset,
202062306a36Sopenharmony_ci				    payload->checksum[0]) < 0)
202162306a36Sopenharmony_ci				goto mismatch;
202262306a36Sopenharmony_ci		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
202362306a36Sopenharmony_ci			if (r5l_recovery_verify_data_checksum(
202462306a36Sopenharmony_ci				    log, ctx, page, log_offset,
202562306a36Sopenharmony_ci				    payload->checksum[0]) < 0)
202662306a36Sopenharmony_ci				goto mismatch;
202762306a36Sopenharmony_ci			if (conf->max_degraded == 2 && /* q for RAID 6 */
202862306a36Sopenharmony_ci			    r5l_recovery_verify_data_checksum(
202962306a36Sopenharmony_ci				    log, ctx, page,
203062306a36Sopenharmony_ci				    r5l_ring_add(log, log_offset,
203162306a36Sopenharmony_ci						 BLOCK_SECTORS),
203262306a36Sopenharmony_ci				    payload->checksum[1]) < 0)
203362306a36Sopenharmony_ci				goto mismatch;
203462306a36Sopenharmony_ci		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
203562306a36Sopenharmony_ci			/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
203662306a36Sopenharmony_ci		} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
203762306a36Sopenharmony_ci			goto mismatch;
203862306a36Sopenharmony_ci
203962306a36Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
204062306a36Sopenharmony_ci			mb_offset += sizeof(struct r5l_payload_flush) +
204162306a36Sopenharmony_ci				le32_to_cpu(payload_flush->size);
204262306a36Sopenharmony_ci		} else {
204362306a36Sopenharmony_ci			/* DATA or PARITY payload */
204462306a36Sopenharmony_ci			log_offset = r5l_ring_add(log, log_offset,
204562306a36Sopenharmony_ci						  le32_to_cpu(payload->size));
204662306a36Sopenharmony_ci			mb_offset += sizeof(struct r5l_payload_data_parity) +
204762306a36Sopenharmony_ci				sizeof(__le32) *
204862306a36Sopenharmony_ci				(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
204962306a36Sopenharmony_ci		}
205062306a36Sopenharmony_ci
205162306a36Sopenharmony_ci	}
205262306a36Sopenharmony_ci
205362306a36Sopenharmony_ci	put_page(page);
205462306a36Sopenharmony_ci	return 0;
205562306a36Sopenharmony_ci
205662306a36Sopenharmony_cimismatch:
205762306a36Sopenharmony_ci	put_page(page);
205862306a36Sopenharmony_ci	return -EINVAL;
205962306a36Sopenharmony_ci}
206062306a36Sopenharmony_ci
206162306a36Sopenharmony_ci/*
206262306a36Sopenharmony_ci * Analyze all data/parity pages in one meta block
206362306a36Sopenharmony_ci * Returns:
206462306a36Sopenharmony_ci * 0 for success
206562306a36Sopenharmony_ci * -EINVAL for unknown playload type
206662306a36Sopenharmony_ci * -EAGAIN for checksum mismatch of data page
206762306a36Sopenharmony_ci * -ENOMEM for run out of memory (alloc_page failed or run out of stripes)
206862306a36Sopenharmony_ci */
206962306a36Sopenharmony_cistatic int
207062306a36Sopenharmony_cir5c_recovery_analyze_meta_block(struct r5l_log *log,
207162306a36Sopenharmony_ci				struct r5l_recovery_ctx *ctx,
207262306a36Sopenharmony_ci				struct list_head *cached_stripe_list)
207362306a36Sopenharmony_ci{
207462306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
207562306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
207662306a36Sopenharmony_ci	struct r5l_meta_block *mb;
207762306a36Sopenharmony_ci	struct r5l_payload_data_parity *payload;
207862306a36Sopenharmony_ci	struct r5l_payload_flush *payload_flush;
207962306a36Sopenharmony_ci	int mb_offset;
208062306a36Sopenharmony_ci	sector_t log_offset;
208162306a36Sopenharmony_ci	sector_t stripe_sect;
208262306a36Sopenharmony_ci	struct stripe_head *sh;
208362306a36Sopenharmony_ci	int ret;
208462306a36Sopenharmony_ci
208562306a36Sopenharmony_ci	/*
208662306a36Sopenharmony_ci	 * for mismatch in data blocks, we will drop all data in this mb, but
208762306a36Sopenharmony_ci	 * we will still read next mb for other data with FLUSH flag, as
208862306a36Sopenharmony_ci	 * io_unit could finish out of order.
208962306a36Sopenharmony_ci	 */
209062306a36Sopenharmony_ci	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
209162306a36Sopenharmony_ci	if (ret == -EINVAL)
209262306a36Sopenharmony_ci		return -EAGAIN;
209362306a36Sopenharmony_ci	else if (ret)
209462306a36Sopenharmony_ci		return ret;   /* -ENOMEM duo to alloc_page() failed */
209562306a36Sopenharmony_ci
209662306a36Sopenharmony_ci	mb = page_address(ctx->meta_page);
209762306a36Sopenharmony_ci	mb_offset = sizeof(struct r5l_meta_block);
209862306a36Sopenharmony_ci	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
209962306a36Sopenharmony_ci
210062306a36Sopenharmony_ci	while (mb_offset < le32_to_cpu(mb->meta_size)) {
210162306a36Sopenharmony_ci		int dd;
210262306a36Sopenharmony_ci
210362306a36Sopenharmony_ci		payload = (void *)mb + mb_offset;
210462306a36Sopenharmony_ci		payload_flush = (void *)mb + mb_offset;
210562306a36Sopenharmony_ci
210662306a36Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
210762306a36Sopenharmony_ci			int i, count;
210862306a36Sopenharmony_ci
210962306a36Sopenharmony_ci			count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
211062306a36Sopenharmony_ci			for (i = 0; i < count; ++i) {
211162306a36Sopenharmony_ci				stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
211262306a36Sopenharmony_ci				sh = r5c_recovery_lookup_stripe(cached_stripe_list,
211362306a36Sopenharmony_ci								stripe_sect);
211462306a36Sopenharmony_ci				if (sh) {
211562306a36Sopenharmony_ci					WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
211662306a36Sopenharmony_ci					r5l_recovery_reset_stripe(sh);
211762306a36Sopenharmony_ci					list_del_init(&sh->lru);
211862306a36Sopenharmony_ci					raid5_release_stripe(sh);
211962306a36Sopenharmony_ci				}
212062306a36Sopenharmony_ci			}
212162306a36Sopenharmony_ci
212262306a36Sopenharmony_ci			mb_offset += sizeof(struct r5l_payload_flush) +
212362306a36Sopenharmony_ci				le32_to_cpu(payload_flush->size);
212462306a36Sopenharmony_ci			continue;
212562306a36Sopenharmony_ci		}
212662306a36Sopenharmony_ci
212762306a36Sopenharmony_ci		/* DATA or PARITY payload */
212862306a36Sopenharmony_ci		stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
212962306a36Sopenharmony_ci			raid5_compute_sector(
213062306a36Sopenharmony_ci				conf, le64_to_cpu(payload->location), 0, &dd,
213162306a36Sopenharmony_ci				NULL)
213262306a36Sopenharmony_ci			: le64_to_cpu(payload->location);
213362306a36Sopenharmony_ci
213462306a36Sopenharmony_ci		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
213562306a36Sopenharmony_ci						stripe_sect);
213662306a36Sopenharmony_ci
213762306a36Sopenharmony_ci		if (!sh) {
213862306a36Sopenharmony_ci			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1);
213962306a36Sopenharmony_ci			/*
214062306a36Sopenharmony_ci			 * cannot get stripe from raid5_get_active_stripe
214162306a36Sopenharmony_ci			 * try replay some stripes
214262306a36Sopenharmony_ci			 */
214362306a36Sopenharmony_ci			if (!sh) {
214462306a36Sopenharmony_ci				r5c_recovery_replay_stripes(
214562306a36Sopenharmony_ci					cached_stripe_list, ctx);
214662306a36Sopenharmony_ci				sh = r5c_recovery_alloc_stripe(
214762306a36Sopenharmony_ci					conf, stripe_sect, 1);
214862306a36Sopenharmony_ci			}
214962306a36Sopenharmony_ci			if (!sh) {
215062306a36Sopenharmony_ci				int new_size = conf->min_nr_stripes * 2;
215162306a36Sopenharmony_ci				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
215262306a36Sopenharmony_ci					mdname(mddev),
215362306a36Sopenharmony_ci					new_size);
215462306a36Sopenharmony_ci				ret = raid5_set_cache_size(mddev, new_size);
215562306a36Sopenharmony_ci				if (conf->min_nr_stripes <= new_size / 2) {
215662306a36Sopenharmony_ci					pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n",
215762306a36Sopenharmony_ci						mdname(mddev),
215862306a36Sopenharmony_ci						ret,
215962306a36Sopenharmony_ci						new_size,
216062306a36Sopenharmony_ci						conf->min_nr_stripes,
216162306a36Sopenharmony_ci						conf->max_nr_stripes);
216262306a36Sopenharmony_ci					return -ENOMEM;
216362306a36Sopenharmony_ci				}
216462306a36Sopenharmony_ci				sh = r5c_recovery_alloc_stripe(
216562306a36Sopenharmony_ci					conf, stripe_sect, 0);
216662306a36Sopenharmony_ci			}
216762306a36Sopenharmony_ci			if (!sh) {
216862306a36Sopenharmony_ci				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
216962306a36Sopenharmony_ci					mdname(mddev));
217062306a36Sopenharmony_ci				return -ENOMEM;
217162306a36Sopenharmony_ci			}
217262306a36Sopenharmony_ci			list_add_tail(&sh->lru, cached_stripe_list);
217362306a36Sopenharmony_ci		}
217462306a36Sopenharmony_ci
217562306a36Sopenharmony_ci		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
217662306a36Sopenharmony_ci			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
217762306a36Sopenharmony_ci			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
217862306a36Sopenharmony_ci				r5l_recovery_replay_one_stripe(conf, sh, ctx);
217962306a36Sopenharmony_ci				list_move_tail(&sh->lru, cached_stripe_list);
218062306a36Sopenharmony_ci			}
218162306a36Sopenharmony_ci			r5l_recovery_load_data(log, sh, ctx, payload,
218262306a36Sopenharmony_ci					       log_offset);
218362306a36Sopenharmony_ci		} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
218462306a36Sopenharmony_ci			r5l_recovery_load_parity(log, sh, ctx, payload,
218562306a36Sopenharmony_ci						 log_offset);
218662306a36Sopenharmony_ci		else
218762306a36Sopenharmony_ci			return -EINVAL;
218862306a36Sopenharmony_ci
218962306a36Sopenharmony_ci		log_offset = r5l_ring_add(log, log_offset,
219062306a36Sopenharmony_ci					  le32_to_cpu(payload->size));
219162306a36Sopenharmony_ci
219262306a36Sopenharmony_ci		mb_offset += sizeof(struct r5l_payload_data_parity) +
219362306a36Sopenharmony_ci			sizeof(__le32) *
219462306a36Sopenharmony_ci			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
219562306a36Sopenharmony_ci	}
219662306a36Sopenharmony_ci
219762306a36Sopenharmony_ci	return 0;
219862306a36Sopenharmony_ci}
219962306a36Sopenharmony_ci
220062306a36Sopenharmony_ci/*
220162306a36Sopenharmony_ci * Load the stripe into cache. The stripe will be written out later by
220262306a36Sopenharmony_ci * the stripe cache state machine.
220362306a36Sopenharmony_ci */
220462306a36Sopenharmony_cistatic void r5c_recovery_load_one_stripe(struct r5l_log *log,
220562306a36Sopenharmony_ci					 struct stripe_head *sh)
220662306a36Sopenharmony_ci{
220762306a36Sopenharmony_ci	struct r5dev *dev;
220862306a36Sopenharmony_ci	int i;
220962306a36Sopenharmony_ci
221062306a36Sopenharmony_ci	for (i = sh->disks; i--; ) {
221162306a36Sopenharmony_ci		dev = sh->dev + i;
221262306a36Sopenharmony_ci		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
221362306a36Sopenharmony_ci			set_bit(R5_InJournal, &dev->flags);
221462306a36Sopenharmony_ci			set_bit(R5_UPTODATE, &dev->flags);
221562306a36Sopenharmony_ci		}
221662306a36Sopenharmony_ci	}
221762306a36Sopenharmony_ci}
221862306a36Sopenharmony_ci
221962306a36Sopenharmony_ci/*
222062306a36Sopenharmony_ci * Scan through the log for all to-be-flushed data
222162306a36Sopenharmony_ci *
222262306a36Sopenharmony_ci * For stripes with data and parity, namely Data-Parity stripe
222362306a36Sopenharmony_ci * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
222462306a36Sopenharmony_ci *
222562306a36Sopenharmony_ci * For stripes with only data, namely Data-Only stripe
222662306a36Sopenharmony_ci * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
222762306a36Sopenharmony_ci *
222862306a36Sopenharmony_ci * For a stripe, if we see data after parity, we should discard all previous
222962306a36Sopenharmony_ci * data and parity for this stripe, as these data are already flushed to
223062306a36Sopenharmony_ci * the array.
223162306a36Sopenharmony_ci *
223262306a36Sopenharmony_ci * At the end of the scan, we return the new journal_tail, which points to
223362306a36Sopenharmony_ci * first data-only stripe on the journal device, or next invalid meta block.
223462306a36Sopenharmony_ci */
223562306a36Sopenharmony_cistatic int r5c_recovery_flush_log(struct r5l_log *log,
223662306a36Sopenharmony_ci				  struct r5l_recovery_ctx *ctx)
223762306a36Sopenharmony_ci{
223862306a36Sopenharmony_ci	struct stripe_head *sh;
223962306a36Sopenharmony_ci	int ret = 0;
224062306a36Sopenharmony_ci
224162306a36Sopenharmony_ci	/* scan through the log */
224262306a36Sopenharmony_ci	while (1) {
224362306a36Sopenharmony_ci		if (r5l_recovery_read_meta_block(log, ctx))
224462306a36Sopenharmony_ci			break;
224562306a36Sopenharmony_ci
224662306a36Sopenharmony_ci		ret = r5c_recovery_analyze_meta_block(log, ctx,
224762306a36Sopenharmony_ci						      &ctx->cached_list);
224862306a36Sopenharmony_ci		/*
224962306a36Sopenharmony_ci		 * -EAGAIN means mismatch in data block, in this case, we still
225062306a36Sopenharmony_ci		 * try scan the next metablock
225162306a36Sopenharmony_ci		 */
225262306a36Sopenharmony_ci		if (ret && ret != -EAGAIN)
225362306a36Sopenharmony_ci			break;   /* ret == -EINVAL or -ENOMEM */
225462306a36Sopenharmony_ci		ctx->seq++;
225562306a36Sopenharmony_ci		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
225662306a36Sopenharmony_ci	}
225762306a36Sopenharmony_ci
225862306a36Sopenharmony_ci	if (ret == -ENOMEM) {
225962306a36Sopenharmony_ci		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
226062306a36Sopenharmony_ci		return ret;
226162306a36Sopenharmony_ci	}
226262306a36Sopenharmony_ci
226362306a36Sopenharmony_ci	/* replay data-parity stripes */
226462306a36Sopenharmony_ci	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
226562306a36Sopenharmony_ci
226662306a36Sopenharmony_ci	/* load data-only stripes to stripe cache */
226762306a36Sopenharmony_ci	list_for_each_entry(sh, &ctx->cached_list, lru) {
226862306a36Sopenharmony_ci		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
226962306a36Sopenharmony_ci		r5c_recovery_load_one_stripe(log, sh);
227062306a36Sopenharmony_ci		ctx->data_only_stripes++;
227162306a36Sopenharmony_ci	}
227262306a36Sopenharmony_ci
227362306a36Sopenharmony_ci	return 0;
227462306a36Sopenharmony_ci}
227562306a36Sopenharmony_ci
/*
 * we did a recovery. Now ctx.pos points to an invalid meta block. The new
 * log will start here, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
 * superblock points to meta 1 and we write a new valid meta 2n, then if a
 * crash happens again, the new recovery will start from meta 1. Since meta
 * 2n is valid now, recovery will think meta 3 is valid, which is wrong.
 * The solution is to create a new meta in meta2 with its seq == meta
 * 1's seq + 10000 and let the superblock point to meta2. The same recovery
 * will not think meta 3 is a valid meta, because its seq doesn't match.
 */
228962306a36Sopenharmony_ci
229062306a36Sopenharmony_ci/*
229162306a36Sopenharmony_ci * Before recovery, the log looks like the following
229262306a36Sopenharmony_ci *
229362306a36Sopenharmony_ci *   ---------------------------------------------
229462306a36Sopenharmony_ci *   |           valid log        | invalid log  |
229562306a36Sopenharmony_ci *   ---------------------------------------------
229662306a36Sopenharmony_ci *   ^
229762306a36Sopenharmony_ci *   |- log->last_checkpoint
229862306a36Sopenharmony_ci *   |- log->last_cp_seq
229962306a36Sopenharmony_ci *
230062306a36Sopenharmony_ci * Now we scan through the log until we see invalid entry
230162306a36Sopenharmony_ci *
230262306a36Sopenharmony_ci *   ---------------------------------------------
230362306a36Sopenharmony_ci *   |           valid log        | invalid log  |
230462306a36Sopenharmony_ci *   ---------------------------------------------
230562306a36Sopenharmony_ci *   ^                            ^
230662306a36Sopenharmony_ci *   |- log->last_checkpoint      |- ctx->pos
230762306a36Sopenharmony_ci *   |- log->last_cp_seq          |- ctx->seq
230862306a36Sopenharmony_ci *
 * From this point, we need to increase the seq number by 10000 to avoid
 * confusing the next recovery.
231162306a36Sopenharmony_ci *
231262306a36Sopenharmony_ci *   ---------------------------------------------
231362306a36Sopenharmony_ci *   |           valid log        | invalid log  |
231462306a36Sopenharmony_ci *   ---------------------------------------------
231562306a36Sopenharmony_ci *   ^                              ^
231662306a36Sopenharmony_ci *   |- log->last_checkpoint        |- ctx->pos+1
231762306a36Sopenharmony_ci *   |- log->last_cp_seq            |- ctx->seq+10001
231862306a36Sopenharmony_ci *
 * However, it is not safe to start the state machine yet, because the
 * parities of data-only stripes are not yet secured in RAID. To save these
 * data-only stripes, we rewrite them to the journal, starting at seq+10000.
232262306a36Sopenharmony_ci *
232362306a36Sopenharmony_ci *   -----------------------------------------------------------------
232462306a36Sopenharmony_ci *   |           valid log        | data only stripes | invalid log  |
232562306a36Sopenharmony_ci *   -----------------------------------------------------------------
232662306a36Sopenharmony_ci *   ^                                                ^
232762306a36Sopenharmony_ci *   |- log->last_checkpoint                          |- ctx->pos+n
232862306a36Sopenharmony_ci *   |- log->last_cp_seq                              |- ctx->seq+10000+n
232962306a36Sopenharmony_ci *
 * If failure happens again during this process, the recovery can safely start
 * again from log->last_checkpoint.
233262306a36Sopenharmony_ci *
233362306a36Sopenharmony_ci * Once data only stripes are rewritten to journal, we move log_tail
233462306a36Sopenharmony_ci *
233562306a36Sopenharmony_ci *   -----------------------------------------------------------------
233662306a36Sopenharmony_ci *   |     old log        |    data only stripes    | invalid log  |
233762306a36Sopenharmony_ci *   -----------------------------------------------------------------
233862306a36Sopenharmony_ci *                        ^                         ^
233962306a36Sopenharmony_ci *                        |- log->last_checkpoint   |- ctx->pos+n
234062306a36Sopenharmony_ci *                        |- log->last_cp_seq       |- ctx->seq+10000+n
234162306a36Sopenharmony_ci *
234262306a36Sopenharmony_ci * Then we can safely start the state machine. If failure happens from this
234362306a36Sopenharmony_ci * point on, the recovery will start from new log->last_checkpoint.
234462306a36Sopenharmony_ci */
234562306a36Sopenharmony_cistatic int
234662306a36Sopenharmony_cir5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
234762306a36Sopenharmony_ci				       struct r5l_recovery_ctx *ctx)
234862306a36Sopenharmony_ci{
234962306a36Sopenharmony_ci	struct stripe_head *sh;
235062306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
235162306a36Sopenharmony_ci	struct page *page;
235262306a36Sopenharmony_ci	sector_t next_checkpoint = MaxSector;
235362306a36Sopenharmony_ci
235462306a36Sopenharmony_ci	page = alloc_page(GFP_KERNEL);
235562306a36Sopenharmony_ci	if (!page) {
235662306a36Sopenharmony_ci		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
235762306a36Sopenharmony_ci		       mdname(mddev));
235862306a36Sopenharmony_ci		return -ENOMEM;
235962306a36Sopenharmony_ci	}
236062306a36Sopenharmony_ci
236162306a36Sopenharmony_ci	WARN_ON(list_empty(&ctx->cached_list));
236262306a36Sopenharmony_ci
236362306a36Sopenharmony_ci	list_for_each_entry(sh, &ctx->cached_list, lru) {
236462306a36Sopenharmony_ci		struct r5l_meta_block *mb;
236562306a36Sopenharmony_ci		int i;
236662306a36Sopenharmony_ci		int offset;
236762306a36Sopenharmony_ci		sector_t write_pos;
236862306a36Sopenharmony_ci
236962306a36Sopenharmony_ci		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
237062306a36Sopenharmony_ci		r5l_recovery_create_empty_meta_block(log, page,
237162306a36Sopenharmony_ci						     ctx->pos, ctx->seq);
237262306a36Sopenharmony_ci		mb = page_address(page);
237362306a36Sopenharmony_ci		offset = le32_to_cpu(mb->meta_size);
237462306a36Sopenharmony_ci		write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
237562306a36Sopenharmony_ci
237662306a36Sopenharmony_ci		for (i = sh->disks; i--; ) {
237762306a36Sopenharmony_ci			struct r5dev *dev = &sh->dev[i];
237862306a36Sopenharmony_ci			struct r5l_payload_data_parity *payload;
237962306a36Sopenharmony_ci			void *addr;
238062306a36Sopenharmony_ci
238162306a36Sopenharmony_ci			if (test_bit(R5_InJournal, &dev->flags)) {
238262306a36Sopenharmony_ci				payload = (void *)mb + offset;
238362306a36Sopenharmony_ci				payload->header.type = cpu_to_le16(
238462306a36Sopenharmony_ci					R5LOG_PAYLOAD_DATA);
238562306a36Sopenharmony_ci				payload->size = cpu_to_le32(BLOCK_SECTORS);
238662306a36Sopenharmony_ci				payload->location = cpu_to_le64(
238762306a36Sopenharmony_ci					raid5_compute_blocknr(sh, i, 0));
238862306a36Sopenharmony_ci				addr = kmap_atomic(dev->page);
238962306a36Sopenharmony_ci				payload->checksum[0] = cpu_to_le32(
239062306a36Sopenharmony_ci					crc32c_le(log->uuid_checksum, addr,
239162306a36Sopenharmony_ci						  PAGE_SIZE));
239262306a36Sopenharmony_ci				kunmap_atomic(addr);
239362306a36Sopenharmony_ci				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
239462306a36Sopenharmony_ci					     dev->page, REQ_OP_WRITE, false);
239562306a36Sopenharmony_ci				write_pos = r5l_ring_add(log, write_pos,
239662306a36Sopenharmony_ci							 BLOCK_SECTORS);
239762306a36Sopenharmony_ci				offset += sizeof(__le32) +
239862306a36Sopenharmony_ci					sizeof(struct r5l_payload_data_parity);
239962306a36Sopenharmony_ci
240062306a36Sopenharmony_ci			}
240162306a36Sopenharmony_ci		}
240262306a36Sopenharmony_ci		mb->meta_size = cpu_to_le32(offset);
240362306a36Sopenharmony_ci		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
240462306a36Sopenharmony_ci						     mb, PAGE_SIZE));
240562306a36Sopenharmony_ci		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
240662306a36Sopenharmony_ci			     REQ_OP_WRITE | REQ_SYNC | REQ_FUA, false);
240762306a36Sopenharmony_ci		sh->log_start = ctx->pos;
240862306a36Sopenharmony_ci		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
240962306a36Sopenharmony_ci		atomic_inc(&log->stripe_in_journal_count);
241062306a36Sopenharmony_ci		ctx->pos = write_pos;
241162306a36Sopenharmony_ci		ctx->seq += 1;
241262306a36Sopenharmony_ci		next_checkpoint = sh->log_start;
241362306a36Sopenharmony_ci	}
241462306a36Sopenharmony_ci	log->next_checkpoint = next_checkpoint;
241562306a36Sopenharmony_ci	__free_page(page);
241662306a36Sopenharmony_ci	return 0;
241762306a36Sopenharmony_ci}
241862306a36Sopenharmony_ci
241962306a36Sopenharmony_cistatic void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
242062306a36Sopenharmony_ci						 struct r5l_recovery_ctx *ctx)
242162306a36Sopenharmony_ci{
242262306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
242362306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
242462306a36Sopenharmony_ci	struct stripe_head *sh, *next;
242562306a36Sopenharmony_ci	bool cleared_pending = false;
242662306a36Sopenharmony_ci
242762306a36Sopenharmony_ci	if (ctx->data_only_stripes == 0)
242862306a36Sopenharmony_ci		return;
242962306a36Sopenharmony_ci
243062306a36Sopenharmony_ci	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
243162306a36Sopenharmony_ci		cleared_pending = true;
243262306a36Sopenharmony_ci		clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
243362306a36Sopenharmony_ci	}
243462306a36Sopenharmony_ci	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
243562306a36Sopenharmony_ci
243662306a36Sopenharmony_ci	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
243762306a36Sopenharmony_ci		r5c_make_stripe_write_out(sh);
243862306a36Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
243962306a36Sopenharmony_ci		list_del_init(&sh->lru);
244062306a36Sopenharmony_ci		raid5_release_stripe(sh);
244162306a36Sopenharmony_ci	}
244262306a36Sopenharmony_ci
244362306a36Sopenharmony_ci	/* reuse conf->wait_for_quiescent in recovery */
244462306a36Sopenharmony_ci	wait_event(conf->wait_for_quiescent,
244562306a36Sopenharmony_ci		   atomic_read(&conf->active_stripes) == 0);
244662306a36Sopenharmony_ci
244762306a36Sopenharmony_ci	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
244862306a36Sopenharmony_ci	if (cleared_pending)
244962306a36Sopenharmony_ci		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
245062306a36Sopenharmony_ci}
245162306a36Sopenharmony_ci
245262306a36Sopenharmony_cistatic int r5l_recovery_log(struct r5l_log *log)
245362306a36Sopenharmony_ci{
245462306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
245562306a36Sopenharmony_ci	struct r5l_recovery_ctx *ctx;
245662306a36Sopenharmony_ci	int ret;
245762306a36Sopenharmony_ci	sector_t pos;
245862306a36Sopenharmony_ci
245962306a36Sopenharmony_ci	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
246062306a36Sopenharmony_ci	if (!ctx)
246162306a36Sopenharmony_ci		return -ENOMEM;
246262306a36Sopenharmony_ci
246362306a36Sopenharmony_ci	ctx->pos = log->last_checkpoint;
246462306a36Sopenharmony_ci	ctx->seq = log->last_cp_seq;
246562306a36Sopenharmony_ci	INIT_LIST_HEAD(&ctx->cached_list);
246662306a36Sopenharmony_ci	ctx->meta_page = alloc_page(GFP_KERNEL);
246762306a36Sopenharmony_ci
246862306a36Sopenharmony_ci	if (!ctx->meta_page) {
246962306a36Sopenharmony_ci		ret =  -ENOMEM;
247062306a36Sopenharmony_ci		goto meta_page;
247162306a36Sopenharmony_ci	}
247262306a36Sopenharmony_ci
247362306a36Sopenharmony_ci	if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
247462306a36Sopenharmony_ci		ret = -ENOMEM;
247562306a36Sopenharmony_ci		goto ra_pool;
247662306a36Sopenharmony_ci	}
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_ci	ret = r5c_recovery_flush_log(log, ctx);
247962306a36Sopenharmony_ci
248062306a36Sopenharmony_ci	if (ret)
248162306a36Sopenharmony_ci		goto error;
248262306a36Sopenharmony_ci
248362306a36Sopenharmony_ci	pos = ctx->pos;
248462306a36Sopenharmony_ci	ctx->seq += 10000;
248562306a36Sopenharmony_ci
248662306a36Sopenharmony_ci	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
248762306a36Sopenharmony_ci		pr_info("md/raid:%s: starting from clean shutdown\n",
248862306a36Sopenharmony_ci			 mdname(mddev));
248962306a36Sopenharmony_ci	else
249062306a36Sopenharmony_ci		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
249162306a36Sopenharmony_ci			 mdname(mddev), ctx->data_only_stripes,
249262306a36Sopenharmony_ci			 ctx->data_parity_stripes);
249362306a36Sopenharmony_ci
249462306a36Sopenharmony_ci	if (ctx->data_only_stripes == 0) {
249562306a36Sopenharmony_ci		log->next_checkpoint = ctx->pos;
249662306a36Sopenharmony_ci		r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
249762306a36Sopenharmony_ci		ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
249862306a36Sopenharmony_ci	} else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
249962306a36Sopenharmony_ci		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
250062306a36Sopenharmony_ci		       mdname(mddev));
250162306a36Sopenharmony_ci		ret =  -EIO;
250262306a36Sopenharmony_ci		goto error;
250362306a36Sopenharmony_ci	}
250462306a36Sopenharmony_ci
250562306a36Sopenharmony_ci	log->log_start = ctx->pos;
250662306a36Sopenharmony_ci	log->seq = ctx->seq;
250762306a36Sopenharmony_ci	log->last_checkpoint = pos;
250862306a36Sopenharmony_ci	r5l_write_super(log, pos);
250962306a36Sopenharmony_ci
251062306a36Sopenharmony_ci	r5c_recovery_flush_data_only_stripes(log, ctx);
251162306a36Sopenharmony_ci	ret = 0;
251262306a36Sopenharmony_cierror:
251362306a36Sopenharmony_ci	r5l_recovery_free_ra_pool(log, ctx);
251462306a36Sopenharmony_cira_pool:
251562306a36Sopenharmony_ci	__free_page(ctx->meta_page);
251662306a36Sopenharmony_cimeta_page:
251762306a36Sopenharmony_ci	kfree(ctx);
251862306a36Sopenharmony_ci	return ret;
251962306a36Sopenharmony_ci}
252062306a36Sopenharmony_ci
252162306a36Sopenharmony_cistatic void r5l_write_super(struct r5l_log *log, sector_t cp)
252262306a36Sopenharmony_ci{
252362306a36Sopenharmony_ci	struct mddev *mddev = log->rdev->mddev;
252462306a36Sopenharmony_ci
252562306a36Sopenharmony_ci	log->rdev->journal_tail = cp;
252662306a36Sopenharmony_ci	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
252762306a36Sopenharmony_ci}
252862306a36Sopenharmony_ci
252962306a36Sopenharmony_cistatic ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
253062306a36Sopenharmony_ci{
253162306a36Sopenharmony_ci	struct r5conf *conf;
253262306a36Sopenharmony_ci	int ret;
253362306a36Sopenharmony_ci
253462306a36Sopenharmony_ci	ret = mddev_lock(mddev);
253562306a36Sopenharmony_ci	if (ret)
253662306a36Sopenharmony_ci		return ret;
253762306a36Sopenharmony_ci
253862306a36Sopenharmony_ci	conf = mddev->private;
253962306a36Sopenharmony_ci	if (!conf || !conf->log)
254062306a36Sopenharmony_ci		goto out_unlock;
254162306a36Sopenharmony_ci
254262306a36Sopenharmony_ci	switch (conf->log->r5c_journal_mode) {
254362306a36Sopenharmony_ci	case R5C_JOURNAL_MODE_WRITE_THROUGH:
254462306a36Sopenharmony_ci		ret = snprintf(
254562306a36Sopenharmony_ci			page, PAGE_SIZE, "[%s] %s\n",
254662306a36Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
254762306a36Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
254862306a36Sopenharmony_ci		break;
254962306a36Sopenharmony_ci	case R5C_JOURNAL_MODE_WRITE_BACK:
255062306a36Sopenharmony_ci		ret = snprintf(
255162306a36Sopenharmony_ci			page, PAGE_SIZE, "%s [%s]\n",
255262306a36Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
255362306a36Sopenharmony_ci			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
255462306a36Sopenharmony_ci		break;
255562306a36Sopenharmony_ci	default:
255662306a36Sopenharmony_ci		ret = 0;
255762306a36Sopenharmony_ci	}
255862306a36Sopenharmony_ci
255962306a36Sopenharmony_ciout_unlock:
256062306a36Sopenharmony_ci	mddev_unlock(mddev);
256162306a36Sopenharmony_ci	return ret;
256262306a36Sopenharmony_ci}
256362306a36Sopenharmony_ci
256462306a36Sopenharmony_ci/*
256562306a36Sopenharmony_ci * Set journal cache mode on @mddev (external API initially needed by dm-raid).
256662306a36Sopenharmony_ci *
256762306a36Sopenharmony_ci * @mode as defined in 'enum r5c_journal_mode'.
256862306a36Sopenharmony_ci *
256962306a36Sopenharmony_ci */
257062306a36Sopenharmony_ciint r5c_journal_mode_set(struct mddev *mddev, int mode)
257162306a36Sopenharmony_ci{
257262306a36Sopenharmony_ci	struct r5conf *conf;
257362306a36Sopenharmony_ci
257462306a36Sopenharmony_ci	if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH ||
257562306a36Sopenharmony_ci	    mode > R5C_JOURNAL_MODE_WRITE_BACK)
257662306a36Sopenharmony_ci		return -EINVAL;
257762306a36Sopenharmony_ci
257862306a36Sopenharmony_ci	conf = mddev->private;
257962306a36Sopenharmony_ci	if (!conf || !conf->log)
258062306a36Sopenharmony_ci		return -ENODEV;
258162306a36Sopenharmony_ci
258262306a36Sopenharmony_ci	if (raid5_calc_degraded(conf) > 0 &&
258362306a36Sopenharmony_ci	    mode == R5C_JOURNAL_MODE_WRITE_BACK)
258462306a36Sopenharmony_ci		return -EINVAL;
258562306a36Sopenharmony_ci
258662306a36Sopenharmony_ci	mddev_suspend(mddev);
258762306a36Sopenharmony_ci	conf->log->r5c_journal_mode = mode;
258862306a36Sopenharmony_ci	mddev_resume(mddev);
258962306a36Sopenharmony_ci
259062306a36Sopenharmony_ci	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
259162306a36Sopenharmony_ci		 mdname(mddev), mode, r5c_journal_mode_str[mode]);
259262306a36Sopenharmony_ci	return 0;
259362306a36Sopenharmony_ci}
259462306a36Sopenharmony_ciEXPORT_SYMBOL(r5c_journal_mode_set);
259562306a36Sopenharmony_ci
259662306a36Sopenharmony_cistatic ssize_t r5c_journal_mode_store(struct mddev *mddev,
259762306a36Sopenharmony_ci				      const char *page, size_t length)
259862306a36Sopenharmony_ci{
259962306a36Sopenharmony_ci	int mode = ARRAY_SIZE(r5c_journal_mode_str);
260062306a36Sopenharmony_ci	size_t len = length;
260162306a36Sopenharmony_ci	int ret;
260262306a36Sopenharmony_ci
260362306a36Sopenharmony_ci	if (len < 2)
260462306a36Sopenharmony_ci		return -EINVAL;
260562306a36Sopenharmony_ci
260662306a36Sopenharmony_ci	if (page[len - 1] == '\n')
260762306a36Sopenharmony_ci		len--;
260862306a36Sopenharmony_ci
260962306a36Sopenharmony_ci	while (mode--)
261062306a36Sopenharmony_ci		if (strlen(r5c_journal_mode_str[mode]) == len &&
261162306a36Sopenharmony_ci		    !strncmp(page, r5c_journal_mode_str[mode], len))
261262306a36Sopenharmony_ci			break;
261362306a36Sopenharmony_ci	ret = mddev_lock(mddev);
261462306a36Sopenharmony_ci	if (ret)
261562306a36Sopenharmony_ci		return ret;
261662306a36Sopenharmony_ci	ret = r5c_journal_mode_set(mddev, mode);
261762306a36Sopenharmony_ci	mddev_unlock(mddev);
261862306a36Sopenharmony_ci	return ret ?: length;
261962306a36Sopenharmony_ci}
262062306a36Sopenharmony_ci
262162306a36Sopenharmony_cistruct md_sysfs_entry
262262306a36Sopenharmony_cir5c_journal_mode = __ATTR(journal_mode, 0644,
262362306a36Sopenharmony_ci			  r5c_journal_mode_show, r5c_journal_mode_store);
262462306a36Sopenharmony_ci
262562306a36Sopenharmony_ci/*
262662306a36Sopenharmony_ci * Try handle write operation in caching phase. This function should only
262762306a36Sopenharmony_ci * be called in write-back mode.
262862306a36Sopenharmony_ci *
262962306a36Sopenharmony_ci * If all outstanding writes can be handled in caching phase, returns 0
263062306a36Sopenharmony_ci * If writes requires write-out phase, call r5c_make_stripe_write_out()
263162306a36Sopenharmony_ci * and returns -EAGAIN
263262306a36Sopenharmony_ci */
263362306a36Sopenharmony_ciint r5c_try_caching_write(struct r5conf *conf,
263462306a36Sopenharmony_ci			  struct stripe_head *sh,
263562306a36Sopenharmony_ci			  struct stripe_head_state *s,
263662306a36Sopenharmony_ci			  int disks)
263762306a36Sopenharmony_ci{
263862306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
263962306a36Sopenharmony_ci	int i;
264062306a36Sopenharmony_ci	struct r5dev *dev;
264162306a36Sopenharmony_ci	int to_cache = 0;
264262306a36Sopenharmony_ci	void __rcu **pslot;
264362306a36Sopenharmony_ci	sector_t tree_index;
264462306a36Sopenharmony_ci	int ret;
264562306a36Sopenharmony_ci	uintptr_t refcount;
264662306a36Sopenharmony_ci
264762306a36Sopenharmony_ci	BUG_ON(!r5c_is_writeback(log));
264862306a36Sopenharmony_ci
264962306a36Sopenharmony_ci	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
265062306a36Sopenharmony_ci		/*
265162306a36Sopenharmony_ci		 * There are two different scenarios here:
265262306a36Sopenharmony_ci		 *  1. The stripe has some data cached, and it is sent to
265362306a36Sopenharmony_ci		 *     write-out phase for reclaim
265462306a36Sopenharmony_ci		 *  2. The stripe is clean, and this is the first write
265562306a36Sopenharmony_ci		 *
265662306a36Sopenharmony_ci		 * For 1, return -EAGAIN, so we continue with
265762306a36Sopenharmony_ci		 * handle_stripe_dirtying().
265862306a36Sopenharmony_ci		 *
265962306a36Sopenharmony_ci		 * For 2, set STRIPE_R5C_CACHING and continue with caching
266062306a36Sopenharmony_ci		 * write.
266162306a36Sopenharmony_ci		 */
266262306a36Sopenharmony_ci
266362306a36Sopenharmony_ci		/* case 1: anything injournal or anything in written */
266462306a36Sopenharmony_ci		if (s->injournal > 0 || s->written > 0)
266562306a36Sopenharmony_ci			return -EAGAIN;
266662306a36Sopenharmony_ci		/* case 2 */
266762306a36Sopenharmony_ci		set_bit(STRIPE_R5C_CACHING, &sh->state);
266862306a36Sopenharmony_ci	}
266962306a36Sopenharmony_ci
267062306a36Sopenharmony_ci	/*
267162306a36Sopenharmony_ci	 * When run in degraded mode, array is set to write-through mode.
267262306a36Sopenharmony_ci	 * This check helps drain pending write safely in the transition to
267362306a36Sopenharmony_ci	 * write-through mode.
267462306a36Sopenharmony_ci	 *
267562306a36Sopenharmony_ci	 * When a stripe is syncing, the write is also handled in write
267662306a36Sopenharmony_ci	 * through mode.
267762306a36Sopenharmony_ci	 */
267862306a36Sopenharmony_ci	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
267962306a36Sopenharmony_ci		r5c_make_stripe_write_out(sh);
268062306a36Sopenharmony_ci		return -EAGAIN;
268162306a36Sopenharmony_ci	}
268262306a36Sopenharmony_ci
268362306a36Sopenharmony_ci	for (i = disks; i--; ) {
268462306a36Sopenharmony_ci		dev = &sh->dev[i];
268562306a36Sopenharmony_ci		/* if non-overwrite, use writing-out phase */
268662306a36Sopenharmony_ci		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
268762306a36Sopenharmony_ci		    !test_bit(R5_InJournal, &dev->flags)) {
268862306a36Sopenharmony_ci			r5c_make_stripe_write_out(sh);
268962306a36Sopenharmony_ci			return -EAGAIN;
269062306a36Sopenharmony_ci		}
269162306a36Sopenharmony_ci	}
269262306a36Sopenharmony_ci
269362306a36Sopenharmony_ci	/* if the stripe is not counted in big_stripe_tree, add it now */
269462306a36Sopenharmony_ci	if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
269562306a36Sopenharmony_ci	    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
269662306a36Sopenharmony_ci		tree_index = r5c_tree_index(conf, sh->sector);
269762306a36Sopenharmony_ci		spin_lock(&log->tree_lock);
269862306a36Sopenharmony_ci		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
269962306a36Sopenharmony_ci					       tree_index);
270062306a36Sopenharmony_ci		if (pslot) {
270162306a36Sopenharmony_ci			refcount = (uintptr_t)radix_tree_deref_slot_protected(
270262306a36Sopenharmony_ci				pslot, &log->tree_lock) >>
270362306a36Sopenharmony_ci				R5C_RADIX_COUNT_SHIFT;
270462306a36Sopenharmony_ci			radix_tree_replace_slot(
270562306a36Sopenharmony_ci				&log->big_stripe_tree, pslot,
270662306a36Sopenharmony_ci				(void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
270762306a36Sopenharmony_ci		} else {
270862306a36Sopenharmony_ci			/*
270962306a36Sopenharmony_ci			 * this radix_tree_insert can fail safely, so no
271062306a36Sopenharmony_ci			 * need to call radix_tree_preload()
271162306a36Sopenharmony_ci			 */
271262306a36Sopenharmony_ci			ret = radix_tree_insert(
271362306a36Sopenharmony_ci				&log->big_stripe_tree, tree_index,
271462306a36Sopenharmony_ci				(void *)(1 << R5C_RADIX_COUNT_SHIFT));
271562306a36Sopenharmony_ci			if (ret) {
271662306a36Sopenharmony_ci				spin_unlock(&log->tree_lock);
271762306a36Sopenharmony_ci				r5c_make_stripe_write_out(sh);
271862306a36Sopenharmony_ci				return -EAGAIN;
271962306a36Sopenharmony_ci			}
272062306a36Sopenharmony_ci		}
272162306a36Sopenharmony_ci		spin_unlock(&log->tree_lock);
272262306a36Sopenharmony_ci
272362306a36Sopenharmony_ci		/*
272462306a36Sopenharmony_ci		 * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is
272562306a36Sopenharmony_ci		 * counted in the radix tree
272662306a36Sopenharmony_ci		 */
272762306a36Sopenharmony_ci		set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
272862306a36Sopenharmony_ci		atomic_inc(&conf->r5c_cached_partial_stripes);
272962306a36Sopenharmony_ci	}
273062306a36Sopenharmony_ci
273162306a36Sopenharmony_ci	for (i = disks; i--; ) {
273262306a36Sopenharmony_ci		dev = &sh->dev[i];
273362306a36Sopenharmony_ci		if (dev->towrite) {
273462306a36Sopenharmony_ci			set_bit(R5_Wantwrite, &dev->flags);
273562306a36Sopenharmony_ci			set_bit(R5_Wantdrain, &dev->flags);
273662306a36Sopenharmony_ci			set_bit(R5_LOCKED, &dev->flags);
273762306a36Sopenharmony_ci			to_cache++;
273862306a36Sopenharmony_ci		}
273962306a36Sopenharmony_ci	}
274062306a36Sopenharmony_ci
274162306a36Sopenharmony_ci	if (to_cache) {
274262306a36Sopenharmony_ci		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
274362306a36Sopenharmony_ci		/*
274462306a36Sopenharmony_ci		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
274562306a36Sopenharmony_ci		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
274662306a36Sopenharmony_ci		 * r5c_handle_data_cached()
274762306a36Sopenharmony_ci		 */
274862306a36Sopenharmony_ci		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
274962306a36Sopenharmony_ci	}
275062306a36Sopenharmony_ci
275162306a36Sopenharmony_ci	return 0;
275262306a36Sopenharmony_ci}
275362306a36Sopenharmony_ci
275462306a36Sopenharmony_ci/*
275562306a36Sopenharmony_ci * free extra pages (orig_page) we allocated for prexor
275662306a36Sopenharmony_ci */
275762306a36Sopenharmony_civoid r5c_release_extra_page(struct stripe_head *sh)
275862306a36Sopenharmony_ci{
275962306a36Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
276062306a36Sopenharmony_ci	int i;
276162306a36Sopenharmony_ci	bool using_disk_info_extra_page;
276262306a36Sopenharmony_ci
276362306a36Sopenharmony_ci	using_disk_info_extra_page =
276462306a36Sopenharmony_ci		sh->dev[0].orig_page == conf->disks[0].extra_page;
276562306a36Sopenharmony_ci
276662306a36Sopenharmony_ci	for (i = sh->disks; i--; )
276762306a36Sopenharmony_ci		if (sh->dev[i].page != sh->dev[i].orig_page) {
276862306a36Sopenharmony_ci			struct page *p = sh->dev[i].orig_page;
276962306a36Sopenharmony_ci
277062306a36Sopenharmony_ci			sh->dev[i].orig_page = sh->dev[i].page;
277162306a36Sopenharmony_ci			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
277262306a36Sopenharmony_ci
277362306a36Sopenharmony_ci			if (!using_disk_info_extra_page)
277462306a36Sopenharmony_ci				put_page(p);
277562306a36Sopenharmony_ci		}
277662306a36Sopenharmony_ci
277762306a36Sopenharmony_ci	if (using_disk_info_extra_page) {
277862306a36Sopenharmony_ci		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
277962306a36Sopenharmony_ci		md_wakeup_thread(conf->mddev->thread);
278062306a36Sopenharmony_ci	}
278162306a36Sopenharmony_ci}
278262306a36Sopenharmony_ci
278362306a36Sopenharmony_civoid r5c_use_extra_page(struct stripe_head *sh)
278462306a36Sopenharmony_ci{
278562306a36Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
278662306a36Sopenharmony_ci	int i;
278762306a36Sopenharmony_ci	struct r5dev *dev;
278862306a36Sopenharmony_ci
278962306a36Sopenharmony_ci	for (i = sh->disks; i--; ) {
279062306a36Sopenharmony_ci		dev = &sh->dev[i];
279162306a36Sopenharmony_ci		if (dev->orig_page != dev->page)
279262306a36Sopenharmony_ci			put_page(dev->orig_page);
279362306a36Sopenharmony_ci		dev->orig_page = conf->disks[i].extra_page;
279462306a36Sopenharmony_ci	}
279562306a36Sopenharmony_ci}
279662306a36Sopenharmony_ci
279762306a36Sopenharmony_ci/*
279862306a36Sopenharmony_ci * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
279962306a36Sopenharmony_ci * stripe is committed to RAID disks.
280062306a36Sopenharmony_ci */
280162306a36Sopenharmony_civoid r5c_finish_stripe_write_out(struct r5conf *conf,
280262306a36Sopenharmony_ci				 struct stripe_head *sh,
280362306a36Sopenharmony_ci				 struct stripe_head_state *s)
280462306a36Sopenharmony_ci{
280562306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
280662306a36Sopenharmony_ci	int i;
280762306a36Sopenharmony_ci	int do_wakeup = 0;
280862306a36Sopenharmony_ci	sector_t tree_index;
280962306a36Sopenharmony_ci	void __rcu **pslot;
281062306a36Sopenharmony_ci	uintptr_t refcount;
281162306a36Sopenharmony_ci
281262306a36Sopenharmony_ci	if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
281362306a36Sopenharmony_ci		return;
281462306a36Sopenharmony_ci
281562306a36Sopenharmony_ci	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
281662306a36Sopenharmony_ci	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
281762306a36Sopenharmony_ci
281862306a36Sopenharmony_ci	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
281962306a36Sopenharmony_ci		return;
282062306a36Sopenharmony_ci
282162306a36Sopenharmony_ci	for (i = sh->disks; i--; ) {
282262306a36Sopenharmony_ci		clear_bit(R5_InJournal, &sh->dev[i].flags);
282362306a36Sopenharmony_ci		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
282462306a36Sopenharmony_ci			do_wakeup = 1;
282562306a36Sopenharmony_ci	}
282662306a36Sopenharmony_ci
282762306a36Sopenharmony_ci	/*
282862306a36Sopenharmony_ci	 * analyse_stripe() runs before r5c_finish_stripe_write_out(),
282962306a36Sopenharmony_ci	 * We updated R5_InJournal, so we also update s->injournal.
283062306a36Sopenharmony_ci	 */
283162306a36Sopenharmony_ci	s->injournal = 0;
283262306a36Sopenharmony_ci
283362306a36Sopenharmony_ci	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
283462306a36Sopenharmony_ci		if (atomic_dec_and_test(&conf->pending_full_writes))
283562306a36Sopenharmony_ci			md_wakeup_thread(conf->mddev->thread);
283662306a36Sopenharmony_ci
283762306a36Sopenharmony_ci	if (do_wakeup)
283862306a36Sopenharmony_ci		wake_up(&conf->wait_for_overlap);
283962306a36Sopenharmony_ci
284062306a36Sopenharmony_ci	spin_lock_irq(&log->stripe_in_journal_lock);
284162306a36Sopenharmony_ci	list_del_init(&sh->r5c);
284262306a36Sopenharmony_ci	spin_unlock_irq(&log->stripe_in_journal_lock);
284362306a36Sopenharmony_ci	sh->log_start = MaxSector;
284462306a36Sopenharmony_ci
284562306a36Sopenharmony_ci	atomic_dec(&log->stripe_in_journal_count);
284662306a36Sopenharmony_ci	r5c_update_log_state(log);
284762306a36Sopenharmony_ci
284862306a36Sopenharmony_ci	/* stop counting this stripe in big_stripe_tree */
284962306a36Sopenharmony_ci	if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
285062306a36Sopenharmony_ci	    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
285162306a36Sopenharmony_ci		tree_index = r5c_tree_index(conf, sh->sector);
285262306a36Sopenharmony_ci		spin_lock(&log->tree_lock);
285362306a36Sopenharmony_ci		pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
285462306a36Sopenharmony_ci					       tree_index);
285562306a36Sopenharmony_ci		BUG_ON(pslot == NULL);
285662306a36Sopenharmony_ci		refcount = (uintptr_t)radix_tree_deref_slot_protected(
285762306a36Sopenharmony_ci			pslot, &log->tree_lock) >>
285862306a36Sopenharmony_ci			R5C_RADIX_COUNT_SHIFT;
285962306a36Sopenharmony_ci		if (refcount == 1)
286062306a36Sopenharmony_ci			radix_tree_delete(&log->big_stripe_tree, tree_index);
286162306a36Sopenharmony_ci		else
286262306a36Sopenharmony_ci			radix_tree_replace_slot(
286362306a36Sopenharmony_ci				&log->big_stripe_tree, pslot,
286462306a36Sopenharmony_ci				(void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
286562306a36Sopenharmony_ci		spin_unlock(&log->tree_lock);
286662306a36Sopenharmony_ci	}
286762306a36Sopenharmony_ci
286862306a36Sopenharmony_ci	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
286962306a36Sopenharmony_ci		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
287062306a36Sopenharmony_ci		atomic_dec(&conf->r5c_flushing_partial_stripes);
287162306a36Sopenharmony_ci		atomic_dec(&conf->r5c_cached_partial_stripes);
287262306a36Sopenharmony_ci	}
287362306a36Sopenharmony_ci
287462306a36Sopenharmony_ci	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
287562306a36Sopenharmony_ci		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
287662306a36Sopenharmony_ci		atomic_dec(&conf->r5c_flushing_full_stripes);
287762306a36Sopenharmony_ci		atomic_dec(&conf->r5c_cached_full_stripes);
287862306a36Sopenharmony_ci	}
287962306a36Sopenharmony_ci
288062306a36Sopenharmony_ci	r5l_append_flush_payload(log, sh->sector);
288162306a36Sopenharmony_ci	/* stripe is flused to raid disks, we can do resync now */
288262306a36Sopenharmony_ci	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
288362306a36Sopenharmony_ci		set_bit(STRIPE_HANDLE, &sh->state);
288462306a36Sopenharmony_ci}
288562306a36Sopenharmony_ci
288662306a36Sopenharmony_ciint r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
288762306a36Sopenharmony_ci{
288862306a36Sopenharmony_ci	struct r5conf *conf = sh->raid_conf;
288962306a36Sopenharmony_ci	int pages = 0;
289062306a36Sopenharmony_ci	int reserve;
289162306a36Sopenharmony_ci	int i;
289262306a36Sopenharmony_ci	int ret = 0;
289362306a36Sopenharmony_ci
289462306a36Sopenharmony_ci	BUG_ON(!log);
289562306a36Sopenharmony_ci
289662306a36Sopenharmony_ci	for (i = 0; i < sh->disks; i++) {
289762306a36Sopenharmony_ci		void *addr;
289862306a36Sopenharmony_ci
289962306a36Sopenharmony_ci		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
290062306a36Sopenharmony_ci			continue;
290162306a36Sopenharmony_ci		addr = kmap_atomic(sh->dev[i].page);
290262306a36Sopenharmony_ci		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
290362306a36Sopenharmony_ci						    addr, PAGE_SIZE);
290462306a36Sopenharmony_ci		kunmap_atomic(addr);
290562306a36Sopenharmony_ci		pages++;
290662306a36Sopenharmony_ci	}
290762306a36Sopenharmony_ci	WARN_ON(pages == 0);
290862306a36Sopenharmony_ci
290962306a36Sopenharmony_ci	/*
291062306a36Sopenharmony_ci	 * The stripe must enter state machine again to call endio, so
291162306a36Sopenharmony_ci	 * don't delay.
291262306a36Sopenharmony_ci	 */
291362306a36Sopenharmony_ci	clear_bit(STRIPE_DELAYED, &sh->state);
291462306a36Sopenharmony_ci	atomic_inc(&sh->count);
291562306a36Sopenharmony_ci
291662306a36Sopenharmony_ci	mutex_lock(&log->io_mutex);
291762306a36Sopenharmony_ci	/* meta + data */
291862306a36Sopenharmony_ci	reserve = (1 + pages) << (PAGE_SHIFT - 9);
291962306a36Sopenharmony_ci
292062306a36Sopenharmony_ci	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
292162306a36Sopenharmony_ci	    sh->log_start == MaxSector)
292262306a36Sopenharmony_ci		r5l_add_no_space_stripe(log, sh);
292362306a36Sopenharmony_ci	else if (!r5l_has_free_space(log, reserve)) {
292462306a36Sopenharmony_ci		if (sh->log_start == log->last_checkpoint)
292562306a36Sopenharmony_ci			BUG();
292662306a36Sopenharmony_ci		else
292762306a36Sopenharmony_ci			r5l_add_no_space_stripe(log, sh);
292862306a36Sopenharmony_ci	} else {
292962306a36Sopenharmony_ci		ret = r5l_log_stripe(log, sh, pages, 0);
293062306a36Sopenharmony_ci		if (ret) {
293162306a36Sopenharmony_ci			spin_lock_irq(&log->io_list_lock);
293262306a36Sopenharmony_ci			list_add_tail(&sh->log_list, &log->no_mem_stripes);
293362306a36Sopenharmony_ci			spin_unlock_irq(&log->io_list_lock);
293462306a36Sopenharmony_ci		}
293562306a36Sopenharmony_ci	}
293662306a36Sopenharmony_ci
293762306a36Sopenharmony_ci	mutex_unlock(&log->io_mutex);
293862306a36Sopenharmony_ci	return 0;
293962306a36Sopenharmony_ci}
294062306a36Sopenharmony_ci
294162306a36Sopenharmony_ci/* check whether this big stripe is in write back cache. */
294262306a36Sopenharmony_cibool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
294362306a36Sopenharmony_ci{
294462306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
294562306a36Sopenharmony_ci	sector_t tree_index;
294662306a36Sopenharmony_ci	void *slot;
294762306a36Sopenharmony_ci
294862306a36Sopenharmony_ci	if (!log)
294962306a36Sopenharmony_ci		return false;
295062306a36Sopenharmony_ci
295162306a36Sopenharmony_ci	WARN_ON_ONCE(!rcu_read_lock_held());
295262306a36Sopenharmony_ci	tree_index = r5c_tree_index(conf, sect);
295362306a36Sopenharmony_ci	slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
295462306a36Sopenharmony_ci	return slot != NULL;
295562306a36Sopenharmony_ci}
295662306a36Sopenharmony_ci
295762306a36Sopenharmony_cistatic int r5l_load_log(struct r5l_log *log)
295862306a36Sopenharmony_ci{
295962306a36Sopenharmony_ci	struct md_rdev *rdev = log->rdev;
296062306a36Sopenharmony_ci	struct page *page;
296162306a36Sopenharmony_ci	struct r5l_meta_block *mb;
296262306a36Sopenharmony_ci	sector_t cp = log->rdev->journal_tail;
296362306a36Sopenharmony_ci	u32 stored_crc, expected_crc;
296462306a36Sopenharmony_ci	bool create_super = false;
296562306a36Sopenharmony_ci	int ret = 0;
296662306a36Sopenharmony_ci
296762306a36Sopenharmony_ci	/* Make sure it's valid */
296862306a36Sopenharmony_ci	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
296962306a36Sopenharmony_ci		cp = 0;
297062306a36Sopenharmony_ci	page = alloc_page(GFP_KERNEL);
297162306a36Sopenharmony_ci	if (!page)
297262306a36Sopenharmony_ci		return -ENOMEM;
297362306a36Sopenharmony_ci
297462306a36Sopenharmony_ci	if (!sync_page_io(rdev, cp, PAGE_SIZE, page, REQ_OP_READ, false)) {
297562306a36Sopenharmony_ci		ret = -EIO;
297662306a36Sopenharmony_ci		goto ioerr;
297762306a36Sopenharmony_ci	}
297862306a36Sopenharmony_ci	mb = page_address(page);
297962306a36Sopenharmony_ci
298062306a36Sopenharmony_ci	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
298162306a36Sopenharmony_ci	    mb->version != R5LOG_VERSION) {
298262306a36Sopenharmony_ci		create_super = true;
298362306a36Sopenharmony_ci		goto create;
298462306a36Sopenharmony_ci	}
298562306a36Sopenharmony_ci	stored_crc = le32_to_cpu(mb->checksum);
298662306a36Sopenharmony_ci	mb->checksum = 0;
298762306a36Sopenharmony_ci	expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
298862306a36Sopenharmony_ci	if (stored_crc != expected_crc) {
298962306a36Sopenharmony_ci		create_super = true;
299062306a36Sopenharmony_ci		goto create;
299162306a36Sopenharmony_ci	}
299262306a36Sopenharmony_ci	if (le64_to_cpu(mb->position) != cp) {
299362306a36Sopenharmony_ci		create_super = true;
299462306a36Sopenharmony_ci		goto create;
299562306a36Sopenharmony_ci	}
299662306a36Sopenharmony_cicreate:
299762306a36Sopenharmony_ci	if (create_super) {
299862306a36Sopenharmony_ci		log->last_cp_seq = get_random_u32();
299962306a36Sopenharmony_ci		cp = 0;
300062306a36Sopenharmony_ci		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
300162306a36Sopenharmony_ci		/*
300262306a36Sopenharmony_ci		 * Make sure super points to correct address. Log might have
300362306a36Sopenharmony_ci		 * data very soon. If super hasn't correct log tail address,
300462306a36Sopenharmony_ci		 * recovery can't find the log
300562306a36Sopenharmony_ci		 */
300662306a36Sopenharmony_ci		r5l_write_super(log, cp);
300762306a36Sopenharmony_ci	} else
300862306a36Sopenharmony_ci		log->last_cp_seq = le64_to_cpu(mb->seq);
300962306a36Sopenharmony_ci
301062306a36Sopenharmony_ci	log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
301162306a36Sopenharmony_ci	log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
301262306a36Sopenharmony_ci	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
301362306a36Sopenharmony_ci		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
301462306a36Sopenharmony_ci	log->last_checkpoint = cp;
301562306a36Sopenharmony_ci
301662306a36Sopenharmony_ci	__free_page(page);
301762306a36Sopenharmony_ci
301862306a36Sopenharmony_ci	if (create_super) {
301962306a36Sopenharmony_ci		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
302062306a36Sopenharmony_ci		log->seq = log->last_cp_seq + 1;
302162306a36Sopenharmony_ci		log->next_checkpoint = cp;
302262306a36Sopenharmony_ci	} else
302362306a36Sopenharmony_ci		ret = r5l_recovery_log(log);
302462306a36Sopenharmony_ci
302562306a36Sopenharmony_ci	r5c_update_log_state(log);
302662306a36Sopenharmony_ci	return ret;
302762306a36Sopenharmony_ciioerr:
302862306a36Sopenharmony_ci	__free_page(page);
302962306a36Sopenharmony_ci	return ret;
303062306a36Sopenharmony_ci}
303162306a36Sopenharmony_ci
303262306a36Sopenharmony_ciint r5l_start(struct r5l_log *log)
303362306a36Sopenharmony_ci{
303462306a36Sopenharmony_ci	int ret;
303562306a36Sopenharmony_ci
303662306a36Sopenharmony_ci	if (!log)
303762306a36Sopenharmony_ci		return 0;
303862306a36Sopenharmony_ci
303962306a36Sopenharmony_ci	ret = r5l_load_log(log);
304062306a36Sopenharmony_ci	if (ret) {
304162306a36Sopenharmony_ci		struct mddev *mddev = log->rdev->mddev;
304262306a36Sopenharmony_ci		struct r5conf *conf = mddev->private;
304362306a36Sopenharmony_ci
304462306a36Sopenharmony_ci		r5l_exit_log(conf);
304562306a36Sopenharmony_ci	}
304662306a36Sopenharmony_ci	return ret;
304762306a36Sopenharmony_ci}
304862306a36Sopenharmony_ci
304962306a36Sopenharmony_civoid r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
305062306a36Sopenharmony_ci{
305162306a36Sopenharmony_ci	struct r5conf *conf = mddev->private;
305262306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
305362306a36Sopenharmony_ci
305462306a36Sopenharmony_ci	if (!log)
305562306a36Sopenharmony_ci		return;
305662306a36Sopenharmony_ci
305762306a36Sopenharmony_ci	if ((raid5_calc_degraded(conf) > 0 ||
305862306a36Sopenharmony_ci	     test_bit(Journal, &rdev->flags)) &&
305962306a36Sopenharmony_ci	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
306062306a36Sopenharmony_ci		schedule_work(&log->disable_writeback_work);
306162306a36Sopenharmony_ci}
306262306a36Sopenharmony_ci
306362306a36Sopenharmony_ciint r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
306462306a36Sopenharmony_ci{
306562306a36Sopenharmony_ci	struct r5l_log *log;
306662306a36Sopenharmony_ci	struct md_thread *thread;
306762306a36Sopenharmony_ci	int ret;
306862306a36Sopenharmony_ci
306962306a36Sopenharmony_ci	pr_debug("md/raid:%s: using device %pg as journal\n",
307062306a36Sopenharmony_ci		 mdname(conf->mddev), rdev->bdev);
307162306a36Sopenharmony_ci
307262306a36Sopenharmony_ci	if (PAGE_SIZE != 4096)
307362306a36Sopenharmony_ci		return -EINVAL;
307462306a36Sopenharmony_ci
307562306a36Sopenharmony_ci	/*
307662306a36Sopenharmony_ci	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
307762306a36Sopenharmony_ci	 * raid_disks r5l_payload_data_parity.
307862306a36Sopenharmony_ci	 *
307962306a36Sopenharmony_ci	 * Write journal and cache does not work for very big array
308062306a36Sopenharmony_ci	 * (raid_disks > 203)
308162306a36Sopenharmony_ci	 */
308262306a36Sopenharmony_ci	if (sizeof(struct r5l_meta_block) +
308362306a36Sopenharmony_ci	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
308462306a36Sopenharmony_ci	     conf->raid_disks) > PAGE_SIZE) {
308562306a36Sopenharmony_ci		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
308662306a36Sopenharmony_ci		       mdname(conf->mddev), conf->raid_disks);
308762306a36Sopenharmony_ci		return -EINVAL;
308862306a36Sopenharmony_ci	}
308962306a36Sopenharmony_ci
309062306a36Sopenharmony_ci	log = kzalloc(sizeof(*log), GFP_KERNEL);
309162306a36Sopenharmony_ci	if (!log)
309262306a36Sopenharmony_ci		return -ENOMEM;
309362306a36Sopenharmony_ci	log->rdev = rdev;
309462306a36Sopenharmony_ci	log->need_cache_flush = bdev_write_cache(rdev->bdev);
309562306a36Sopenharmony_ci	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
309662306a36Sopenharmony_ci				       sizeof(rdev->mddev->uuid));
309762306a36Sopenharmony_ci
309862306a36Sopenharmony_ci	mutex_init(&log->io_mutex);
309962306a36Sopenharmony_ci
310062306a36Sopenharmony_ci	spin_lock_init(&log->io_list_lock);
310162306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->running_ios);
310262306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->io_end_ios);
310362306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->flushing_ios);
310462306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->finished_ios);
310562306a36Sopenharmony_ci
310662306a36Sopenharmony_ci	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
310762306a36Sopenharmony_ci	if (!log->io_kc)
310862306a36Sopenharmony_ci		goto io_kc;
310962306a36Sopenharmony_ci
311062306a36Sopenharmony_ci	ret = mempool_init_slab_pool(&log->io_pool, R5L_POOL_SIZE, log->io_kc);
311162306a36Sopenharmony_ci	if (ret)
311262306a36Sopenharmony_ci		goto io_pool;
311362306a36Sopenharmony_ci
311462306a36Sopenharmony_ci	ret = bioset_init(&log->bs, R5L_POOL_SIZE, 0, BIOSET_NEED_BVECS);
311562306a36Sopenharmony_ci	if (ret)
311662306a36Sopenharmony_ci		goto io_bs;
311762306a36Sopenharmony_ci
311862306a36Sopenharmony_ci	ret = mempool_init_page_pool(&log->meta_pool, R5L_POOL_SIZE, 0);
311962306a36Sopenharmony_ci	if (ret)
312062306a36Sopenharmony_ci		goto out_mempool;
312162306a36Sopenharmony_ci
312262306a36Sopenharmony_ci	spin_lock_init(&log->tree_lock);
312362306a36Sopenharmony_ci	INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
312462306a36Sopenharmony_ci
312562306a36Sopenharmony_ci	thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev,
312662306a36Sopenharmony_ci				    "reclaim");
312762306a36Sopenharmony_ci	if (!thread)
312862306a36Sopenharmony_ci		goto reclaim_thread;
312962306a36Sopenharmony_ci
313062306a36Sopenharmony_ci	thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
313162306a36Sopenharmony_ci	rcu_assign_pointer(log->reclaim_thread, thread);
313262306a36Sopenharmony_ci
313362306a36Sopenharmony_ci	init_waitqueue_head(&log->iounit_wait);
313462306a36Sopenharmony_ci
313562306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->no_mem_stripes);
313662306a36Sopenharmony_ci
313762306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->no_space_stripes);
313862306a36Sopenharmony_ci	spin_lock_init(&log->no_space_stripes_lock);
313962306a36Sopenharmony_ci
314062306a36Sopenharmony_ci	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
314162306a36Sopenharmony_ci	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
314262306a36Sopenharmony_ci
314362306a36Sopenharmony_ci	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
314462306a36Sopenharmony_ci	INIT_LIST_HEAD(&log->stripe_in_journal_list);
314562306a36Sopenharmony_ci	spin_lock_init(&log->stripe_in_journal_lock);
314662306a36Sopenharmony_ci	atomic_set(&log->stripe_in_journal_count, 0);
314762306a36Sopenharmony_ci
314862306a36Sopenharmony_ci	conf->log = log;
314962306a36Sopenharmony_ci
315062306a36Sopenharmony_ci	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
315162306a36Sopenharmony_ci	return 0;
315262306a36Sopenharmony_ci
315362306a36Sopenharmony_cireclaim_thread:
315462306a36Sopenharmony_ci	mempool_exit(&log->meta_pool);
315562306a36Sopenharmony_ciout_mempool:
315662306a36Sopenharmony_ci	bioset_exit(&log->bs);
315762306a36Sopenharmony_ciio_bs:
315862306a36Sopenharmony_ci	mempool_exit(&log->io_pool);
315962306a36Sopenharmony_ciio_pool:
316062306a36Sopenharmony_ci	kmem_cache_destroy(log->io_kc);
316162306a36Sopenharmony_ciio_kc:
316262306a36Sopenharmony_ci	kfree(log);
316362306a36Sopenharmony_ci	return -EINVAL;
316462306a36Sopenharmony_ci}
316562306a36Sopenharmony_ci
316662306a36Sopenharmony_civoid r5l_exit_log(struct r5conf *conf)
316762306a36Sopenharmony_ci{
316862306a36Sopenharmony_ci	struct r5l_log *log = conf->log;
316962306a36Sopenharmony_ci
317062306a36Sopenharmony_ci	md_unregister_thread(conf->mddev, &log->reclaim_thread);
317162306a36Sopenharmony_ci
317262306a36Sopenharmony_ci	/*
317362306a36Sopenharmony_ci	 * 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
317462306a36Sopenharmony_ci	 * ensure disable_writeback_work wakes up and exits.
317562306a36Sopenharmony_ci	 */
317662306a36Sopenharmony_ci	conf->log = NULL;
317762306a36Sopenharmony_ci	wake_up(&conf->mddev->sb_wait);
317862306a36Sopenharmony_ci	flush_work(&log->disable_writeback_work);
317962306a36Sopenharmony_ci
318062306a36Sopenharmony_ci	mempool_exit(&log->meta_pool);
318162306a36Sopenharmony_ci	bioset_exit(&log->bs);
318262306a36Sopenharmony_ci	mempool_exit(&log->io_pool);
318362306a36Sopenharmony_ci	kmem_cache_destroy(log->io_kc);
318462306a36Sopenharmony_ci	kfree(log);
318562306a36Sopenharmony_ci}
3186