/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _RAID5_H
#define _RAID5_H

#include <linux/raid/xor.h>
#include <linux/dmaengine.h>

/*
 *
 * Each stripe contains one buffer per device.  Each buffer can be in
 * one of a number of states stored in "flags".  Changes between
 * these states happen *almost* exclusively under the protection of the
 * STRIPE_ACTIVE flag.  Some very specific changes can happen in bi_end_io, and
 * these are not protected by STRIPE_ACTIVE.
 *
 * The flag bits that are used to represent these states are:
 *   R5_UPTODATE and R5_LOCKED
 *
 * State Empty == !UPTODATE, !LOCK
 *        We have no data, and there is no active request
 * State Want == !UPTODATE, LOCK
 *        A read request is being submitted for this block
 * State Dirty == UPTODATE, LOCK
 *        Some new data is in this buffer, and it is being written out
 * State Clean == UPTODATE, !LOCK
 *        We have valid data which is the same as on disc
 *
 * The possible state transitions are:
 *
 *  Empty -> Want   - on read or write to get old data for parity calc
 *  Empty -> Dirty  - on compute_parity to satisfy write/sync request.
 *  Empty -> Clean  - on compute_block when computing a block for failed drive
 *  Want  -> Empty  - on failed read
 *  Want  -> Clean  - on successful completion of read request
 *  Dirty -> Clean  - on successful completion of write request
 *  Dirty -> Clean  - on failed write
 *  Clean -> Dirty  - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
 *
 * The Want->Empty, Want->Clean and Dirty->Clean transitions
 * all happen in b_end_io at interrupt time.
 * Each sets the Uptodate bit before releasing the Lock bit.
 * This leaves one multi-stage transition:
 *    Want->Dirty->Clean
 * This is safe because thinking that a Clean buffer is actually dirty
 * will at worst delay some action, and the stripe will be scheduled
 * for attention after the transition is complete.
 *
 * There is one possibility that is not covered by these states.  That
 * is if one drive has failed and there is a spare being rebuilt.  We
 * can't distinguish between a clean block that has been generated
 * from parity calculations, and a clean block that has been
 * successfully written to the spare (or to parity when resyncing).
 * To distinguish these states we have a stripe bit STRIPE_INSYNC that
 * is set whenever a write is scheduled to the spare, or to the parity
 * disc if there is no spare.  A sync request clears this bit, and
 * when we find it set with no buffers locked, we know the sync is
 * complete.
 *
 * Buffers for the md device that arrive via make_request are attached
 * to the appropriate stripe in one of two lists linked on b_reqnext.
 * One list (bh_read) for read requests, one (bh_write) for write.
 * There should never be more than one buffer on the two lists
 * together, but that is not guaranteed, so we allow for more.
 *
 * If a buffer is on the read list when the associated cache buffer is
 * Uptodate, the data is copied into the read buffer and its b_end_io
 * routine is called.  This may happen in the end_request routine only
 * if the buffer has just successfully been read.  end_request should
 * remove the buffers from the list and then set the Uptodate bit on
 * the buffer.  Other threads may do this only if they first check
 * that the Uptodate bit is set.  Once they have checked that they may
 * take buffers off the read queue.
 *
 * When a buffer on the write list is committed for write it is copied
 * into the cache buffer, which is then marked dirty, and moved onto a
 * third list, the written list (bh_written).  Once both the parity
 * block and the cached buffer are successfully written, any buffer on
 * a written list can be returned with b_end_io.
 *
 * The write list and read list both act as fifos.  The read list,
 * write list and written list are protected by the device_lock.
 * The device_lock is only for list manipulations and will only be
 * held for a very short time.  It can be claimed from interrupts.
 *
 *
 * Stripes in the stripe cache can be on one of two lists (or on
 * neither).  The "inactive_list" contains stripes which are not
 * currently being used for any request.  They can freely be reused
 * for another stripe.  The "handle_list" contains stripes that need
 * to be handled in some way.  Both of these are fifo queues.  Each
 * stripe is also (potentially) linked to a hash bucket in the hash
 * table so that it can be found by sector number.  Stripes that are
 * not hashed must be on the inactive_list, and will normally be at
 * the front.  All stripes start life this way.
 *
 * The inactive_list, handle_list and hash bucket lists are all protected by the
 * device_lock.
 *  - stripes have a reference counter. If count==0, they are on a list.
 *  - If a stripe might need handling, STRIPE_HANDLE is set.
 *  - When refcount reaches zero, then if STRIPE_HANDLE it is put on
 *    handle_list else inactive_list
 *
 * This, combined with the fact that STRIPE_HANDLE is only ever
 * cleared while a stripe has a non-zero count, means that if the
 * refcount is 0 and STRIPE_HANDLE is set, then it is on the
 * handle_list and if refcount is 0 and STRIPE_HANDLE is not set, then
 * the stripe is on inactive_list.
 *
 * The possible transitions are:
 *  activate an unhashed/inactive stripe (get_active_stripe())
 *     lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
 *  activate a hashed, possibly active stripe (get_active_stripe())
 *     lockdev check-hash if(!cnt++)unlink-stripe unlockdev
 *  attach a request to an active stripe (add_stripe_bh())
 *     lockdev attach-buffer unlockdev
 *  handle a stripe (handle_stripe())
 *     setSTRIPE_ACTIVE, clrSTRIPE_HANDLE ...
 *		(lockdev check-buffers unlockdev) ..
 *		change-state ..
 *		record io/ops needed clearSTRIPE_ACTIVE schedule io/ops
 *  release an active stripe (release_stripe())
 *     lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
 *
 * The refcount counts each thread that has activated the stripe,
 * plus raid5d if it is handling it, plus one for each active request
 * on a cached buffer, and plus one if the stripe is undergoing stripe
 * operations.
 *
 * The stripe operations are:
 * -copying data between the stripe cache and user application buffers
 * -computing blocks to save a disk access, or to recover a missing block
 * -updating the parity on a write operation (reconstruct write and
 *  read-modify-write)
 * -checking parity correctness
 * -running i/o to disk
 * These operations are carried out by raid5_run_ops which uses the async_tx
 * api to (optionally) offload operations to dedicated hardware engines.
 * When requesting an operation handle_stripe sets the pending bit for the
 * operation and increments the count.  raid5_run_ops is then run whenever
 * the count is non-zero.
 * There are some critical dependencies between the operations that prevent some
 * from being requested while another is in flight.
 * 1/ Parity check operations destroy the in cache version of the parity block,
 *    so we prevent parity dependent operations like writes and compute_blocks
 *    from starting while a check is in progress.  Some dma engines can perform
 *    the check without damaging the parity block; in these cases the parity
 *    block is re-marked up to date (assuming the check was successful) and is
 *    not re-read from disk.
 * 2/ When a write operation is requested we immediately lock the affected
 *    blocks, and mark them as not up to date.  This causes new read requests
 *    to be held off, as well as parity checks and compute block operations.
 * 3/ Once a compute block operation has been requested handle_stripe treats
 *    that block as if it is up to date.  raid5_run_ops guarantees that any
 *    operation that is dependent on the compute block result is initiated after
 *    the compute block completes.
 */
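
/*
 * Illustrative sketch (documentation only, not part of the driver): how
 * the four buffer states described at the top of this file map onto the
 * R5_UPTODATE and R5_LOCKED bits in dev->flags.  The helper name is
 * hypothetical; the real code simply tests these bits inline in
 * handle_stripe().
 *
 *	static const char *r5dev_state_name(struct r5dev *dev)
 *	{
 *		bool uptodate = test_bit(R5_UPTODATE, &dev->flags);
 *		bool locked = test_bit(R5_LOCKED, &dev->flags);
 *
 *		if (uptodate)
 *			return locked ? "Dirty" : "Clean";
 *		return locked ? "Want" : "Empty";
 *	}
 */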

/*
 * Operations state - intermediate states that are visible outside of
 *   STRIPE_ACTIVE.
 * In general _idle indicates nothing is running, _run indicates a data
 * processing operation is active, and _result means the data processing result
 * is stable and can be acted upon.  Simple operations like biofill and
 * compute, which only have an _idle and a _run state, are indicated with
 * sh->state flags instead (STRIPE_BIOFILL_RUN and STRIPE_COMPUTE_RUN).
 */
/**
 * enum check_states - handles syncing / repairing a stripe
 * @check_state_idle - check operations are quiesced
 * @check_state_run - xor parity check is running
 * @check_state_run_q - q-parity check is running (raid6)
 * @check_state_run_pq - pq dual parity check is running (raid6)
 * @check_state_check_result - set outside lock when check result is valid
 * @check_state_compute_run - check failed and we are repairing
 * @check_state_compute_result - set outside lock when compute result is valid
 */
enum check_states {
	check_state_idle = 0,
	check_state_run, /* xor parity check */
	check_state_run_q, /* q-parity check */
	check_state_run_pq, /* pq dual parity check */
	check_state_check_result,
	check_state_compute_run, /* parity repair */
	check_state_compute_result,
};
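
/*
 * Simplified sketch (hedged) of the check/repair flow handle_stripe
 * drives through these states, following the inline comments above.
 * The exact transition points in raid5.c differ in detail:
 *
 *	switch (sh->check_state) {
 *	case check_state_idle:
 *		// may request a parity check: set STRIPE_OP_CHECK in
 *		// s->ops_request and move to check_state_run (xor) or
 *		// check_state_run_pq (raid6)
 *		break;
 *	case check_state_check_result:
 *		if (sh->ops.zero_sum_result == 0)
 *			sh->check_state = check_state_idle;   // parity ok
 *		else
 *			sh->check_state = check_state_compute_run; // repair
 *		break;
 *	case check_state_compute_result:
 *		sh->check_state = check_state_idle; // repaired block valid
 *		break;
 *	default:
 *		break;
 *	}
 */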

/**
 * enum reconstruct_states - handles writing or expanding a stripe
 */
enum reconstruct_states {
	reconstruct_state_idle = 0,
	reconstruct_state_prexor_drain_run,	/* prexor-write */
	reconstruct_state_drain_run,		/* write */
	reconstruct_state_run,			/* expand */
	reconstruct_state_prexor_drain_result,
	reconstruct_state_drain_result,
	reconstruct_state_result,
};

#define DEFAULT_STRIPE_SIZE	4096
struct stripe_head {
	struct hlist_node	hash;
	struct list_head	lru;	      /* inactive_list or handle_list */
	struct llist_node	release_list;
	struct r5conf		*raid_conf;
	short			generation;	/* increments with every
						 * reshape */
	sector_t		sector;		/* sector of this row */
	short			pd_idx;		/* parity disk index */
	short			qd_idx;		/* 'Q' disk index for raid6 */
	short			ddf_layout;	/* use DDF ordering to calculate Q */
	short			hash_lock_index;
	unsigned long		state;		/* state flags */
	atomic_t		count;	      /* nr of active thread/requests */
	int			bm_seq;	/* sequence number for bitmap flushes */
	int			disks;		/* disks in stripe */
	int			overwrite_disks; /* total overwrite disks in stripe,
						  * this is only checked when stripe
						  * has STRIPE_BATCH_READY
						  */
	enum check_states	check_state;
	enum reconstruct_states reconstruct_state;
	spinlock_t		stripe_lock;
	int			cpu;
	struct r5worker_group	*group;

	struct stripe_head	*batch_head; /* protected by stripe lock */
	spinlock_t		batch_lock; /* only header's lock is useful */
	struct list_head	batch_list; /* protected by head's batch lock */

	union {
		struct r5l_io_unit	*log_io;
		struct ppl_io_unit	*ppl_io;
	};

	struct list_head	log_list;
	sector_t		log_start; /* first meta block on the journal */
	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */

	struct page		*ppl_page; /* partial parity of this stripe */
	/**
	 * struct stripe_operations
	 * @target - STRIPE_OP_COMPUTE_BLK target
	 * @target2 - 2nd compute target in the raid6 case
	 * @zero_sum_result - P and Q verification flags
	 */
	struct stripe_operations {
		int		     target, target2;
		enum sum_check_flags zero_sum_result;
	} ops;

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
	/* These pages will be used by bios in dev[i] */
	struct page	**pages;
	int	nr_pages;	/* page array size */
	int	stripes_per_page;
#endif
	struct r5dev {
		/* rreq and rvec are used for the replacement device when
		 * writing data to both devices.
		 */
		struct bio	req, rreq;
		struct bio_vec	vec, rvec;
		struct page	*page, *orig_page;
		unsigned int    offset;     /* offset of the page */
		struct bio	*toread, *read, *towrite, *written;
		sector_t	sector;			/* sector of this page */
		unsigned long	flags;
		u32		log_checksum;
		unsigned short	write_hint;
	} dev[1]; /* allocated with extra space depending on RAID geometry */
};
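
/*
 * Note on dev[1]: stripe_heads are allocated with trailing space for one
 * struct r5dev per member device, so the object size for an array with
 * "devs" disks is (sketch, matching the sizing the dev[] comment above
 * implies):
 *
 *	sizeof(struct stripe_head) + (devs - 1) * sizeof(struct r5dev)
 *
 * Per-device state is then reached as sh->dev[i] for 0 <= i < sh->disks.
 */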

/* stripe_head_state - collects and tracks the dynamic state of a stripe_head
 *     for handle_stripe.
 */
struct stripe_head_state {
	/* 'syncing' means that we need to read all devices, either
	 * to check/correct parity, or to reconstruct a missing device.
	 * 'replacing' means we are replacing one or more drives and
	 * the source is valid at this point so we don't need to
	 * read all devices, just the replacement targets.
	 */
	int syncing, expanding, expanded, replacing;
	int locked, uptodate, to_read, to_write, failed, written;
	int to_fill, compute, req_compute, non_overwrite;
	int injournal, just_cached;
	int failed_num[2];
	int p_failed, q_failed;
	int dec_preread_active;
	unsigned long ops_request;

	struct md_rdev *blocked_rdev;
	int handle_bad_blocks;
	int log_failed;
	int waiting_extra_page;
};

/* Flags for struct r5dev.flags */
enum r5dev_flags {
	R5_UPTODATE,	/* page contains current data */
	R5_LOCKED,	/* IO has been submitted on "req" */
	R5_DOUBLE_LOCKED,/* Cannot clear R5_LOCKED until 2 writes complete */
	R5_OVERWRITE,	/* towrite covers whole page */
/* and some that are internal to handle_stripe */
	R5_Insync,	/* rdev && rdev->in_sync at start */
	R5_Wantread,	/* want to schedule a read */
	R5_Wantwrite,
	R5_Overlap,	/* There is a pending overlapping request
			 * on this block */
	R5_ReadNoMerge, /* prevent bio from merging in block-layer */
	R5_ReadError,	/* seen a read error here recently */
	R5_ReWrite,	/* have tried to over-write the read error */

	R5_Expanded,	/* This block now has post-expand data */
	R5_Wantcompute,	/* compute_block in progress, treat as
			 * uptodate
			 */
	R5_Wantfill,	/* dev->toread contains a bio that needs
			 * filling
			 */
	R5_Wantdrain,	/* dev->towrite needs to be drained */
	R5_WantFUA,	/* Write should be FUA */
	R5_SyncIO,	/* The IO is sync */
	R5_WriteError,	/* got a write error - need to record it */
	R5_MadeGood,	/* A bad block has been fixed by writing to it */
	R5_ReadRepl,	/* Will/did read from replacement rather than orig */
	R5_MadeGoodRepl,/* A bad block on the replacement device has been
			 * fixed by writing to it */
	R5_NeedReplace,	/* This device has a replacement which is not
			 * up-to-date at this stripe. */
	R5_WantReplace, /* We need to update the replacement, we have read
			 * data in, and now is a good time to write it out.
			 */
	R5_Discard,	/* Discard the stripe */
	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
	R5_InJournal,	/* data being written is in the journal device.
			 * if R5_InJournal is set for parity pd_idx, all the
			 * data and parity being written are in the journal
			 * device
			 */
	R5_OrigPageUPTDODATE,	/* with write back cache, we read old data into
				 * dev->orig_page for prexor. When this flag is
				 * set, orig_page contains latest data in the
				 * raid disk.
				 */
};

/*
 * Stripe state
 */
enum {
	STRIPE_ACTIVE,
	STRIPE_HANDLE,
	STRIPE_SYNC_REQUESTED,
	STRIPE_SYNCING,
	STRIPE_INSYNC,
	STRIPE_REPLACED,
	STRIPE_PREREAD_ACTIVE,
	STRIPE_DELAYED,
	STRIPE_DEGRADED,
	STRIPE_BIT_DELAY,
	STRIPE_EXPANDING,
	STRIPE_EXPAND_SOURCE,
	STRIPE_EXPAND_READY,
	STRIPE_IO_STARTED,	/* do not count towards 'bypass_count' */
	STRIPE_FULL_WRITE,	/* all blocks are set to be overwritten */
	STRIPE_BIOFILL_RUN,
	STRIPE_COMPUTE_RUN,
	STRIPE_ON_UNPLUG_LIST,
	STRIPE_DISCARD,
	STRIPE_ON_RELEASE_LIST,
	STRIPE_BATCH_READY,
	STRIPE_BATCH_ERR,
	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
				 * to batch yet.
				 */
	STRIPE_LOG_TRAPPED,	/* trapped into log (see raid5-cache.c)
				 * this bit is used in two scenarios:
				 *
				 * 1. write-out phase
				 *  set in first entry of r5l_write_stripe
				 *  clear in second entry of r5l_write_stripe
				 *  used to bypass logic in handle_stripe
				 *
				 * 2. caching phase
				 *  set in r5c_try_caching_write()
				 *  clear when journal write is done
				 *  used to initiate r5c_cache_data()
				 *  also used to bypass logic in handle_stripe
				 */
	STRIPE_R5C_CACHING,	/* the stripe is in caching phase;
				 * see raid5-cache.c for more detail
				 */
	STRIPE_R5C_PARTIAL_STRIPE,	/* in r5c cache (to-be/being handled or
					 * in conf->r5c_partial_stripe_list)
					 */
	STRIPE_R5C_FULL_STRIPE,	/* in r5c cache (to-be/being handled or
				 * in conf->r5c_full_stripe_list)
				 */
	STRIPE_R5C_PREFLUSH,	/* need to flush journal device */
};

#define STRIPE_EXPAND_SYNC_FLAGS \
	((1 << STRIPE_EXPAND_SOURCE) |\
	(1 << STRIPE_EXPAND_READY) |\
	(1 << STRIPE_EXPANDING) |\
	(1 << STRIPE_SYNC_REQUESTED))
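
/*
 * Usage sketch (illustrative, not a specific call site): the STRIPE_*
 * values are bit numbers within sh->state, so the combined mask above is
 * tested directly rather than with test_bit():
 *
 *	if (sh->state & STRIPE_EXPAND_SYNC_FLAGS)
 *		; // stripe is involved in a reshape or a requested sync
 */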
/*
 * Operation request flags
 */
enum {
	STRIPE_OP_BIOFILL,
	STRIPE_OP_COMPUTE_BLK,
	STRIPE_OP_PREXOR,
	STRIPE_OP_BIODRAIN,
	STRIPE_OP_RECONSTRUCT,
	STRIPE_OP_CHECK,
	STRIPE_OP_PARTIAL_PARITY,
};

/*
 * RAID parity calculation preferences
 */
enum {
	PARITY_DISABLE_RMW = 0,
	PARITY_ENABLE_RMW,
	PARITY_PREFER_RMW,
};

/*
 * Pages requested from set_syndrome_sources()
 */
enum {
	SYNDROME_SRC_ALL,
	SYNDROME_SRC_WANT_DRAIN,
	SYNDROME_SRC_WRITTEN,
};
/*
 * Plugging:
 *
 * To improve write throughput, we need to delay the handling of some
 * stripes until there has been a chance that several write requests
 * for the one stripe have all been collected.
 * In particular, any write request that would require pre-reading
 * is put on a "delayed" queue until there are no stripes currently
 * in a pre-read phase.  Further, if the "delayed" queue is empty when
 * a stripe is put on it then we "plug" the queue and do not process it
 * until an unplug call is made (i.e. until unplug_io_fn() is called).
 *
 * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
 * it to the count of prereading stripes.
 * When write is initiated, or the stripe refcnt == 0 (just in case), we
 * clear the PREREAD_ACTIVE flag and decrement the count.
 * Whenever the 'handle' queue is empty and the device is not plugged, we
 * move any stripes from delayed to handle, clear the DELAYED flag and set
 * PREREAD_ACTIVE; a sketch of this step follows this comment.
 * In stripe_handle, if we find pre-reading is necessary, we do it if
 * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
 * HANDLE gets cleared if stripe_handle leaves nothing locked.
 */
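
/*
 * Simplified sketch of the "move delayed to handle" step above (the
 * driver does this in raid5.c, e.g. raid5_activate_delayed(); details
 * such as the IO_THRESHOLD check are omitted here):
 *
 *	while (!list_empty(&conf->delayed_list)) {
 *		struct stripe_head *sh =
 *			list_first_entry(&conf->delayed_list,
 *					 struct stripe_head, lru);
 *		clear_bit(STRIPE_DELAYED, &sh->state);
 *		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 *			atomic_inc(&conf->preread_active_stripes);
 *		list_move_tail(&sh->lru, &conf->hold_list);
 *	}
 */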

/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk.
 * There are three safe ways to access disk_info.rdev.
 * 1/ when holding mddev->reconfig_mutex
 * 2/ when resync/recovery/reshape is known to be happening - i.e. in code that
 *    is called as part of performing resync/recovery/reshape.
 * 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
 *    and if it is non-NULL, increment rdev->nr_pending before dropping the RCU
 *    lock.
 * When .rdev is set to NULL, the nr_pending count is checked again and if
 * it has been incremented, the pointer is put back in .rdev.
 */
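
/*
 * Sketch of access pattern 3/ above (simplified; the driver open-codes
 * this in several places):
 *
 *	struct md_rdev *rdev;
 *
 *	rcu_read_lock();
 *	rdev = rcu_dereference(conf->disks[i].rdev);
 *	if (rdev)
 *		atomic_inc(&rdev->nr_pending);
 *	rcu_read_unlock();
 *	// ... issue I/O against rdev ...
 *	if (rdev)
 *		rdev_dec_pending(rdev, conf->mddev);
 */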

struct disk_info {
	struct md_rdev	*rdev, *replacement;
	struct page	*extra_page; /* extra page to use in prexor */
};

/*
 * Stripe cache
 */

#define NR_STRIPES		256

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#endif

#define	IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)
#define MAX_STRIPE_BATCH	8

/* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
 * This is because we sometimes take all the spinlocks
 * and creating that much locking depth can cause
 * problems.
 */
#define NR_STRIPE_HASH_LOCKS 8
#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
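
/*
 * Sketch of how a stripe's hash lock is chosen (mirrors
 * stripe_hash_locks_hash() in raid5.c): the low bits of the stripe
 * number select one of the NR_STRIPE_HASH_LOCKS spinlocks, which is
 * also recorded in sh->hash_lock_index:
 *
 *	hash = (sector >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK;
 *	spin_lock_irq(conf->hash_locks + hash);
 */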

struct r5worker {
	struct work_struct work;
	struct r5worker_group *group;
	struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS];
	bool working;
};

struct r5worker_group {
	struct list_head handle_list;
	struct list_head loprio_list;
	struct r5conf *conf;
	struct r5worker *workers;
	int stripes_cnt;
};

/*
 * r5c journal modes of the array: write-back or write-through.
 * write-through mode has behavior identical to the existing log-only
 * implementation.
 */
enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

enum r5_cache_state {
	R5_INACTIVE_BLOCKED,	/* release of inactive stripes blocked,
				 * waiting for 25% to be free
				 */
	R5_ALLOC_MORE,		/* It might help to allocate another
				 * stripe.
				 */
	R5_DID_ALLOC,		/* A stripe was allocated, don't allocate
				 * more until at least one has been
				 * released.  This avoids flooding
				 * the cache.
				 */
	R5C_LOG_TIGHT,		/* log device space tight, need to
				 * prioritize stripes at last_checkpoint
				 */
	R5C_LOG_CRITICAL,	/* log device is running out of space,
				 * only process stripes that are already
				 * occupying the log
				 */
	R5C_EXTRA_PAGE_IN_USE,	/* a stripe is using disk_info.extra_page
				 * for prexor
				 */
};

#define PENDING_IO_MAX 512
#define PENDING_IO_ONE_FLUSH 128
struct r5pending_data {
	struct list_head sibling;
	sector_t sector; /* stripe sector */
	struct bio_list bios;
};

struct r5conf {
	struct hlist_head	*stripe_hashtbl;
	/* only protect corresponding hash list and inactive_list */
	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
	struct mddev		*mddev;
	int			chunk_sectors;
	int			level, algorithm, rmw_level;
	int			max_degraded;
	int			raid_disks;
	int			max_nr_stripes;
	int			min_nr_stripes;
#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
	unsigned long	stripe_size;
	unsigned int	stripe_shift;
	unsigned long	stripe_sectors;
#endif

	/* reshape_progress is the leading edge of a 'reshape'.
	 * It has value MaxSector when no reshape is happening.
	 * If delta_disks < 0, it is the last sector we started work on,
	 * else it is the next sector to work on.
	 */
	sector_t		reshape_progress;
	/* reshape_safe is the trailing edge of a reshape.  We know that
	 * before (or after) this address, all reshape has completed.
	 */
	sector_t		reshape_safe;
	int			previous_raid_disks;
	int			prev_chunk_sectors;
	int			prev_algo;
	short			generation; /* increments with every reshape */
	seqcount_spinlock_t	gen_lock;	/* lock against generation changes */
	unsigned long		reshape_checkpoint; /* Time we last updated
						     * metadata */
	long long		min_offset_diff; /* minimum difference between
						  * data_offset and
						  * new_data_offset across all
						  * devices.  May be negative,
						  * but is closest to zero.
						  */

	struct list_head	handle_list; /* stripes needing handling */
	struct list_head	loprio_list; /* low priority stripes */
	struct list_head	hold_list; /* preread ready stripes */
	struct list_head	delayed_list; /* stripes that have plugged requests */
	struct list_head	bitmap_list; /* stripes delayed awaiting bitmap update */
	struct bio		*retry_read_aligned; /* currently retrying aligned bios */
	unsigned int		retry_read_offset; /* sector offset into retry_read_aligned */
	struct bio		*retry_read_aligned_list; /* aligned bios retry list */
	atomic_t		preread_active_stripes; /* stripes with scheduled io */
	atomic_t		active_aligned_reads;
	atomic_t		pending_full_writes; /* full write backlog */
	int			bypass_count; /* bypassed prereads */
	int			bypass_threshold; /* preread nice */
	int			skip_copy; /* Don't copy data from bio to stripe cache */
	struct list_head	*last_hold; /* detect hold_list promotions */

	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
	/* unfortunately we need two cache names as we temporarily have
	 * two caches.
	 */
	int			active_name;
	char			cache_name[2][32];
	struct kmem_cache	*slab_cache; /* for allocating stripes */
	struct mutex		cache_size_mutex; /* Protect changes to cache size */

	int			seq_flush, seq_write;
	int			quiesce;

	int			fullsync;  /* set to 1 if a full sync is needed
					    * (fresh device added).
					    * Cleared when a sync completes.
					    */
	int			recovery_disabled;
	/* per cpu variables */
	struct raid5_percpu {
		struct page	*spare_page; /* Used when checking P/Q in raid6 */
		void		*scribble;  /* space for constructing buffer
					     * lists and performing address
					     * conversions
					     */
		int scribble_obj_size;
	} __percpu *percpu;
	int scribble_disks;
	int scribble_sectors;
	struct hlist_node node;

	/*
	 * Free stripes pool
	 */
	atomic_t		active_stripes;
	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];

	atomic_t		r5c_cached_full_stripes;
	struct list_head	r5c_full_stripe_list;
	atomic_t		r5c_cached_partial_stripes;
	struct list_head	r5c_partial_stripe_list;
	atomic_t		r5c_flushing_full_stripes;
	atomic_t		r5c_flushing_partial_stripes;

	atomic_t		empty_inactive_list_nr;
	struct llist_head	released_stripes;
	wait_queue_head_t	wait_for_quiescent;
	wait_queue_head_t	wait_for_stripe;
	wait_queue_head_t	wait_for_overlap;
	unsigned long		cache_state;
	struct shrinker		shrinker;
	int			pool_size; /* number of disks in stripeheads in pool */
	spinlock_t		device_lock;
	struct disk_info	*disks;
	struct bio_set		bio_split;

	/* When taking over an array from a different personality, we store
	 * the new thread here until we fully activate the array.
	 */
	struct md_thread	*thread;
	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
	struct r5worker_group	*worker_groups;
	int			group_cnt;
	int			worker_cnt_per_group;
	struct r5l_log		*log;
	void			*log_private;

	spinlock_t		pending_bios_lock;
	bool			batch_bio_dispatch;
	struct r5pending_data	*pending_data;
	struct list_head	free_list;
	struct list_head	pending_list;
	int			pending_data_cnt;
	struct r5pending_data	*next_pending_data;
};

#if PAGE_SIZE == DEFAULT_STRIPE_SIZE
#define RAID5_STRIPE_SIZE(conf)	STRIPE_SIZE
#define RAID5_STRIPE_SHIFT(conf)	STRIPE_SHIFT
#define RAID5_STRIPE_SECTORS(conf)	STRIPE_SECTORS
#else
#define RAID5_STRIPE_SIZE(conf)	((conf)->stripe_size)
#define RAID5_STRIPE_SHIFT(conf)	((conf)->stripe_shift)
#define RAID5_STRIPE_SECTORS(conf)	((conf)->stripe_sectors)
#endif
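
/*
 * Usage sketch (illustrative): these macros let code work in stripe-cache
 * units regardless of whether the stripe size is fixed at PAGE_SIZE or
 * configurable.  E.g. rounding a device sector down to the start of its
 * stripe-cache block, assuming the stripe size is a power of two:
 *
 *	sector_t first = sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf) - 1);
 */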

/* bios attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bios per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This function is used to determine the 'next' bio in the list, given the
 * sector of the current stripe+device.
 */
static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector)
{
	if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf))
		return bio->bi_next;
	else
		return NULL;
}
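
/*
 * Usage sketch (hedged; mirrors the walk loops in raid5.c): iterate all
 * bios queued for write on one stripe+device, stopping before any bio
 * that extends past this device as required above:
 *
 *	struct bio *wbi = dev->towrite;
 *
 *	while (wbi && wbi->bi_iter.bi_sector <
 *	       dev->sector + RAID5_STRIPE_SECTORS(conf)) {
 *		// ... process wbi ...
 *		wbi = r5_next_bio(conf, wbi, dev->sector);
 *	}
 */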

/*
 * Our supported algorithms
 */
#define ALGORITHM_LEFT_ASYMMETRIC	0 /* Rotating Parity N with Data Restart */
#define ALGORITHM_RIGHT_ASYMMETRIC	1 /* Rotating Parity 0 with Data Restart */
#define ALGORITHM_LEFT_SYMMETRIC	2 /* Rotating Parity N with Data Continuation */
#define ALGORITHM_RIGHT_SYMMETRIC	3 /* Rotating Parity 0 with Data Continuation */

/* Define non-rotating (raid4) algorithms.  These allow
 * conversion of raid4 to raid5.
 */
#define ALGORITHM_PARITY_0		4 /* P or P,Q are initial devices */
#define ALGORITHM_PARITY_N		5 /* P or P,Q are final devices. */

/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
 * Firstly, the exact positioning of the parity block is slightly
 * different between the 'LEFT_*' modes of md and the "_N_*" modes
 * of DDF.
 * Secondly, the order of data blocks over which the Q syndrome is computed
 * is different.
 * Consequently we have different layouts for DDF/raid6 than md/raid6.
 * These layouts are from the DDFv1.2 spec.
 * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
 * leaves RLQ=3 as 'Vendor Specific'.
 */

#define ALGORITHM_ROTATING_ZERO_RESTART	8 /* DDF PRL=6 RLQ=1 */
#define ALGORITHM_ROTATING_N_RESTART	9 /* DDF PRL=6 RLQ=2 */
#define ALGORITHM_ROTATING_N_CONTINUE	10 /* DDF PRL=6 RLQ=3 */

/* For every RAID5 algorithm we define a RAID6 algorithm
 * with exactly the same layout for data and parity, and
 * with the Q block always on the last device (N-1).
 * This allows trivial conversion from RAID5 to RAID6.
 */
#define ALGORITHM_LEFT_ASYMMETRIC_6	16
#define ALGORITHM_RIGHT_ASYMMETRIC_6	17
#define ALGORITHM_LEFT_SYMMETRIC_6	18
#define ALGORITHM_RIGHT_SYMMETRIC_6	19
#define ALGORITHM_PARITY_0_6		20
#define ALGORITHM_PARITY_N_6		ALGORITHM_PARITY_N

static inline int algorithm_valid_raid5(int layout)
{
	return (layout >= 0) &&
		(layout <= 5);
}
static inline int algorithm_valid_raid6(int layout)
{
	return (layout >= 0 && layout <= 5)
		||
		(layout >= 8 && layout <= 10)
		||
		(layout >= 16 && layout <= 20);
}
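
/*
 * Usage sketch (hedged): validating a requested layout during setup,
 * in the style of setup_conf() in raid5.c:
 *
 *	if (mddev->new_level == 6 &&
 *	    !algorithm_valid_raid6(mddev->new_layout))
 *		return ERR_PTR(-EIO);
 */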

static inline int algorithm_is_DDF(int layout)
{
	return layout >= 8 && layout <= 10;
}

#if PAGE_SIZE != DEFAULT_STRIPE_SIZE
/*
 * Return offset of the corresponding page for r5dev.
 */
static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx)
{
	return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf);
}

/*
 * Return corresponding page address for r5dev.
 */
static inline struct page *
raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
{
	return sh->pages[disk_idx / sh->stripes_per_page];
}
#endif
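
/*
 * Usage sketch (illustrative): when PAGE_SIZE != DEFAULT_STRIPE_SIZE,
 * several stripe-sized buffers share each page, so a device's buffer is
 * addressed with both helpers above:
 *
 *	struct page *p = raid5_get_dev_page(sh, i);
 *	unsigned int off = raid5_get_page_offset(sh, i);
 *
 *	memcpy(page_address(p) + off, src, RAID5_STRIPE_SIZE(sh->raid_conf));
 */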

extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
extern void raid5_release_stripe(struct stripe_head *sh);
extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
				     int previous, int *dd_idx,
				     struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
#endif